diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3dad41a88c8212b7445c32f241d887306d3c19ad..8669c25c452b53da48239bc20c9a2d3528e75422 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,5 +1,16 @@
 # Contributing guidelines
 
+## Pull Request Checklist
+
+Before sending your pull requests, make sure you followed this list.
+
+- Read [contributing guidelines](CONTRIBUTING.md).
+- Read [Code of Conduct](CODE_OF_CONDUCT.md).
+- Ensure you have signed the [Contributor License Agreement (CLA)](https://cla.developers.google.com/).
+- Check if my changes are consistent with the [guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#general-guidelines-and-philosophy-for-contribution).
+- Changes are consistent with the [Coding Style](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#c-coding-style).
+- Run [Unit Tests](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#running-unit-tests).
+
 ## How to become a contributor and submit your own code
 
 ### Contributor License Agreements
diff --git a/README.md b/README.md
index e1a50c87e26d493ba3ac760f357905d89aa40dab..6fb4486d0de9ff476b5cf1dbd63d66879637df84 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,9 @@
 -----------------
 
 
-| **`Documentation`** | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
-|-----------------|---------------------|------------------|-------------------|---------------|---------------|
-| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
+| **`Documentation`** |
+|-----------------|
+| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) |
 
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
@@ -40,15 +40,6 @@ environment to install the nightly TensorFlow build. We support CPU and GPU
 packages on Linux, Mac, and Windows.
 
 
-**Individual whl files**
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/))
-* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
-([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
-
 #### *Try your first TensorFlow program*
 ```shell
 $ python
@@ -82,6 +73,30 @@ The TensorFlow project strives to abide by generally accepted best practices in
 
 [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486)
 
+
+## Continuous build status
+
+### Official Builds
+
+| Build Type      | Status | Artifacts |
+| ---             | ---    | ---       |
+| **Linux CPU**   | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) |
+| **Linux GPU**   | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | [pypi](https://pypi.org/project/tf-nightly-gpu/) |
+| **Linux XLA**   | TBA | TBA |
+| **MacOS**       | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) |
+| **Windows CPU** | [![Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [pypi](https://pypi.org/project/tf-nightly/) |
+| **Windows GPU** | [![Status](http://ci.tensorflow.org/job/tf-master-win-gpu-cmake/badge/icon)](http://ci.tensorflow.org/job/tf-master-win-gpu-cmake/) | [pypi](https://pypi.org/project/tf-nightly-gpu/) |
+| **Android**     | [![Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) [build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/) |
+
+
+### Community Supported Builds
+
+| Build Type      | Status | Artifacts |
+| ---             | ---    | ---       |
+| **IBM s390x**       | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | TBA |
+| **IBM ppc64le CPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/) | TBA |
+
+
 ## For more information
 
 * [TensorFlow Website](https://www.tensorflow.org)
diff --git a/SECURITY.md b/SECURITY.md
index a5ce3a62ee202f6e7d83f0fedc2777d9c88ba9b5..01886b613e5d93793953124331b57f075fe7a373 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -173,7 +173,7 @@ the progress being made towards a fix and announcement.
 In addition, please include the following information along with your report:
 
 * Your name and affiliation (if any).
-* A description the technical details of the vulnerabilities. It is very
+* A description of the technical details of the vulnerabilities. It is very
   important to let us know how we can reproduce your findings.
 * An explanation who can exploit this vulnerability, and what they gain when
   doing so -- write an attack scenario. This will help us evaluate your report
diff --git a/configure.py b/configure.py
index fe15bfc1a43bac5d9c249bf5b61854ff0e07aec7..b6c32543cf707983d48e390cc89abf13dafd55d3 100644
--- a/configure.py
+++ b/configure.py
@@ -498,10 +498,6 @@ def set_cc_opt_flags(environ_cp):
   if not is_ppc64le() and not is_windows():
     write_to_bazelrc('build:opt --host_copt=-march=native')
   write_to_bazelrc('build:opt --define with_default_optimizations=true')
-  # TODO(mikecase): Remove these default defines once we are able to get
-  # TF Lite targets building without them.
-  write_to_bazelrc('build --copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK')
-  write_to_bazelrc('build --host_copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK')
 
 def set_tf_cuda_clang(environ_cp):
   """set TF_CUDA_CLANG action_env.
@@ -845,8 +841,8 @@ def reformat_version_sequence(version_str, sequence_count):
 def set_tf_cuda_version(environ_cp):
   """Set CUDA_TOOLKIT_PATH and TF_CUDA_VERSION."""
   ask_cuda_version = (
-      'Please specify the CUDA SDK version you want to use, '
-      'e.g. 7.0. [Leave empty to default to CUDA %s]: ') % _DEFAULT_CUDA_VERSION
+      'Please specify the CUDA SDK version you want to use. '
+      '[Leave empty to default to CUDA %s]: ') % _DEFAULT_CUDA_VERSION
 
   for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
     # Configure the Cuda SDK version to use.
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 82dbd3cdbc6e8fb0c6fbcddb33b6a95c87a83225..95b04f9058afdfaadbc24f0238860279fcd3e800 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -8407,3 +8407,51 @@ TF_Tensor* TF_DequeueNamedTensor(TF_Session* session, int tensor_id,
   }
   return ret;
 }
+
+void TF_EnqueueNamedTensor(TF_Session* session, int tensor_id,
+                           TF_Tensor* tensor, TF_Status* status) {
+  assert(session);
+  {
+    tensorflow::mutex_lock c(session->graph->mu);
+    if (VLOG_IS_ON(1)) {
+      VLOG(1) << "Enqueuing named tensor with id " << tensor_id
+              << ", with input graph: "
+              << session->graph->graph.ToGraphDefDebug().DebugString();
+      tensorflow::Tensor internal_tensor;
+      if (tensorflow::TF_TensorToTensor(tensor, &internal_tensor).ok()) {
+        VLOG(1) << "Enqueu'ing tensor content: "
+                << internal_tensor.DebugString();
+      }
+    }
+  }
+
+  TF_Operation* enqueue_op = TF_GraphOperationByName(
+      session->graph,
+      tensorflow::strings::StrCat("fifo_queue_enqueue_", tensor_id).c_str());
+  if (enqueue_op == nullptr) {
+    status->status = tensorflow::errors::Internal(
+        "Unable to find the enqueue node in the TF graph.");
+    return;
+  }
+
+  TF_Operation* placeholder_op = TF_GraphOperationByName(
+      session->graph,
+      tensorflow::strings::StrCat("arg_tensor_enqueue_", tensor_id).c_str());
+  if (placeholder_op == nullptr) {
+    status->status = tensorflow::errors::Internal(
+        "Unable to find the placeholder node as input to enqueue in the TF "
+        "graph.");
+    return;
+  }
+
+  VLOG(1) << "Running the enqueue op";
+  TF_Output input{placeholder_op, 0};
+  TF_SessionRun(session, /*run_options*/ nullptr,
+                // input related parameters
+                /*inputs*/ &input, /*input_values*/ &tensor, /*ninputs*/ 1,
+                // output related parameters
+                /*outputs*/ nullptr, /*output_values*/ nullptr, /*noutputs*/ 0,
+                /*targets*/ &enqueue_op, /*ntargets*/ 1,
+                /*run_metadata*/ nullptr, status);
+  VLOG(1) << "Enqueuing is done.";
+}
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index e6757c065fc540fa789cdbb694e66ca0b00c4832..20bdace40f1272ded06e710034053a7610326e7f 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -87,8 +87,11 @@ TF_CAPI_EXPORT extern TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
     unsigned char is_mnist, TF_Status* status);
 
 // On success, dequeues a tensor from a TF-managed FifoQueue given by
-// `tensor_id`, associated with `session`. Caller must call TF_DeleteTensor()
-// over the returned tensor. If the queue is empty, this call is blocked.
+// `tensor_id`, associated with `session`. There must be a graph node named
+// "fifo_queue_dequeue_<tensor_id>", to be executed by this API call.
+
+// Caller must call TF_DeleteTensor() over the returned tensor. If the queue is
+// empty, this call is blocked.
 //
 // Tensors are enqueued via the corresponding TF enqueue op.
 // TODO(hongm): Add support for `timeout_ms`.
@@ -96,6 +99,22 @@ TF_CAPI_EXPORT extern TF_Tensor* TF_DequeueNamedTensor(TF_Session* session,
                                                        int tensor_id,
                                                        TF_Status* status);
 
+// On success, enqueues `tensor` into a TF-managed FifoQueue given by
+// `tensor_id`, associated with `session`. There must be a graph node named
+// "fifo_queue_enqueue_<tensor_id>", to be executed by this API call. It reads
+// from a placeholder node "arg_tensor_enqueue_<tensor_id>".
+//
+// `tensor` is still owned by the caller. This call will be blocked if the queue
+// has reached its capacity, and will be unblocked when the queued tensors again
+// drop below the capacity due to dequeuing.
+//
+// Tensors are dequeued via the corresponding TF dequeue op.
+// TODO(hongm): Add support for `timeout_ms`.
+TF_CAPI_EXPORT extern void TF_EnqueueNamedTensor(TF_Session* session,
+                                                 int tensor_id,
+                                                 TF_Tensor* tensor,
+                                                 TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index cd19cf8d624d9b914b61132f93d918b046cdbd30..c16aba666ee6974fed5351c2d9ac291dcbcdecab 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 14321191625e448637aa44a7f6a17820159b97c2..9ce781fab0be709fb0f115a2206eea4c2826bf36 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -24,10 +24,10 @@ tf_cuda_library(
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
-            ":runtime",
             "//tensorflow/c:c_api",
             "//tensorflow/c:c_api_internal",
             "//tensorflow/core:core_cpu",
+            "//tensorflow/core/common_runtime/eager:attr_builder",
             "//tensorflow/core/common_runtime/eager:context",
             "//tensorflow/core/common_runtime/eager:eager_executor",
             "//tensorflow/core/common_runtime/eager:execute",
@@ -49,6 +49,17 @@ tf_cuda_library(
         "//conditions:default": [],
     }) + [
         "//tensorflow/core/common_runtime/eager:eager_operation",
+        "//tensorflow/core/distributed_runtime/eager:eager_client",
+        "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_channel",
+        "//tensorflow/core/distributed_runtime/rpc/eager:eager_grpc_server_lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
+        "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime:remote_device",
+        "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core:gpu_runtime",
     ],
 )
@@ -59,7 +70,6 @@ tf_cuda_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":c_api",
-        ":runtime",
         "//tensorflow/c:c_api",
         "//tensorflow/c:c_api_internal",
         "//tensorflow/core:core_cpu",
@@ -69,11 +79,23 @@ tf_cuda_library(
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/common_runtime/eager:attr_builder",
         "//tensorflow/core/common_runtime/eager:context",
         "//tensorflow/core/common_runtime/eager:eager_executor",
         "//tensorflow/core/common_runtime/eager:eager_operation",
         "//tensorflow/core/common_runtime/eager:kernel_and_device",
         "//tensorflow/core/common_runtime/eager:tensor_handle",
+        "//tensorflow/core/distributed_runtime:remote_device",
+        "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime/eager:eager_client",
+        "//tensorflow/core/distributed_runtime/eager:remote_tensor_handle",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_channel",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
+        "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime/rpc/eager:eager_grpc_server_lib",
+        "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client",
     ],
 )
 
@@ -92,47 +114,7 @@ tf_cuda_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-    ],
-)
-
-tf_cuda_library(
-    name = "runtime",
-    srcs = ["runtime.cc"],
-    hdrs = ["runtime.h"],
-    copts = tf_copts(),
-    visibility = ["//tensorflow:internal"],
-    deps = select({
-        "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib_lite",
-        ],
-        "//conditions:default": [
-            "//tensorflow/c:c_api",
-            "//tensorflow/core:core_cpu",
-            "//tensorflow/core/common_runtime/eager:kernel_and_device",
-            "//tensorflow/core:core_cpu_internal",
-            "//tensorflow/core:framework",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-            "//tensorflow/core:lib_internal",
-            "//tensorflow/core:protos_all_cc",
-        ],
-    }),
-)
-
-tf_cc_test(
-    name = "runtime_test",
-    srcs = ["runtime_test.cc"],
-    deps = [
-        ":runtime",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:client_session",
-        "//tensorflow/cc:ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
+        "//tensorflow/core/distributed_runtime/rpc/eager:eager_grpc_server_lib",
     ],
 )
 
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 3bf071f3abaac7dfd4113964fd49cd9322913bd5..216210c88c1593ebc68f604547ab06b543a7b2af 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
-#include "tensorflow/c/eager/runtime.h"
 #ifdef TENSORFLOW_EAGER_USE_XLA
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #endif  // TENSORFLOW_EAGER_USE_XLA
@@ -32,15 +31,22 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/common_runtime/eager/attr_builder.h"
 #include "tensorflow/core/common_runtime/eager/copy_to_device_node.h"
 #include "tensorflow/core/common_runtime/eager/execute.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
@@ -71,6 +77,121 @@ string DeviceName(const tensorflow::Device* d) {
 std::atomic_int_fast64_t func_id_generator(0);
 #endif  // TENSORFLOW_EAGER_USE_XLA
 
+tensorflow::Status GetAllRemoteDevices(
+    const std::vector<string>& remote_workers,
+    tensorflow::WorkerCacheInterface* worker_cache,
+    std::unique_ptr<tensorflow::DeviceMgr>* device_mgr) {
+  std::vector<tensorflow::Device*> remote_devices;
+  tensorflow::Status status;
+  // TODO(nareshmodi) do this in parallel instead of serially.
+  for (const string& remote_worker : remote_workers) {
+    tensorflow::Notification n;
+    tensorflow::NewRemoteDevices(
+        tensorflow::Env::Default(), worker_cache, remote_worker,
+        [&status, &n, &remote_devices](
+            const tensorflow::Status& s,
+            std::vector<tensorflow::Device*>* devices) {
+          status = s;
+          if (s.ok()) {
+            for (tensorflow::Device* d : *devices) {
+              remote_devices.push_back(d);
+            }
+          }
+          n.Notify();
+        });
+    n.WaitForNotification();
+  }
+  std::unique_ptr<tensorflow::DeviceMgr> remote_device_mgr(
+      new tensorflow::DeviceMgr(remote_devices));
+
+  TF_RETURN_IF_ERROR(status);
+
+  *device_mgr = std::move(remote_device_mgr);
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status CreateRemoteContexts(
+    const std::vector<string>& remote_workers,
+    tensorflow::eager::EagerClientCache* remote_eager_workers, bool async,
+    tensorflow::gtl::FlatMap<string, tensorflow::uint64>* remote_contexts) {
+  for (int i = 0; i < remote_workers.size(); i++) {
+    const string& remote_worker = remote_workers[i];
+
+    tensorflow::eager::CreateContextRequest request;
+    tensorflow::eager::CreateContextResponse response;
+    tensorflow::DeviceNameUtils::ParsedName parsed_name;
+    if (!tensorflow::DeviceNameUtils::ParseFullName(remote_worker,
+                                                    &parsed_name)) {
+      return tensorflow::errors::InvalidArgument(
+          "Unable to parse ", remote_worker, " as a device name");
+    }
+    request.mutable_server_def()->set_job_name(parsed_name.job);
+    request.mutable_server_def()->set_task_index(parsed_name.task);
+    request.set_async(async);
+    auto* eager_client = remote_eager_workers->GetClient(remote_worker);
+    if (eager_client == nullptr) {
+      return tensorflow::errors::Internal(
+          "Cannot find a client for the given target:", remote_worker);
+    }
+    tensorflow::Notification n;
+    tensorflow::Status status;
+    // TODO(nareshmodi) do this in parallel instead of serially.
+    eager_client->CreateContextAsync(
+        &request, &response, [&status, &n](const tensorflow::Status& s) {
+          status = s;
+          n.Notify();
+        });
+    n.WaitForNotification();
+    TF_RETURN_IF_ERROR(status);
+
+    remote_contexts->emplace(remote_worker, response.context_id());
+  }
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status NewRemoteAwareTFE_Context(const TFE_ContextOptions* opts,
+                                             TFE_Context** ctx) {
+  string worker_name = tensorflow::strings::StrCat(
+      "/job:", opts->server_def.job_name(),
+      "/replica:0/task:", opts->server_def.task_index());
+  std::unique_ptr<tensorflow::eager::EagerGrpcServer> server;
+  TF_RETURN_IF_ERROR(
+      tensorflow::eager::EagerGrpcServer::Create(opts->server_def, &server));
+
+  TF_RETURN_IF_ERROR(server->Start());
+
+  std::vector<string> remote_workers;
+  server->master_env()->worker_cache->ListWorkers(&remote_workers);
+  remote_workers.erase(
+      std::remove(remote_workers.begin(), remote_workers.end(), worker_name),
+      remote_workers.end());
+
+  std::unique_ptr<tensorflow::DeviceMgr> remote_device_mgr;
+  TF_RETURN_IF_ERROR(GetAllRemoteDevices(
+      remote_workers, server->master_env()->worker_cache, &remote_device_mgr));
+
+  std::shared_ptr<tensorflow::GrpcChannelCache> channel_cache =
+      server->channel_cache();
+  std::unique_ptr<tensorflow::eager::EagerClientCache> remote_eager_workers(
+      tensorflow::eager::NewGrpcEagerClientCache(channel_cache));
+
+  // Initialize remote eager workers.
+  tensorflow::gtl::FlatMap<string, tensorflow::uint64> remote_contexts;
+  TF_RETURN_IF_ERROR(CreateRemoteContexts(remote_workers,
+                                          remote_eager_workers.get(),
+                                          opts->async, &remote_contexts));
+
+  tensorflow::RemoteRendezvous* r =
+      server->worker_env()->rendezvous_mgr->Find(0);
+
+  auto* device_mgr = server->worker_env()->device_mgr;
+  *ctx = new TFE_Context(opts->session_options.options, opts->policy,
+                         opts->async, device_mgr, r, std::move(server),
+                         std::move(remote_eager_workers),
+                         std::move(remote_device_mgr), remote_contexts);
+
+  return tensorflow::Status::OK();
+}
 }  // namespace
 
 extern "C" {
@@ -91,6 +212,15 @@ void TFE_ContextOptionsSetDevicePlacementPolicy(
   options->policy = policy;
 }
 
+TF_CAPI_EXPORT extern void TFE_ContextOptionsSetServerDef(
+    TFE_ContextOptions* options, const void* proto, size_t proto_len,
+    TF_Status* status) {
+  if (!options->server_def.ParseFromArray(proto, proto_len)) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Invalid tensorflow.ServerDef protocol buffer");
+  }
+}
+
 TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx,
                                                         unsigned char async,
                                                         TF_Status* status) {
@@ -100,17 +230,23 @@ TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx,
 void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; }
 
 TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
+  if (!opts->server_def.job_name().empty()) {
+    TFE_Context* ctx = nullptr;
+    status->status = NewRemoteAwareTFE_Context(opts, &ctx);
+    return ctx;
+  }
+
   std::vector<tensorflow::Device*> devices;
   status->status = tensorflow::DeviceFactory::AddDevices(
       opts->session_options.options, "/job:localhost/replica:0/task:0",
       &devices);
-  if (!status->status.ok()) {
-    return nullptr;
-  }
+  if (!status->status.ok()) return nullptr;
   std::unique_ptr<tensorflow::DeviceMgr> device_mgr(
       new tensorflow::DeviceMgr(devices));
+
   tensorflow::Rendezvous* r =
       new tensorflow::IntraProcessRendezvous(device_mgr.get());
+
   return new TFE_Context(opts->session_options.options, opts->policy,
                          opts->async, std::move(device_mgr), r);
 }
@@ -119,7 +255,10 @@ void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) { delete ctx; }
 
 TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
   TF_DeviceList* list = new TF_DeviceList;
-  ctx->context.device_mgr()->ListDeviceAttributes(&list->response);
+  ctx->context.local_device_mgr()->ListDeviceAttributes(&list->response);
+  if (ctx->context.remote_device_mgr()) {
+    ctx->context.remote_device_mgr()->ListDeviceAttributes(&list->response);
+  }
   return list;
 }
 
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index c06ce84a8c578aa60dd626c24bd58098b78ae750..574a097e0d6f5d6e7acd77cae246678b6675129b 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -81,6 +81,16 @@ TF_CAPI_EXPORT extern void TFE_ContextOptionsSetAsync(TFE_ContextOptions*,
 TF_CAPI_EXPORT extern void TFE_ContextOptionsSetDevicePlacementPolicy(
     TFE_ContextOptions*, TFE_ContextDevicePlacementPolicy);
 
+// A tensorflow.ServerDef specifies remote workers (in addition to the current
+// workers name). Operations created on this context can then be executed on
+// any of these remote workers by setting an appropriate device.
+//
+// If the following is set, all servers identified by the
+// ServerDef must be up when the context is created.
+TF_CAPI_EXPORT extern void TFE_ContextOptionsSetServerDef(
+    TFE_ContextOptions* options, const void* proto, size_t proto_len,
+    TF_Status* status);
+
 // Destroy an options object.
 TF_CAPI_EXPORT extern void TFE_DeleteContextOptions(TFE_ContextOptions*);
 
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 49e1aab1cef9577256d9b081858cf094c788caf8..2b8384d72038c3b4a050c70b3e7c5e0ca0bd94f3 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -28,8 +28,8 @@ limitations under the License.
 
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
-#include "tensorflow/c/eager/runtime.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/attr_builder.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/eager_executor.h"
 #include "tensorflow/core/common_runtime/eager/eager_operation.h"
@@ -37,6 +37,14 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/eager/eager_client.h"
+#include "tensorflow/core/distributed_runtime/remote_device.h"
+#include "tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
+#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -51,6 +59,7 @@ struct TFE_ContextOptions {
   // true if async execution is enabled.
   bool async = false;
   TFE_ContextDevicePlacementPolicy policy{TFE_DEVICE_PLACEMENT_SILENT};
+  tensorflow::ServerDef server_def;
 };
 
 struct TFE_Context {
@@ -64,6 +73,23 @@ struct TFE_Context {
                     default_policy),
                 async, std::move(device_mgr), rendezvous) {}
 
+  explicit TFE_Context(
+      const tensorflow::SessionOptions& opts,
+      TFE_ContextDevicePlacementPolicy default_policy, bool async,
+      tensorflow::DeviceMgr* local_device_mgr,
+      tensorflow::Rendezvous* rendezvous,
+      std::unique_ptr<tensorflow::GrpcServer> server,
+      std::unique_ptr<tensorflow::eager::EagerClientCache> remote_eager_workers,
+      std::unique_ptr<tensorflow::DeviceMgr> remote_device_mgr,
+      const tensorflow::gtl::FlatMap<tensorflow::string, tensorflow::uint64>&
+          remote_contexts)
+      : context(opts,
+                static_cast<tensorflow::ContextDevicePlacementPolicy>(
+                    default_policy),
+                async, local_device_mgr, rendezvous, std::move(server),
+                std::move(remote_eager_workers), std::move(remote_device_mgr),
+                remote_contexts) {}
+
   tensorflow::EagerContext context;
 };
 
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 701175e4943d1d23532fe595319f67711316ed4d..49646bb73599d96fce2df90f918e692df7972aeb 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/c/eager/c_api.h"
 
 #include <string.h>
+#include "tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
@@ -23,7 +24,9 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
 #include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
 
 using tensorflow::string;
 
@@ -220,6 +223,103 @@ TEST(CAPI, Context) {
   TF_DeleteStatus(status);
 }
 
+tensorflow::ServerDef GetServerDef(int num_tasks) {
+  tensorflow::ServerDef server_def;
+  server_def.set_protocol("grpc");
+  server_def.set_job_name("localhost");
+  server_def.set_task_index(0);
+  tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster();
+  tensorflow::JobDef* job_def = cluster_def->add_job();
+  job_def->set_name("localhost");
+  for (int i = 0; i < num_tasks; i++) {
+    int port = tensorflow::testing::PickUnusedPortOrDie();
+    job_def->mutable_tasks()->insert(
+        {i, tensorflow::strings::StrCat("localhost:", port)});
+  }
+  return server_def;
+}
+
+void TestRemoteExecute(bool async) {
+  tensorflow::ServerDef server_def = GetServerDef(2);
+
+  // This server def has the task index set to 0.
+  string serialized = server_def.SerializeAsString();
+
+  server_def.set_task_index(1);
+
+  std::unique_ptr<tensorflow::eager::EagerGrpcServer> worker_server;
+  ASSERT_TRUE(
+      tensorflow::eager::EagerGrpcServer::Create(server_def, &worker_server)
+          .ok());
+  ASSERT_TRUE(worker_server->Start().ok());
+
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetServerDef(opts, serialized.data(), serialized.size(),
+                                 status);
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(1));
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle();
+  TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle();
+  const char remote_device_name[] =
+      "/job:localhost/replica:0/task:1/device:CPU:0";
+  auto* h0_task1 =
+      TFE_TensorHandleCopyToDevice(h0_task0, ctx, remote_device_name, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  auto* h1_task1 =
+      TFE_TensorHandleCopyToDevice(h1_task0, ctx, remote_device_name, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TFE_Op* matmul = MatMulOp(ctx, h0_task1, h1_task1);
+  TFE_OpSetDevice(matmul, remote_device_name, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TFE_TensorHandle* retvals[1];
+  int num_retvals = 1;
+  TFE_Execute(matmul, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  auto* retval_task0 = TFE_TensorHandleCopyToDevice(
+      retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retval_task0, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteTensorHandle(retval_task0);
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(7, product[0]);
+  EXPECT_EQ(10, product[1]);
+  EXPECT_EQ(15, product[2]);
+  EXPECT_EQ(22, product[3]);
+
+  TFE_DeleteTensorHandle(h0_task0);
+  TFE_DeleteTensorHandle(h1_task0);
+  TFE_DeleteTensorHandle(h0_task1);
+  TFE_DeleteTensorHandle(h1_task1);
+  TFE_DeleteTensorHandle(retvals[0]);
+
+  TFE_DeleteOp(matmul);
+
+  TFE_ContextAsyncWait(ctx, status);
+  TFE_DeleteContext(ctx, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_DeleteStatus(status);
+
+  // TODO(nareshmodi): Figure out how to correctly shut the server down.
+  worker_server.release();
+}
+
+TEST(CAPI, RemoteExecute) { TestRemoteExecute(false); }
+TEST(CAPI, RemoteExecuteAsync) { TestRemoteExecute(true); }
+
 TEST(CAPI, TensorHandle) {
   TFE_TensorHandle* h = TestMatrixTensorHandle();
   EXPECT_EQ(TF_FLOAT, TFE_TensorHandleDataType(h));
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index 8026076b9ef3bf07f6f84caf80329a1301c6a27f..1833b25fea0047c9652318e49599ba623daaec26 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -104,6 +104,10 @@ class VSpace {
       gtl::ArraySlice<Gradient*> output_gradients,
       std::vector<Gradient*>* result) const = 0;
 
+  // Marks the following gradient as a result so it's not consumed by backward
+  // functions.
+  virtual void MarkAsResult(Gradient* gradient) const = 0;
+
   // Deletes the input tensor.
   virtual void DeleteGradient(Gradient* gradient) const = 0;
 
@@ -130,13 +134,15 @@ class GradientTape {
     }
   }
 
-  bool ShouldRecord(gtl::ArraySlice<int64> tensor_ids);
+  bool ShouldRecord(gtl::ArraySlice<int64> tensor_ids,
+                    gtl::ArraySlice<tensorflow::DataType> dtypes);
 
   void Watch(int64 tensor_id);
 
   void RecordOperation(const string& op_type,
                        gtl::ArraySlice<TapeTensor> output_tensors,
                        gtl::ArraySlice<int64> input_tensor_id,
+                       gtl::ArraySlice<tensorflow::DataType> input_dtypes,
                        BackwardFunction* backward_function,
                        const std::function<void()>& backward_function_deleter);
 
@@ -170,12 +176,32 @@ class GradientTape {
 
 // Template instantiations here
 
+inline bool IsDtypeTrainable(DataType dtype) {
+  switch (dtype) {
+    case DT_HALF:
+    case DT_BFLOAT16:
+    case DT_FLOAT:
+    case DT_DOUBLE:
+    case DT_COMPLEX64:
+    case DT_COMPLEX128:
+    case DT_RESOURCE:
+    case DT_VARIANT:
+      return true;
+    default:
+      return false;
+  }
+}
+
 template <typename Gradient, typename BackwardFunction>
 bool GradientTape<Gradient, BackwardFunction>::ShouldRecord(
-    gtl::ArraySlice<int64> tensor_ids) {
-  for (int64 i : tensor_ids) {
-    if (tensor_tape_.find(i) != tensor_tape_.end()) {
-      return true;
+    gtl::ArraySlice<int64> tensor_ids,
+    gtl::ArraySlice<tensorflow::DataType> dtypes) {
+  CHECK_EQ(tensor_ids.size(), dtypes.size());
+  for (int i = 0; i < tensor_ids.size(); ++i) {
+    if (tensor_tape_.find(tensor_ids[i]) != tensor_tape_.end()) {
+      if (IsDtypeTrainable(dtypes[i])) {
+        return true;
+      }
     }
   }
   return false;
@@ -189,9 +215,11 @@ void GradientTape<Gradient, BackwardFunction>::Watch(int64 tensor_id) {
 template <typename Gradient, typename BackwardFunction>
 void GradientTape<Gradient, BackwardFunction>::RecordOperation(
     const string& op_type, gtl::ArraySlice<TapeTensor> output_tensors,
-    gtl::ArraySlice<int64> input_tensor_id, BackwardFunction* backward_function,
+    gtl::ArraySlice<int64> input_tensor_id,
+    gtl::ArraySlice<tensorflow::DataType> input_dtypes,
+    BackwardFunction* backward_function,
     const std::function<void()>& backward_function_deleter) {
-  if (!ShouldRecord(input_tensor_id)) {
+  if (!ShouldRecord(input_tensor_id, input_dtypes)) {
     backward_function_deleter();
     return;
   }
@@ -332,8 +360,7 @@ BackpropInitialState<BackwardFunction> PrepareBackprop(
         count_it->second++;
       } else {
         result.tensor_usage_counts[it] = 1;
-        if (sources_set.find(it) == sources_set.end() &&
-            tensor_tape.find(it) != tensor_tape.end()) {
+        if (tensor_tape.find(it) != tensor_tape.end()) {
           tensor_stack.push_back(it);
         }
       }
@@ -498,10 +525,15 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
         }
       } else {
         any_gradient_nonzero = true;
-        out_gradients.push_back(vspace.AggregateGradients(grad_it->second));
+        auto new_gradients = vspace.AggregateGradients(grad_it->second);
         if (sources_set.find(grad_it->first) == sources_set.end()) {
           gradients.erase(grad_it);
+        } else {
+          grad_it->second.clear();
+          grad_it->second.push_back(new_gradients);
+          vspace.MarkAsResult(new_gradients);
         }
+        out_gradients.push_back(new_gradients);
       }
     }
     std::vector<Gradient*> in_gradients;
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index 1b4c7c2688083e74433da3dce2849b8c37443684..fd7b6fe6625f27bda92e2f56f60908658cdecd7e 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -31,7 +31,6 @@ using ops::AddN;
 using ops::BatchMatMul;
 using ops::Const;
 using ops::Div;
-using ops::Greater;
 using ops::MatMul;
 using ops::Max;
 using ops::Maximum;
@@ -46,7 +45,6 @@ using ops::RealDiv;
 using ops::SquaredDifference;
 using ops::Sub;
 using ops::Sum;
-using ops::Where3;
 
 // TODO(andydavis) Test gradient function against numeric gradients output.
 // TODO(andydavis) As more gradients are added move common test functions
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 0cb3132e94e381f672d69aefe4a199d2b590830c..c73482d5f4d13ade0dc0412941251d1651371b6e 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -255,6 +255,53 @@ Status LRNGradHelper(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("LRN", LRNGradHelper);
 
+Status SoftplusGradHelper(const Scope& scope, const Operation& op,
+                          const std::vector<Output>& grad_inputs,
+                          std::vector<Output>* grad_outputs) {
+  auto dx = internal::SoftplusGrad(scope, grad_inputs[0], op.input(0));
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Softplus", SoftplusGradHelper);
+
+Status SoftsignGradHelper(const Scope& scope, const Operation& op,
+                          const std::vector<Output>& grad_inputs,
+                          std::vector<Output>* grad_outputs) {
+  auto dx = internal::SoftsignGrad(scope, grad_inputs[0], op.input(0));
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Softsign", SoftsignGradHelper);
+
+Status FractionalAvgPoolGradHelper(const Scope& scope, const Operation& op,
+                                   const std::vector<Output>& grad_inputs,
+                                   std::vector<Output>* grad_outputs) {
+  bool overlapping;
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping));
+  auto dx = internal::FractionalAvgPoolGrad(
+      scope, Shape(scope, op.input(0), Shape::OutType(DT_INT64)),
+      grad_inputs[0], op.output(1), op.output(2),
+      internal::FractionalAvgPoolGrad::Overlapping(overlapping));
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("FractionalAvgPool", FractionalAvgPoolGradHelper);
+
+Status FractionalMaxPoolGradHelper(const Scope& scope, const Operation& op,
+                                   const std::vector<Output>& grad_inputs,
+                                   std::vector<Output>* grad_outputs) {
+  bool overlapping;
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping));
+  auto dx = internal::FractionalMaxPoolGrad(
+      scope, op.input(0), op.output(0), grad_inputs[0], op.output(1),
+      op.output(2), internal::FractionalMaxPoolGrad::Overlapping(overlapping));
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("FractionalMaxPool", FractionalMaxPoolGradHelper);
+
 }  // anonymous namespace
 }  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index c4eba7ecb017fe4628140d75a63bc7f0f09deb7f..b4d457a9d14eb79232cda9412fa0050f6a9968cc 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -28,6 +28,8 @@ namespace {
 using ops::BiasAdd;
 using ops::Conv2D;
 using ops::Elu;
+using ops::FractionalAvgPool;
+using ops::FractionalMaxPool;
 using ops::L2Loss;
 using ops::LogSoftmax;
 using ops::LRN;
@@ -41,6 +43,8 @@ using ops::Relu;
 using ops::Relu6;
 using ops::Selu;
 using ops::Softmax;
+using ops::Softplus;
+using ops::Softsign;
 
 class NNGradTest : public ::testing::Test {
  protected:
@@ -71,22 +75,30 @@ class NNGradTest : public ::testing::Test {
     EXPECT_LT(max_error, 1e-3);
   }
 
-  // Sets tensor with random values, ensuring that the max value is largest by
-  // a reasonable amount.
-  // This is an issue for MaxPool, MaxPoolV2 and MaxPool3D, in which
-  // perturbations by the numeric gradient computation in the gradient checker
-  // can change the max value if values are too close together.
+  // Sets tensor with random values, ensuring that every pair of elements are at
+  // least a reasonable amount apart.
+  // This is an issue for max pooling operations, in which perturbations by the
+  // numeric gradient computation in the gradient checker can change the max
+  // value if a pool has values that are too close together.
   template <typename T>
-  void SetRandomValuesWithBumpedMax(Tensor* tensor) {
+  void SetRandomValuesForMaxPooling(Tensor* tensor) {
     auto tensor_flat = tensor->flat<T>();
-    tensor_flat.setRandom();
-    int32 max_index = 0;
-    for (size_t i = 1; i < tensor->NumElements(); i++) {
-      if (tensor_flat(i) > tensor_flat(max_index)) {
-        max_index = i;
-      }
+    // First set the array to an increasing sequence of values spaced
+    // a reasonable amount apart
+    T cur = 0;
+    for (size_t i = 0; i < tensor->NumElements(); i++) {
+      tensor_flat(i) = cur;
+      cur += 5e-2;
+    }
+    // Fischer-Yates shuffle the array
+    for (size_t i = tensor->NumElements() - 1; i >= 1; i--) {
+      // j <- random integer 0 <= j <= i
+      size_t j = random::New64() % (i + 1);
+      // swap values at i, j
+      T tmp = tensor_flat(i);
+      tensor_flat(i) = tensor_flat(j);
+      tensor_flat(j) = tmp;
     }
-    tensor_flat(max_index) += 1e-2;
   }
 
   Scope scope_;
@@ -189,7 +201,7 @@ TEST_F(NNGradTest, MaxPoolGradHelper) {
   const std::vector<int> strides{1, 2, 2, 1};
   auto y = MaxPool(scope_, x, ksize, strides, "VALID");
   Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
-  SetRandomValuesWithBumpedMax<float>(&x_init_value);
+  SetRandomValuesForMaxPooling<float>(&x_init_value);
   RunTest(x, x_init_value, y, y_shape);
 }
 
@@ -202,7 +214,7 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) {
   Tensor strides = test::AsTensor<int>({1, 2, 2, 1}, {4});
   auto y = MaxPoolV2(scope_, x, ksize, strides, "VALID");
   Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
-  SetRandomValuesWithBumpedMax<float>(&x_init_value);
+  SetRandomValuesForMaxPooling<float>(&x_init_value);
   RunTest(x, x_init_value, y, y_shape);
 }
 
@@ -215,7 +227,7 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) {
   const std::vector<int> strides{1, 3, 3, 3, 1};
   auto y = MaxPool3D(scope_, x, ksize, strides, "VALID");
   Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
-  SetRandomValuesWithBumpedMax<float>(&x_init_value);
+  SetRandomValuesForMaxPooling<float>(&x_init_value);
   RunTest(x, x_init_value, y, y_shape);
 }
 
@@ -248,5 +260,45 @@ TEST_F(NNGradTest, LRN){
   RunTest(x, x_shape, y, x_shape);
 }
 
+TEST_F(NNGradTest, SoftplusGrad) {
+  TensorShape shape({3, 7});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  auto y = Softplus(scope_, x);
+  RunTest(x, shape, y, shape);
+}
+
+TEST_F(NNGradTest, SoftsignGrad) {
+  TensorShape shape({3, 7});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  auto y = Softsign(scope_, x);
+  RunTest(x, shape, y, shape);
+}
+
+TEST_F(NNGradTest, FractionalAvgPoolGradHelper) {
+  TensorShape x_shape({1, 3, 7, 1});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+  // Force consistent pooling regions for unit testing.
+  auto y = FractionalAvgPool(
+      scope_, x, {1, 1.2, 1.9, 1},
+      FractionalAvgPool::Deterministic(true).Overlapping(true).Seed(1).Seed2(
+          2));
+  TensorShape y_shape({1, 2, 3, 1});
+  RunTest(x, x_shape, y.output, y_shape);
+}
+
+TEST_F(NNGradTest, FractionalMaxPoolGradHelper) {
+  TensorShape x_shape({1, 3, 7, 1});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+  // Force consistent pooling regions for unit testing.
+  auto y = FractionalMaxPool(
+      scope_, x, {1, 1.2, 1.9, 1},
+      FractionalMaxPool::Deterministic(true).Overlapping(true).Seed(1).Seed2(
+          2));
+  Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
+  SetRandomValuesForMaxPooling<float>(&x_init_value);
+  TensorShape y_shape({1, 2, 3, 1});
+  RunTest(x, x_init_value, y.output, y_shape);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/tools/freeze_saved_model.cc b/tensorflow/cc/tools/freeze_saved_model.cc
index 4ddddcb5863c9ffb1e5367db750b0d2ffd29cd5e..23e9dc40d23899b9cef168c9128b6d8ed1be3ee9 100644
--- a/tensorflow/cc/tools/freeze_saved_model.cc
+++ b/tensorflow/cc/tools/freeze_saved_model.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/cc/tools/freeze_saved_model.h"
 
+#include <iostream>
 #include <queue>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -71,6 +72,15 @@ void GetNodeNameToNodeDefMap(
   }
 }
 
+// Strips off the tensor part of the tensor_name to get the node_name.
+const string GetNodeNameFromTensorName(string tensor_name) {
+  if (tensor_name[0] == '^') {
+    tensor_name.erase(0, 1);
+  }
+  std::vector<string> tensor_name_parts = str_util::Split(tensor_name, ':');
+  return tensor_name_parts[0];
+}
+
 // Gets the set of node names needed by `outputs` and the corresponding set of
 // variable nodes to convert.
 void GetReachableNodesAndVariables(
@@ -83,10 +93,8 @@ void GetReachableNodesAndVariables(
       new std::unordered_set<string>({"Variable", "VariableV2", "VarHandleOp"});
 
   std::queue<string> nodes_to_visit;
-  for (const string& tensor_name : outputs) {
-    // We need to strip off the tensor part to get the node name.
-    std::vector<string> tensor_name_parts = str_util::Split(tensor_name, ':');
-    nodes_to_visit.push(tensor_name_parts[0]);
+  for (const string& output_tensor_name : outputs) {
+    nodes_to_visit.push(GetNodeNameFromTensorName(output_tensor_name));
   }
   // We do a traversal backwards from the outputs specified in the MetaGraphDef.
   while (!nodes_to_visit.empty()) {
@@ -100,8 +108,8 @@ void GetReachableNodesAndVariables(
     if (kVariableTypes->find(node->op()) != kVariableTypes->end()) {
       variable_node_names->insert(node->name());
     }
-    for (const string& input : node->input()) {
-      nodes_to_visit.push(input);
+    for (const string& input_tensor_name : node->input()) {
+      nodes_to_visit.push(GetNodeNameFromTensorName(input_tensor_name));
     }
   }
 }
diff --git a/tensorflow/cc/tools/freeze_saved_model_test.cc b/tensorflow/cc/tools/freeze_saved_model_test.cc
index cd35fd3b95deec669218cfa4f25fea2c3ac9e56e..979b23c3fc5f66ec574736cb4d39cec0ffd8e6b6 100644
--- a/tensorflow/cc/tools/freeze_saved_model_test.cc
+++ b/tensorflow/cc/tools/freeze_saved_model_test.cc
@@ -351,6 +351,56 @@ TEST_F(FreezeTest, GraphDefWithNoVariables) {
   GraphDefEqual(frozen_graph_def, graph_def);
 }
 
+TEST_F(FreezeTest, GraphDefWithMultiOutputOperation) {
+  // Tensors from operations with multiple outputs get tensor suffixes when used
+  // in input fields of following nodes, i.e. split:0, split:1.
+  // Test that we traverse those correctly.
+  SavedModelBundle saved_model_bundle;
+  GraphDef graph_def;
+  Scope scope = Scope::NewRootScope();
+  Output a = ops::Const(scope.WithOpName("a"), {10.0f, 10.0f}, {2});
+  Output axis = ops::Const(scope.WithOpName("axis"), 0, {});
+  OutputList split = ops::Split(scope.WithOpName("split"), axis, a, 2).output;
+  Output b = ops::Const(scope.WithOpName("b"), 10.0f, {});
+  Output c = ops::Mul(scope.WithOpName("c"), split[1], b);
+  TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
+  TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(graph_def, {"c:0"}, "",
+                                                        &saved_model_bundle));
+
+  GraphDef frozen_graph_def;
+  std::unordered_set<string> inputs;
+  std::unordered_set<string> outputs;
+  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
+                                &outputs));
+
+  GraphDefEqual(frozen_graph_def, graph_def);
+}
+
+TEST_F(FreezeTest, GraphDefWithControlDependency) {
+  // Inputs that are control dependencies get tensor prefixes,
+  // i.e. ^control_dependency.
+  // Test that we traverse those correctly.
+  SavedModelBundle saved_model_bundle;
+  GraphDef graph_def;
+  Scope scope = Scope::NewRootScope();
+  Output source = ops::Const(scope.WithOpName("source"), 10.0f, {});
+  Output a = ops::Const(scope.WithOpName("a").WithControlDependencies(source),
+                        {10.0f, 10.0f}, {2});
+  Output b = ops::Const(scope.WithOpName("b"), 10.0f, {});
+  Output c = ops::Mul(scope.WithOpName("c"), a, b);
+  TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
+  TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(graph_def, {"c:0"}, "",
+                                                        &saved_model_bundle));
+
+  GraphDef frozen_graph_def;
+  std::unordered_set<string> inputs;
+  std::unordered_set<string> outputs;
+  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
+                                &outputs));
+
+  GraphDefEqual(frozen_graph_def, graph_def);
+}
+
 TEST_F(FreezeTest, GraphDefWithoutDependentVariables) {
   TestFreezeGraphWithoutDependentVariables(false);
 }
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 19e6bf68e77725bb3cae4e1d338c52dff472cb18..2119c8ec47f941a76e81346ae5d20da78eae11a3 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -214,7 +214,6 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
         "@llvm//:core",
-        "@llvm//:execution_engine",
         "@llvm//:support",
         "@llvm//:target",
     ],
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 6e050cf56494e6d26e3647e3261a657eeaad64fa..6641d45e83020f4144616a6a2837c844330298f5 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -56,9 +56,9 @@ namespace bar {
 //
 // Memory stats:
 //   arg bytes total:    104
-//   arg bytes aligned:  128
+//   arg bytes aligned:  192
 //   temp bytes total:   126
-//   temp bytes aligned: 224
+//   temp bytes aligned: 320
 class MyClass : public tensorflow::XlaCompiledCpuFunction {
  public:
   // Number of input arguments for the compiled computation.
diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.cc b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
index 63d22de1ca4aa0872b6fad3e0ac0182306d7cb8c..4e27aafec7747655d8e4ea3ddd1788d495ca0710 100644
--- a/tensorflow/compiler/aot/embedded_protocol_buffers.cc
+++ b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
@@ -82,7 +82,8 @@ static StatusOr<string> CodegenModule(llvm::TargetMachine* target_machine,
   llvm::legacy::PassManager codegen_passes;
 
   if (target_machine->addPassesToEmitFile(
-          codegen_passes, ostream, llvm::TargetMachine::CGFT_ObjectFile)) {
+          codegen_passes, ostream, nullptr,
+          llvm::TargetMachine::CGFT_ObjectFile)) {
     return xla::InternalError(
         "Could not create pass pipeline to generate object file");
   }
diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h
index ebfe4806c203e901358d5c5096c10c03d4c738c3..4e194a6aba9a9efcad27c47c42e148d8e537ae68 100644
--- a/tensorflow/compiler/aot/embedded_protocol_buffers.h
+++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h
@@ -71,7 +71,7 @@ struct ProtobufToEmbed {
   const ::tensorflow::protobuf::MessageLite* message;
 };
 
-// Embeds a a sequence of protocol buffers into an object file.
+// Embeds a sequence of protocol buffers into an object file.
 //
 // `target_triple` is the target triple for the target architecture for the
 // generated object file.
diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h
index d085864f0012e4de55685bb46961417bb3070e6f..d1a669ceb17b9fd71d26e978035283f8824b0376 100644
--- a/tensorflow/compiler/aot/runtime.h
+++ b/tensorflow/compiler/aot/runtime.h
@@ -25,8 +25,8 @@ namespace tensorflow {
 namespace tfcompile {
 namespace runtime {
 
-// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
-static constexpr size_t kAlign = 32;
+// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
+static constexpr size_t kAlign = 64;
 
 // aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1
 // values.  There are `n` entries in `sizes`.  Each buffer is aligned to kAlign
diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc
index 6d603a02eb4ceade6832ba67b2981814ee25327a..06ec623eb2dce5f8dc7156fb7e7b9ad57d90c8ee 100644
--- a/tensorflow/compiler/aot/runtime_test.cc
+++ b/tensorflow/compiler/aot/runtime_test.cc
@@ -24,7 +24,7 @@ namespace runtime {
 namespace {
 
 TEST(Runtime, AlignmentValue) {
-  // We've chosen 32 byte alignment for the tfcompile runtime to mimic the
+  // We've chosen 64 byte alignment for the tfcompile runtime to mimic the
   // regular tensorflow allocator, which was chosen to play nicely with Eigen.
   // The tfcompile runtime also has a requirement that comes from the xla
   // generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8
@@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) {
   EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0);
 
   static constexpr intptr_t sizesB[1] = {3};
-  EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32);
+  EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64);
 
   static constexpr intptr_t sizesC[1] = {32};
-  EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32);
+  EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64);
 
   static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3};
-  EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192);
+  EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320);
 }
 
 void* add_ptr(void* base, uintptr_t delta) {
@@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) {
   EXPECT_NE(base, nullptr);
   EXPECT_EQ(bufD[0], add_ptr(base, 0));
   EXPECT_EQ(bufD[1], nullptr);
-  EXPECT_EQ(bufD[2], add_ptr(base, 32));
+  EXPECT_EQ(bufD[2], add_ptr(base, 64));
   EXPECT_EQ(bufD[3], nullptr);
-  EXPECT_EQ(bufD[4], add_ptr(base, 64));
-  EXPECT_EQ(bufD[5], add_ptr(base, 128));
-  EXPECT_EQ(bufD[6], add_ptr(base, 160));
+  EXPECT_EQ(bufD[4], add_ptr(base, 128));
+  EXPECT_EQ(bufD[5], add_ptr(base, 192));
+  EXPECT_EQ(bufD[6], add_ptr(base, 256));
   for (int i = 0; i < 7; ++i) {
     const intptr_t size = sizesD[i];
     if (size != -1) {
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 222e26810ac1157152ea81a56749b6652aa1f137..fd2cf2b67d4618dd626b8eef78eed044d7fde0a4 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -15,6 +15,7 @@ test_suite(
         ":test_graph_tfadd_with_ckpt_saver_test",
         ":test_graph_tfadd_with_ckpt_test",
         ":test_graph_tfassert_eq_test",
+        ":test_graph_tfcond_test",
         ":test_graph_tffunction_test",
         ":test_graph_tfgather_test",
         ":test_graph_tfmatmul_test",
@@ -55,6 +56,7 @@ genrule(
         "test_graph_tfadd_with_ckpt_saver.pb",
         "test_graph_tfadd_with_ckpt_saver.saver",
         "test_graph_tfassert_eq.pb",
+        "test_graph_tfcond.pb",
         "test_graph_tffunction.pb",
         "test_graph_tfgather.pb",
         "test_graph_tfmatmul.pb",
@@ -118,6 +120,17 @@ tf_library(
     ],
 )
 
+tf_library(
+    name = "test_graph_tfcond",
+    testonly = 1,
+    config = "test_graph_tfcond.config.pbtxt",
+    cpp_class = "CondComp",
+    graph = "test_graph_tfcond.pb",
+    tags = [
+        "manual",
+    ],
+)
+
 tf_library(
     name = "test_graph_tffunction",
     testonly = 1,
@@ -194,6 +207,7 @@ tf_cc_test(
         ":test_graph_tfadd_with_ckpt",
         ":test_graph_tfadd_with_ckpt_saver",
         ":test_graph_tfassert_eq",
+        ":test_graph_tfcond",
         ":test_graph_tffunction",
         ":test_graph_tfgather",
         ":test_graph_tfmatmul",
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index 67767f55dae9b15aafbd8b129328bde2c59a9ef3..9ec7df163b1425f917e9ec51559efad3e6f05e75 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -78,6 +78,22 @@ def tfadd_with_ckpt_saver(out_dir):
       f.write(saver.as_saver_def().SerializeToString())
 
 
+def tfassert_eq(_):
+  x = array_ops.placeholder(dtypes.int32, name='x_hold')
+  y = array_ops.placeholder(dtypes.int32, name='y_hold')
+  control_flow_ops.Assert(
+      math_ops.equal(x, y), ['Expected x == y.'], name='assert_eq')
+  math_ops.add(x, math_ops.negative(y), name='x_y_diff')
+
+
+def tfcond(_):
+  p = array_ops.placeholder(dtypes.bool, name='p_hold')
+  x = array_ops.placeholder(dtypes.int32, name='x_hold')
+  y = array_ops.placeholder(dtypes.int32, name='y_hold')
+  z = control_flow_ops.cond(p, lambda: x, lambda: y)
+  array_ops.identity(z, name='result')
+
+
 def tfgather(_):
   params = array_ops.placeholder(dtypes.float32, name='params')
   indices = array_ops.placeholder(dtypes.int32, name='indices')
@@ -126,14 +142,6 @@ def tfsplits(_):
   array_ops.identity(y, name='result')
 
 
-def tfassert_eq(_):
-  x = array_ops.placeholder(dtypes.int32, name='x_hold')
-  y = array_ops.placeholder(dtypes.int32, name='y_hold')
-  control_flow_ops.Assert(
-      math_ops.equal(x, y), ['Expected x == y.'], name='assert_eq')
-  math_ops.add(x, math_ops.negative(y), name='x_y_diff')
-
-
 def write_graph(build_graph, out_dir):
   """Build a graph using build_graph and write it out."""
   g = ops.Graph()
@@ -148,12 +156,13 @@ def main(_):
   write_graph(tfadd, FLAGS.out_dir)
   write_graph(tfadd_with_ckpt, FLAGS.out_dir)
   write_graph(tfadd_with_ckpt_saver, FLAGS.out_dir)
+  write_graph(tfassert_eq, FLAGS.out_dir)
+  write_graph(tfcond, FLAGS.out_dir)
+  write_graph(tffunction, FLAGS.out_dir)
   write_graph(tfgather, FLAGS.out_dir)
   write_graph(tfmatmul, FLAGS.out_dir)
   write_graph(tfmatmulandadd, FLAGS.out_dir)
-  write_graph(tffunction, FLAGS.out_dir)
   write_graph(tfsplits, FLAGS.out_dir)
-  write_graph(tfassert_eq, FLAGS.out_dir)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/compiler/aot/tests/test_graph_tfcond.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tfcond.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94a01ad4abfaab5e4b087b7cc219e86c1d0179b8
--- /dev/null
+++ b/tensorflow/compiler/aot/tests/test_graph_tfcond.config.pbtxt
@@ -0,0 +1,20 @@
+# Text form of tensorflow.tf2xla.Config proto.
+feed {
+  id { node_name: "p_hold" }
+  shape {}
+}
+feed {
+  id { node_name: "x_hold" }
+  shape {
+    dim { size: 1 }
+  }
+}
+feed {
+  id { node_name: "y_hold" }
+  shape {
+    dim { size: 1 }
+  }
+}
+fetch {
+  id { node_name: "result" }
+}
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 27ba42b31fc2504a570584a9881c032582731baf..fee46280e9a0e7ba2cf7c3ed46469ae8cc0841d4 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt_saver.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfassert_eq.h"
+#include "tensorflow/compiler/aot/tests/test_graph_tfcond.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tffunction.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfgather.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfmatmul.h"
@@ -39,7 +40,7 @@ namespace tfcompile {
 namespace {
 
 using ::testing::HasSubstr;
-using ::testing::UnorderedElementsAre;
+using ::testing::IsSupersetOf;
 
 TEST(TFCompileTest, Add) {
   AddComp add;
@@ -150,6 +151,31 @@ TEST(TFCompileTest, AddWithCkptSaver) {
   EXPECT_EQ(add_const.result0_data(), add_const.results()[0]);
 }
 
+TEST(TFCompileTest, Cond) {
+  CondComp cond;
+  EXPECT_EQ(cond.arg0_data(), cond.args()[0]);
+  EXPECT_EQ(cond.arg1_data(), cond.args()[1]);
+  EXPECT_EQ(cond.arg2_data(), cond.args()[2]);
+  cond.arg1() = 10;
+  cond.arg2() = 20;
+  {
+    cond.arg0() = true;
+    const int32 expected_result = cond.arg1();
+    EXPECT_TRUE(cond.Run());
+    EXPECT_EQ(cond.result0(), expected_result);
+    EXPECT_EQ(cond.result0_data()[0], expected_result);
+    EXPECT_EQ(cond.result0_data(), cond.results()[0]);
+  }
+  {
+    cond.arg0() = false;
+    const int32 expected_result = cond.arg2();
+    EXPECT_TRUE(cond.Run());
+    EXPECT_EQ(cond.result0(), expected_result);
+    EXPECT_EQ(cond.result0_data()[0], expected_result);
+    EXPECT_EQ(cond.result0_data(), cond.results()[0]);
+  }
+}
+
 TEST(TFCompileTest, Gather) {
   GatherComp gather;
   EXPECT_EQ(gather.arg0_data(), gather.args()[0]);
@@ -525,25 +551,20 @@ TEST(TFCompileTest, HloProfiling) {
   auto header = HasSubstr("Execution profile for");
   auto total_cycles_profile_line = HasSubstr("[total]");
   auto dot_profile_line = HasSubstr(
-      "%dot.0.2 = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0.0.0, f32[2,2]{1,0} "
+      "%dot.0.4 = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0.0.0, f32[2,2]{1,0} "
       "%arg1.0.1)");
   auto add_profile_line = HasSubstr(
-      "%add.0.5 = f32[2,2]{1,0} add(f32[2,2]{1,0} %arg0.0.0, f32[2,2]{1,0} "
+      "%add.0.6 = f32[2,2]{1,0} add(f32[2,2]{1,0} %arg0.0.0, f32[2,2]{1,0} "
       "%arg1.0.1)");
   auto tuple_profile_line = HasSubstr(
       "%tuple.0.8 = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(f32[2,2]{1,0} "
-      "%dot.0.2, f32[2,2]{1,0} %add.0.5)");
+      "%dot.0.4, f32[2,2]{1,0} %add.0.6)");
   auto arg0_profile_line = HasSubstr("%arg0.0.0 = f32[2,2]{1,0} parameter(0)");
   auto arg1_profile_line = HasSubstr("%arg1.0.1 = f32[2,2]{1,0} parameter(1)");
 
-  hlo_profile_lines.erase(hlo_profile_lines.begin() + 7,
-                          hlo_profile_lines.end());
-
-  EXPECT_THAT(
-      hlo_profile_lines,
-      UnorderedElementsAre(header, total_cycles_profile_line, dot_profile_line,
-                           add_profile_line, tuple_profile_line,
-                           arg0_profile_line, arg1_profile_line));
+  EXPECT_THAT(hlo_profile_lines,
+              IsSupersetOf({header, total_cycles_profile_line, dot_profile_line,
+                            add_profile_line, tuple_profile_line}));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 07136d6a746604f148b62e48480a4fa0d253927d..980e0eec9e23b15a97b826067bac08053a437712 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -124,7 +124,6 @@ cc_library(
     srcs = ["xla_tensor.cc"],
     hdrs = ["xla_tensor.h"],
     deps = [
-        ":common",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:shaped_buffer",
@@ -176,11 +175,11 @@ cc_library(
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:constant_op",
         "//tensorflow/core/kernels:control_flow_ops",
+        "//tensorflow/core/kernels:identity_n_op",
         "//tensorflow/core/kernels:identity_op",
         "//tensorflow/core/kernels:no_op",
         "//tensorflow/core/kernels:sendrecv_ops",
         "//tensorflow/core/kernels:variable_ops",
-        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -217,6 +216,7 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:gpu_runtime",
@@ -261,6 +261,7 @@ cc_library(
     name = "create_xla_launch_op",
     srcs = [
         "create_xla_launch_op.cc",
+        "create_xla_launch_op.h",
     ],
     deps = [
         ":common",
@@ -270,6 +271,27 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+    alwayslink = 1,
+)
+
+tf_cc_test(
+    name = "create_xla_launch_op_test",
+    srcs = [
+        "create_xla_launch_op.h",
+        "create_xla_launch_op_test.cc",
+    ],
+    deps = [
+        ":create_xla_launch_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ],
 )
 
diff --git a/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc b/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc
index 9a2bb0007527557f79b70ad2b9c9576af2ab10ea..b17ff589e2597f8d1b5e61f4eaaed7d6ebe6214c 100644
--- a/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc
@@ -40,7 +40,7 @@ static Status BuildLaunchNode(
     Graph* graph, Node** node) {
   NodeDef def;
   def.set_name(graph->NewName(nodename));
-  def.set_op("_XlaLaunch");
+  def.set_op("XlaLaunch");
   def.set_device(device_name);
   AddNodeAttr("Tconstants", constant_dtypes, &def);
   AddNodeAttr("Targs", arg_dtypes, &def);
@@ -79,7 +79,7 @@ static Status ReplaceNodeWithXlaLaunch(Graph* graph, Node* node) {
       node->input_types().begin() + num_constant_args,
       node->input_types().begin() + num_constant_args + num_nonconst_args);
 
-  // Build a _XlaLaunch operator to execute the function body.
+  // Build a XlaLaunch operator to execute the function body.
   Node* launch_node;
   TF_RETURN_IF_ERROR(BuildLaunchNode(
       graph->NewName(node->name()), node->type_string(), node->def().attr(),
diff --git a/tensorflow/compiler/jit/create_xla_launch_op.cc b/tensorflow/compiler/jit/create_xla_launch_op.cc
index 18d901323f108505979be484c2bfad5998ab0748..731b8ebfdc6262500940274c94a03ae7c0376096 100644
--- a/tensorflow/compiler/jit/create_xla_launch_op.cc
+++ b/tensorflow/compiler/jit/create_xla_launch_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/compiler/jit/create_xla_launch_op.h"
 
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/kernels/xla_launch_op.h"
@@ -21,82 +22,194 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace {
 
-// Givens a NodeDef 'ndef' and the function library runtime 'flr', if
-// 'ndef' is a call to a compilable function defined in 'flr', returns OK
-// and fills in 'kernel' with a XlaLaunchOp kernel which computes the
-// node. Otherwise, returns a non-OK.
+// Utility which searches for values in a sorted list by scanning over it once.
+// No matter how many times ScanForValue is called, the list is scanned at most
+// once. However, if a call to ScanForValue skips over a value, that value is
+// not revisited in future calls to ScanForValue, so callers must take
+// care to order their calls.
 //
-// This routine is here so that FunctionLibraryRuntime can jit a
-// specific function call as requested.
-Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& ndef,
-                         std::unique_ptr<OpKernel>* kernel) {
-  bool xla_compile = false;
-  if (!flr->GetFunctionLibraryDefinition()
-           ->GetAttr(ndef, kXlaCompileAttr, &xla_compile)
-           .ok() ||
-      !xla_compile) {
-    // Not marked as _XlaCompile=true.
-    return errors::InvalidArgument("No ", kXlaCompileAttr, " for ", ndef.op());
+// Useful for merging multiple sorted lists in O(n) time.
+class SinglePassSearch {
+ public:
+  // Creates a SinglePassSearch object that can be used to search in `values`.
+  // Does not take ownership of `values`. `values` must outlive this.
+  // `values` must be sorted.
+  explicit SinglePassSearch(const std::vector<int>* values)
+      : current_index_(0), values_(values) {}
+
+  // Scans forward in the vector looking for "value", updating the internal
+  // position in to the vector.
+  // Returns true iff the vector contains the given value at or after current
+  // position.
+  // Not thread-safe.
+  bool ScanForValue(int value) {
+    while (current_index_ < values_->size() &&
+           (*values_)[current_index_] <= value) {
+      if ((*values_)[current_index_] == value) {
+        current_index_++;
+        return true;
+      }
+      current_index_++;
+    }
+    return false;
   }
-  // Make sure that kernels have been registered on the JIT device.
-  XlaOpRegistry::RegisterCompilationKernels();
-  if (!IsCompilable(flr, ndef)) {
-    // ndef is calling a function that XLA can't compile.
-    return errors::InvalidArgument("Not compilable: ", ndef.ShortDebugString());
+
+ private:
+  int current_index_;
+  const std::vector<int>* values_;
+};
+
+Status CompilationRequested(const FunctionLibraryRuntime& flr,
+                            const NodeDef& node_def) {
+  bool xla_compile = false;
+  // Check if op is marked _XlaCompile=true.
+  Status status = flr.GetFunctionLibraryDefinition()->GetAttr(
+      node_def, kXlaCompileAttr, &xla_compile);
+  if (!status.ok() || !xla_compile) {
+    if (VLOG_IS_ON(3)) {
+      if (!status.ok()) {
+        VLOG(3) << "No " << kXlaCompileAttr << " attr defined for "
+                << node_def.op() << ". status=" << status.ToString();
+      } else {
+        VLOG(3) << node_def.op() << " is explicitly marked not to be compiled";
+      }
+    }
+    return Status(error::INVALID_ARGUMENT, "");
   }
+  return Status::OK();
+}
+
+// Given a FunctionLibraryRuntime and a NodeDef calling a function in the
+// runtime, returns this function's body in `fbody` as well as the indices
+// of its constant and resource arguments.
+// `fbody` is owned by `flr`.
+// `constant_arg_indices` and `resource_arg_indices` should be empty vector.
+// They are sorted in ascending order on this function's return.
+Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr,
+                                       const NodeDef& node_def,
+                                       const FunctionBody** fbody,
+                                       std::vector<int>* constant_arg_indices,
+                                       std::vector<int>* resource_arg_indices) {
   FunctionLibraryRuntime::Handle handle;
-  // If ndef is not instantiable, e.g., the function does not exist,
+  // If node_def is not instantiable, e.g., the function does not exist,
   // simply bail out.
   TF_RETURN_IF_ERROR(
-      flr->Instantiate(ndef.op(), AttrSlice(&ndef.attr()), &handle));
-  const FunctionBody* fbody = flr->GetFunctionBody(handle);
-  CHECK(fbody);  // Can't be nullptr since we just instantiated it.
-  std::vector<bool> const_args(fbody->arg_types.size());
+      flr->Instantiate(node_def.op(), AttrSlice(&node_def.attr()), &handle));
+  *fbody = flr->GetFunctionBody(handle);
+  CHECK(*fbody);  // Can't be nullptr since we just instantiated it.
+  const DataTypeVector& arg_types = (*fbody)->arg_types;
+  std::vector<bool> const_args(arg_types.size());
   // If we can't analyze the const args. Bail out.
-  TF_RETURN_IF_ERROR(BackwardsConstAnalysis(*(fbody->graph), &const_args));
+  TF_RETURN_IF_ERROR(BackwardsConstAnalysis(*((*fbody)->graph), &const_args));
 
   for (int i = 0; i < const_args.size(); ++i) {
     if (const_args[i]) {
-      // There is a const arg. Bail out.
-      return errors::InvalidArgument("Const arg: ", i, " in ",
-                                     DebugString(fbody->fdef));
+      constant_arg_indices->push_back(i);
+    }
+  }
+
+  // There can be hundreds of resource variables. Reserve the space for them.
+  // We don't reserve for constants above as they are usually few.
+  resource_arg_indices->reserve(arg_types.size());
+  for (int i = 0; i < arg_types.size(); ++i) {
+    if (arg_types[i] == DT_RESOURCE) {
+      resource_arg_indices->push_back(i);
     }
   }
 
-  NodeDef launch_def;
-  launch_def.set_name(ndef.name());
-  launch_def.set_op("_XlaLaunch");
-  launch_def.set_device(flr->device()->name());
-  AddNodeAttr("Tconstants", DataTypeVector{}, &launch_def);
-  AddNodeAttr("Nresources", 0, &launch_def);
-  AddNodeAttr("Targs", fbody->arg_types, &launch_def);
-  AddNodeAttr("Tresults", fbody->ret_types, &launch_def);
-  NameAttrList func;
-  func.set_name(ndef.op());
-  *(func.mutable_attr()) = ndef.attr();
-  AddNodeAttr("function", func, &launch_def);
-
-  // TODO(b/32387911): Handles the host memory types across function
-  // calls properly. For now, we assume all inputs and outputs are on
-  // the device memory.
+  return Status::OK();
+}
+
+}  // namespace
+
+Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& node_def,
+                         std::unique_ptr<OpKernel>* kernel) {
+  TF_RETURN_IF_ERROR(CompilationRequested(*flr, node_def));
+
+  VLOG(3) << "Creating XlaLaunchOp for " << node_def.DebugString();
+
+  // Make sure that kernels have been registered on the JIT device.
+  XlaOpRegistry::RegisterCompilationKernels();
+  if (!IsCompilable(flr, node_def)) {
+    // node_def is calling a function that XLA can't compile.
+    return errors::InvalidArgument("Not compilable: ",
+                                   node_def.ShortDebugString());
+  }
+
+  // Get function body, constant args, and resource args.
+  const FunctionBody* fbody = nullptr;
+  std::vector<int> constant_arg_indices;
+  std::vector<int> resource_arg_indices;
+  TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources(
+      flr, node_def, &fbody, &constant_arg_indices, &resource_arg_indices));
+
+  // Set input and output memory types.
   MemoryTypeVector input_memory_types(fbody->arg_types.size(), DEVICE_MEMORY);
+  // These indices are used only for optimization purposes. They allow us
+  // to loop over constant_arg_indices and resource_arg_indices only once
+  // while iterating over all the function arguments checking if it is a
+  // resource or a constant.
+  // The reason we optimized this code is because functions can have a lot of
+  // captured arguments. For example, the backward pass of ResNet50 takes in all
+  // 214 variables and a similar number of activations.
+  SinglePassSearch constants_search(&constant_arg_indices);
+  SinglePassSearch resources_search(&resource_arg_indices);
+  for (int i = 0; i < fbody->arg_types.size(); ++i) {
+    if (resources_search.ScanForValue(i) || constants_search.ScanForValue(i)) {
+      // Compile-time constants and resource handles are expected to be in
+      // host memory.
+      input_memory_types[i] = HOST_MEMORY;
+    }
+  }
+  // One might wonder, about the case where a compile-time constant argument
+  // (which must be in host memory) is also used as an input into an op,
+  // e.g. Add, that expects its inputs in device memory. Here is how it
+  // works now.
+  // First, what do we mean by "op expects an input in XYZ memory"?
+  // There are two types of "ops" here: the tf2xla kernel and the HLO
+  // computation it builds. The tf2xla kernel needs to retrieve the actual
+  // numeric value of the compile-time constant tensors, so it really expects
+  // them to be on in host memory. However, for other inputs, it refers to them
+  // using xla::ComputationDataHandle, which is just a symbolic handle that
+  // xla::ComputationBuilder assigns. How does this handle gets assigned for
+  // constant arguments? Even constant arguments get an _Arg node in the graph
+  // instatiated for Function compilation. The tf2xla kernel for constant _Arg
+  // nodes takes the constant value, converts it to XlaLiteral, and feeds it
+  // to xla::ComputationBuilder.ConstantLiteral, which returns the handle. This
+  // constant XlaLiteral is included in the HLO graph, and subsequently, in
+  // the actual executable, which is copied to the device before being
+  // executed. Thus, when this executable runs, the constant is available in
+  // device memory.
+
+  // XlaLaunch kernel keeps all outputs (including constants, which it copies),
+  // in device memory
   MemoryTypeVector output_memory_types(fbody->ret_types.size(), DEVICE_MEMORY);
 
+  // Create the kernel.
+  NameAttrList function;
+  function.set_name(node_def.op());
+  *(function.mutable_attr()) = node_def.attr();
+
   Device* dev = flr->device();
   Status s;
   OpKernelConstruction construction(
       DeviceType(dev->device_type()), dev,
-      dev->GetAllocator(AllocatorAttributes()), &launch_def,
+      dev->GetAllocator(AllocatorAttributes()), &node_def,
       &fbody->fdef.signature(), flr, fbody->arg_types, input_memory_types,
       fbody->ret_types, output_memory_types, flr->graph_def_version(), &s);
-  kernel->reset(new XlaLocalLaunchOp(&construction));
+
+  *kernel = MakeUnique<XlaLocalLaunchBase>(&construction, constant_arg_indices,
+                                           resource_arg_indices, function);
   return s;
 }
 
+namespace {
+
 bool RegisterLaunchOpCreator() {
   RegisterDefaultCustomKernelCreator(CreateXlaLaunchOp);
   return true;
diff --git a/tensorflow/compiler/jit/create_xla_launch_op.h b/tensorflow/compiler/jit/create_xla_launch_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..98a22e351532c197c69c5ea908305d885fd2c9d0
--- /dev/null
+++ b/tensorflow/compiler/jit/create_xla_launch_op.h
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_JIT_CREATE_XLA_LAUNCH_OP_H_
+#define TENSORFLOW_COMPILER_JIT_CREATE_XLA_LAUNCH_OP_H_
+
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class FunctionLibraryRuntime;
+class OpKernel;
+
+// Given a NodeDef 'node_def' and the function library runtime 'flr', if
+// 'node_def' is a call to a compilable function defined in 'flr', returns OK
+// and fills in 'kernel' with a XlaLaunchOp kernel which computes the
+// node. Otherwise, returns a non-OK.
+Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& node_def,
+                         std::unique_ptr<OpKernel>* kernel);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_CREATE_XLA_LAUNCH_OP_H_
diff --git a/tensorflow/compiler/jit/create_xla_launch_op_test.cc b/tensorflow/compiler/jit/create_xla_launch_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b75ab486b80e098bc0a59f9ea8cdbaa23a28fef9
--- /dev/null
+++ b/tensorflow/compiler/jit/create_xla_launch_op_test.cc
@@ -0,0 +1,145 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/create_xla_launch_op.h"
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+
+NodeDef ToNodeDef(const string& text) {
+  NodeDef node_def;
+  EXPECT_TRUE(protobuf::TextFormat::MergeFromString(text, &node_def));
+  return node_def;
+}
+
+// Create a FunctionDef that takes one resource and one regular param
+FunctionDef XTimesY() {
+  return FunctionDefHelper::Define(
+      // Name
+      "XTimesY",
+      // Args
+      {"x: float", "y: resource"},
+      // Return values
+      {"z: float"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"y0"}, "ReadVariableOp", {"y"}, {{"dtype", DT_FLOAT}}},
+          {{"z"}, "Mul", {"x", "y0"}, {{"T", DT_FLOAT}}},
+      });
+}
+
+class CreateXlaLaunchOpTest : public ::testing::Test {
+ protected:
+  void Init(const std::vector<FunctionDef>& flib) {
+    SessionOptions options;
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", 1});
+    TF_CHECK_OK(DeviceFactory::AddDevices(
+        options, "/job:localhost/replica:0/task:0", &devices_));
+
+    FunctionDefLibrary proto;
+    for (const auto& fdef : flib) {
+      *(proto.add_function()) = fdef;
+    }
+    lib_def_ =
+        MakeUnique<FunctionLibraryDefinition>(OpRegistry::Global(), proto);
+    OptimizerOptions opts;
+    device_mgr_ = MakeUnique<DeviceMgr>(devices_);
+    pflr_ = MakeUnique<ProcessFunctionLibraryRuntime>(
+        device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
+        opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
+    flr_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
+  }
+
+  FunctionLibraryRuntime* flr_;
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<FunctionLibraryDefinition> lib_def_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+
+  std::unique_ptr<OpKernel> kernel_;
+};
+
+AttrValue BoolAttr(bool b) {
+  AttrValue v;
+  v.set_b(b);
+  return v;
+}
+
+TEST_F(CreateXlaLaunchOpTest, OneFloatOneResourceArgument) {
+  FunctionDef fdef = XTimesY();
+  (*fdef.mutable_attr())["_XlaCompile"] = BoolAttr(true);
+  Init({fdef});
+
+  Status status = CreateXlaLaunchOp(
+      flr_, ToNodeDef(R"pb(
+        name: 'XTimesY' op: 'XTimesY' input: 'a' input: 'b'
+      )pb"), &kernel_);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  EXPECT_EQ("XTimesY", kernel_->name());
+  EXPECT_EQ("XTimesY", kernel_->type_string());
+
+  EXPECT_EQ(2, kernel_->num_inputs());
+  EXPECT_EQ(DT_FLOAT, kernel_->input_type(0));
+  EXPECT_EQ(DT_RESOURCE, kernel_->input_type(1));
+  EXPECT_EQ(DEVICE_MEMORY, kernel_->input_memory_types()[0]);
+  EXPECT_EQ(HOST_MEMORY, kernel_->input_memory_types()[1]);
+
+  EXPECT_EQ(1, kernel_->num_outputs());
+  EXPECT_EQ(DT_FLOAT, kernel_->output_type(0));
+  EXPECT_EQ(DEVICE_MEMORY, kernel_->output_memory_types()[0]);
+}
+
+TEST_F(CreateXlaLaunchOpTest, FailsIfXlaCompileAttrNotSet) {
+  FunctionDef fdef = XTimesY();
+  Init({fdef});
+
+  Status status = CreateXlaLaunchOp(flr_, ToNodeDef(R"proto(
+                                      name: 'XTimesY'
+                                      op: 'XTimesY'
+                                      input: 'a'
+                                      input: 'b'
+                                    )proto"), &kernel_);
+  EXPECT_TRUE(errors::IsInvalidArgument(status)) << status.ToString();
+}
+
+TEST_F(CreateXlaLaunchOpTest, FailsIfXlaCompileAttrIsSetToFalse) {
+  FunctionDef fdef = XTimesY();
+  (*fdef.mutable_attr())["_XlaCompile"] = BoolAttr(false);
+  Init({fdef});
+
+  Status status = CreateXlaLaunchOp(flr_, ToNodeDef(R"proto(
+                                      name: 'XTimesY'
+                                      op: 'XTimesY'
+                                      input: 'a'
+                                      input: 'b'
+                                    )proto"), &kernel_);
+  EXPECT_TRUE(errors::IsInvalidArgument(status)) << status.ToString();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
index 34be4409a381197d2191e083727aa8d48ab8cd63..5fee36f022a7515504cb6faa5cca658481b784c5 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
@@ -80,7 +80,7 @@ Status EncapsulateSubgraphsInFunctions(
     std::unique_ptr<Graph>* graph_out, FunctionLibraryDefinition* library);
 
 // The attribute that marks function calls produced by the encapsulate
-// subgraphs pass and that should in turn be compiled via _XlaLaunch operators.
+// subgraphs pass and that should in turn be compiled via XlaLaunch operators.
 extern const char* const kXlaCompiledKernelAttr;
 
 // Does `node` have the kXlaCompiledKernelAttr attribute?
diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.cc b/tensorflow/compiler/jit/graphcycles/graphcycles.cc
index bc68afb322b5cfc814ce0537254ba14053ae4550..805bbc62c1e2e877de87ab8faf3d60b829743df8 100644
--- a/tensorflow/compiler/jit/graphcycles/graphcycles.cc
+++ b/tensorflow/compiler/jit/graphcycles/graphcycles.cc
@@ -354,6 +354,16 @@ bool GraphCycles::IsReachableNonConst(int32 x, int32 y) {
   return reachable;
 }
 
+bool GraphCycles::CanContractEdge(int32 a, int32 b) {
+  CHECK(HasEdge(a, b)) << "No edge exists from " << a << " to " << b;
+  RemoveEdge(a, b);
+  bool reachable = IsReachableNonConst(a, b);
+  // Restore the graph to its original state.
+  InsertEdge(a, b);
+  // If reachable, then contracting edge will cause cycle.
+  return !reachable;
+}
+
 bool GraphCycles::ContractEdge(int32 a, int32 b) {
   CHECK(HasEdge(a, b));
   RemoveEdge(a, b);
@@ -388,4 +398,8 @@ std::unordered_set<int32> GraphCycles::Successors(int32 node) {
   return rep_->nodes_[node]->out;
 }
 
+std::unordered_set<int32> GraphCycles::Predecessors(int32 node) {
+  return rep_->nodes_[node]->in;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.h b/tensorflow/compiler/jit/graphcycles/graphcycles.h
index d11d6e27b1b7bb514127e16a9be21f044100d885..44448fa3d787d0785a797d40ed1b968438a903c9 100644
--- a/tensorflow/compiler/jit/graphcycles/graphcycles.h
+++ b/tensorflow/compiler/jit/graphcycles/graphcycles.h
@@ -85,6 +85,9 @@ class GraphCycles {
   // and returns false.
   bool ContractEdge(int32 a, int32 b);
 
+  // Return true if can contract edge, otherwise return false.
+  bool CanContractEdge(int32 a, int32 b);
+
   // Return whether dest_node is reachable from source_node
   // by following edges.
   bool IsReachable(int32 source_node, int32 dest_node) const;
@@ -115,6 +118,7 @@ class GraphCycles {
   bool CheckInvariants() const;
 
   std::unordered_set<int32> Successors(int32 node);
+  std::unordered_set<int32> Predecessors(int32 node);
 
   // ----------------------------------------------------
   struct Rep;
diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc b/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc
index e47b782207e9122740fe9d5daf1fa0dbaeb47754..274f5938a1228baf68ad4d8e1a7b13f276321d27 100644
--- a/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc
+++ b/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc
@@ -494,6 +494,20 @@ TEST_F(GraphCyclesTest, ContractEdge) {
   EXPECT_TRUE(g_.HasEdge(1, 4));
 }
 
+TEST_F(GraphCyclesTest, CanContractEdge) {
+  ASSERT_TRUE(AddEdge(1, 2));
+  ASSERT_TRUE(AddEdge(1, 3));
+  ASSERT_TRUE(AddEdge(2, 3));
+  ASSERT_TRUE(AddEdge(2, 4));
+  ASSERT_TRUE(AddEdge(3, 4));
+
+  EXPECT_FALSE(g_.CanContractEdge(1, 3));
+  EXPECT_FALSE(g_.CanContractEdge(2, 4));
+  EXPECT_TRUE(g_.CanContractEdge(1, 2));
+  EXPECT_TRUE(g_.CanContractEdge(2, 3));
+  EXPECT_TRUE(g_.CanContractEdge(3, 4));
+}
+
 static void BM_StressTest(int iters, int num_nodes) {
   while (iters > 0) {
     tensorflow::GraphCycles g;
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 049d170fa48928474b894f2d0e1f2243c5f87275..27287e0f9637929b2e04c6a76de19c2785ec357e 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -39,15 +39,15 @@ limitations under the License.
 
 namespace tensorflow {
 
-XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
-    : OpKernel(ctx), device_type_(ctx->device_type()) {
-  const NameAttrList* func;
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("function", &func));
-  function_ = *func;
-  DataTypeVector constant_types;
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("Tconstants", &constant_types));
-  num_constant_args_ = constant_types.size();
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("Nresources", &num_resource_args_));
+XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx,
+                                       const std::vector<int>& constants,
+                                       const std::vector<int>& resources,
+                                       const NameAttrList& function)
+    : OpKernel(ctx),
+      constants_(constants),
+      resources_(resources),
+      device_type_(ctx->device_type()),
+      function_(function) {
   if (device_type_ == DeviceType(DEVICE_CPU)) {
     platform_id_ = se::host::kHostPlatformId;
   } else if (device_type_ == DeviceType(DEVICE_GPU)) {
@@ -57,8 +57,8 @@ XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
   }
 }
 
-Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx,
-                                               XlaCompilationCache** cache) {
+Status XlaLocalLaunchBase::BuildCompilationCache(OpKernelContext* ctx,
+                                                 XlaCompilationCache** cache) {
   const XlaDevice::Metadata* metadata;
   Status s = XlaDevice::GetMetadata(ctx, &metadata);
   if (s.ok()) {
@@ -90,8 +90,8 @@ Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx,
   return Status::OK();
 }
 
-void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
-  VLOG(1) << "XlaLocalLaunchOp::Compute "
+void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
+  VLOG(1) << "XlaLocalLaunchOpBase::Compute "
           << Canonicalize(function_.name(), AttrSlice(&function_.attr()));
   // We store information about the JIT-compiled XLA computation
   // in the ResourceMgr.
@@ -112,7 +112,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   // this is more obviously correct.)
   core::ScopedUnref cache_ref(cache);
 
-  const XlaDevice::Metadata* metadata;
+  const XlaDevice::Metadata* metadata = nullptr;
   Status s = XlaDevice::GetMetadata(ctx, &metadata);
   bool allocate_xla_tensors = s.ok();
 
@@ -124,7 +124,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   }
 
   std::map<int, OptionalTensor> variables =
-      SnapshotResourceVariables(ctx, num_resource_args_);
+      SnapshotResourceVariables(ctx, resources_);
 
   xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client());
 
@@ -153,25 +153,27 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == se::host::kHostPlatformId);
   options.device_allocator = xla_allocator;
-  // TODO(b/77671268): We don't set variable_representation_shape_fn here. This
-  // is restricted to Variables, but we need something like this to apply to
-  // normal Tensors too.
+  if (metadata) {
+    options.shape_representation_fn = metadata->shape_representation_fn();
+  }
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
 
   std::map<int, Tensor> constant_args;
-  for (int i = 0; i < num_constant_args_; ++i) {
+  for (int i : constants_) {
     constant_args.insert({i, ctx->input(i)});
   }
-  OP_REQUIRES_OK(ctx, cache->Compile(options, function_, constant_args,
-                                     variables, ctx, &kernel, &executable,
-                                     /*compile_options=*/nullptr));
+  XlaCompiler::CompileOptions compile_options;
+  compile_options.is_entry_computation = true;
+  OP_REQUIRES_OK(
+      ctx, cache->Compile(options, function_, constant_args, variables, ctx,
+                          &kernel, &executable, &compile_options));
 
   VLOG(1) << "Executing XLA Computation...";
 
-  XlaComputationLaunchContext launch_context(
-      num_resource_args_, client, xla_allocator, allocate_xla_tensors);
+  XlaComputationLaunchContext launch_context(client, xla_allocator,
+                                             allocate_xla_tensors);
   launch_context.PopulateInputs(ctx, kernel, variables);
 
   // Execute the computation.
@@ -194,14 +196,69 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   VLOG(1) << "Done";
 }
 
+namespace {
+
+// OP_REQUIRES_OK_RETURN is the same as OP_REQUIRES_OK except that
+// in error case, it returns RET instead of void.
+#define OP_REQUIRES_OK_RETURN(CTX, RET, ...)                \
+  do {                                                      \
+    ::tensorflow::Status _s(__VA_ARGS__);                   \
+    if (!TF_PREDICT_TRUE(_s.ok())) {                        \
+      (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
+      return RET;                                           \
+    }                                                       \
+  } while (0)
+
+// Helper static functions to construct parameters for
+// XlaLocalLaunchBase constructor from OpKernelConstruction.
+std::vector<int> ConstantsVector(OpKernelConstruction* ctx) {
+  DataTypeVector constant_types;
+  OP_REQUIRES_OK_RETURN(ctx, std::vector<int>(),
+                        ctx->GetAttr("Tconstants", &constant_types));
+  std::vector<int> constants(constant_types.size());
+  std::iota(constants.begin(), constants.end(), 0);
+  return constants;
+}
+
+std::vector<int> ResourcesVector(OpKernelConstruction* ctx) {
+  DataTypeVector constant_types;
+  OP_REQUIRES_OK_RETURN(ctx, std::vector<int>(),
+                        ctx->GetAttr("Tconstants", &constant_types));
+
+  DataTypeVector arg_types;
+  OP_REQUIRES_OK_RETURN(ctx, std::vector<int>(),
+                        ctx->GetAttr("Targs", &arg_types));
+
+  int num_resources;
+  OP_REQUIRES_OK_RETURN(ctx, std::vector<int>(),
+                        ctx->GetAttr("Nresources", &num_resources));
+
+  std::vector<int> resources(num_resources);
+  std::iota(resources.begin(), resources.end(),
+            constant_types.size() + arg_types.size());
+  return resources;
+}
+
+NameAttrList FunctionAttr(OpKernelConstruction* ctx) {
+  const NameAttrList* func;
+  OP_REQUIRES_OK_RETURN(ctx, NameAttrList(), ctx->GetAttr("function", &func));
+  return *func;
+}
+
+#undef OP_REQUIRES_OK_RETURN
+}  // namespace
+
+XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
+    : XlaLocalLaunchBase(ctx, ConstantsVector(ctx), ResourcesVector(ctx),
+                         FunctionAttr(ctx)) {}
+
 XlaLocalLaunchOp::~XlaLocalLaunchOp() {
   VLOG(1) << "XlaLocalLaunchOp destroyed";
 }
 
-REGISTER_KERNEL_BUILDER(Name("_XlaLaunch").Device(DEVICE_CPU),
-                        XlaLocalLaunchOp);
+REGISTER_KERNEL_BUILDER(Name("XlaLaunch").Device(DEVICE_CPU), XlaLocalLaunchOp);
 
-REGISTER_KERNEL_BUILDER(Name("_XlaLaunch")
+REGISTER_KERNEL_BUILDER(Name("XlaLaunch")
                             .Device(DEVICE_GPU)
                             .HostMemory("constants")
                             .HostMemory("resources"),
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.h b/tensorflow/compiler/jit/kernels/xla_launch_op.h
index 8f8e646f0ff6d94dfdf56721cacfce7fa658beb6..8dfc4b382d51151b6383fe7dd75429f3124d39be 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.h
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.h
@@ -26,6 +26,41 @@ limitations under the License.
 
 namespace tensorflow {
 
+// XlaLocalLaunchBase is almost the same as XlaLocalLaunchOp.
+// The only difference is that it does not require arguments to follow
+// the "constants, then regular args, then resources" order.
+// It takes vectors of constant and resource arguments explicitly.
+// It does not have corresponding OpDef because it is never present
+// in the GraphDef.
+// Currently, it is used by eager runtime. FunctionLibraryRuntime creates
+// this kernel when asked to create a kernel for an XLA-compiled function.
+class XlaLocalLaunchBase : public OpKernel {
+ public:
+  XlaLocalLaunchBase(OpKernelConstruction* ctx,
+                     const std::vector<int>& constants,
+                     const std::vector<int>& resources,
+                     const NameAttrList& function);
+  XlaLocalLaunchBase(const XlaLocalLaunchBase&) = delete;
+  XlaLocalLaunchBase& operator=(const XlaLocalLaunchBase&) = delete;
+  ~XlaLocalLaunchBase() override = default;
+
+  void Compute(OpKernelContext* ctx) override;
+
+ protected:
+  // Builds a XlaCompilationCache class suitable for the current device.
+  Status BuildCompilationCache(OpKernelContext* ctx,
+                               XlaCompilationCache** cache);
+
+  // Indexes of compile-time constant inputs
+  std::vector<int> constants_;
+  // Indexes of resource inputs
+  std::vector<int> resources_;
+
+  DeviceType device_type_;
+  NameAttrList function_;
+  se::Platform::Id platform_id_;
+};
+
 // XlaLocalLaunchOp is used to replace a region of the TensorFlow graph
 // which will be compiled and executed using XLA.  The XlaLocalLaunchOp is
 // responsible for handling interactions with the TensorFlow executor.
@@ -35,26 +70,12 @@ namespace tensorflow {
 // XlaLocalLaunchOp uses xla::LocalClient::Compile() and
 // xla::LocalExecutable::Run(), and passes arguments into/out of XLA in device
 // memory.
-class XlaLocalLaunchOp : public OpKernel {
+class XlaLocalLaunchOp : public XlaLocalLaunchBase {
  public:
   explicit XlaLocalLaunchOp(OpKernelConstruction* ctx);
   ~XlaLocalLaunchOp() override;
 
-  void Compute(OpKernelContext* ctx) override;
-
  private:
-  // Builds a XlaCompilationCache class suitable for the current device.
-  Status BuildCompilationCache(OpKernelContext* ctx,
-                               XlaCompilationCache** compiler);
-
-  DeviceType device_type_;
-  NameAttrList function_;
-  int num_constant_args_;
-  // Number of resource variable arguments.
-  int num_resource_args_;
-
-  se::Platform::Id platform_id_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalLaunchOp);
 };
 
diff --git a/tensorflow/compiler/jit/ops/xla_ops.cc b/tensorflow/compiler/jit/ops/xla_ops.cc
index 07320b43dab790e6cda5e85688bdacf48a35adc4..f2473d98ffd5dae55983e601b8d2d65af6a6d54c 100644
--- a/tensorflow/compiler/jit/ops/xla_ops.cc
+++ b/tensorflow/compiler/jit/ops/xla_ops.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER_OP("_XlaLaunch")
+REGISTER_OP("XlaLaunch")
     .Input("constants: Tconstants")
     .Attr("Tconstants: list(type) >= 0")
     .Input("args: Targs")
@@ -28,7 +28,7 @@ REGISTER_OP("_XlaLaunch")
     .Attr("Tresults: list(type) >= 0")
     .Attr("function: func")
     // XLA random-number generation ops are stateful.
-    // TODO(phawkins): create stateful and non-stateful variants of _XlaLaunch.
+    // TODO(phawkins): create stateful and non-stateful variants of XlaLaunch.
     .SetIsStateful()
     .Doc("XLA Launch Op. For use by the XLA JIT only.");
 
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 6430975335f5eef5b53c80213e6090ffd6166a91..7ed609c43748062656b631243c01d790519c54fd 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -122,8 +122,7 @@ Status XlaCompilationCache::BuildSignature(
 
 namespace {
 
-// Builds a XlaCompiler::Argument vector from the arguments to the _XlaLaunch
-// op.
+// Builds a XlaCompiler::Argument vector from the arguments to the XlaLaunch op.
 Status BuildArguments(const std::map<int, Tensor>& constant_args,
                       const std::map<int, OptionalTensor>& variable_args,
                       OpKernelContext* ctx,
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index 60458f6f3314b2c3b65be1c90e051b2a670383bc..ab644ff5a61c407b246b97af5328bf5cd8c1893b 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -48,13 +48,12 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
                                  const XlaCompiler::CompilationResult* result,
                                  xla::LocalExecutable* executable) {
   std::map<int, OptionalTensor> variables = GetVariables(ctx);
-  int64 num_resource_args = variables.size();
 
   xla::LocalClient* client = metadata.client();
 
   // Builds an XLA allocator for the device.
   XlaComputationLaunchContext launch_context(
-      num_resource_args, client, client->backend().memory_allocator(), true);
+      client, client->backend().memory_allocator(), true);
 
   launch_context.PopulateInputs(ctx, result, variables);
 
@@ -157,11 +156,14 @@ Status XlaCompileOnDemandOp::Compile(
   options.client = metadata.client();
   options.flib_def =
       new FunctionLibraryDefinition(OpRegistry::Global(), FunctionDefLibrary{});
+  options.shape_representation_fn = metadata.shape_representation_fn();
+
+  XlaCompiler::CompileOptions compile_options;
+  compile_options.is_entry_computation = true;
 
   std::map<int, OptionalTensor> variable_args = GetVariables(ctx);
   return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx,
-                                result, executable,
-                                /*compile_options=*/nullptr);
+                                result, executable, &compile_options);
 }
 
 void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) {
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.h b/tensorflow/compiler/jit/xla_compile_on_demand_op.h
index 23c6f3903f841a6c39104983c6f7f409757a7319..7cc3d0e007ba2974fbfbe6fbabc4aa08f9fa910f 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.h
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.h
@@ -29,11 +29,8 @@ limitations under the License.
 namespace tensorflow {
 
 // An OpKernel that compiles an op to an XLA computation and runs it. Unlike
-// _XlaLaunch this doesn't rely on any rewrites of the graphdef - it will run a
+// XlaLaunch this doesn't rely on any rewrites of the graphdef - it will run a
 // vanilla TensorFlow op as long as the bridge supports it.
-//
-// Importantly _XlaLaunch assumes all input and output tensors are on the host,
-// whereas XlacompileOnDemandOp works with tensors in device memory.
 class XlaCompileOnDemandOp : public OpKernel {
  public:
   explicit XlaCompileOnDemandOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index bc07dbd7bdf005fde781f7a1e6775080e363abfb..ea9e0366043a4a64bfe43703c55d4470693bbac8 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -50,10 +50,11 @@ Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& options,
   (void)registrations;
 
   std::unique_ptr<XlaDevice> device;
-  TF_RETURN_IF_ERROR(XlaDevice::Create("Host", DEVICE_XLA_CPU, 0,
-                                       DEVICE_CPU_XLA_JIT, options, name_prefix,
-                                       registration,
-                                       /*transfer_as_literal=*/false, &device));
+  TF_RETURN_IF_ERROR(
+      XlaDevice::Create("Host", DEVICE_XLA_CPU, 0, DEVICE_CPU_XLA_JIT, options,
+                        name_prefix, registration,
+                        /*transfer_as_literal=*/false,
+                        /*shape_representation_fn=*/{}, &device));
   devices->push_back(device.release());
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 70263b1ff936757101a3c47d192b2ba58271dc79..f13b46c532e6008477849f2e06887901c90038ab 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <stdlib.h>
 #include <unordered_set>
 
-#include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device_context.h"
@@ -49,6 +48,7 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
 namespace tensorflow {
@@ -110,7 +110,9 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
     const string& jit_device_name, const SessionOptions& options,
     const string& name_prefix,
     const XlaOpRegistry::DeviceRegistration& registration,
-    bool transfer_as_literal, std::unique_ptr<XlaDevice>* device) {
+    bool transfer_as_literal,
+    const XlaCompiler::ShapeRepresentationFn& shape_representation_fn,
+    std::unique_ptr<XlaDevice>* device) {
   VLOG(1) << "XlaDevice::Create " << platform_name << " " << device_name << ":"
           << device_ordinal;
 
@@ -129,17 +131,19 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
       DeviceType(device_name), Bytes(16ULL << 30), DeviceLocality(),
       strings::StrCat("device: ", device_name, " device"));
 
-  device->reset(new XlaDevice(options, attrs, device_ordinal,
-                              DeviceType(jit_device_name),
-                              platform.ValueOrDie(), transfer_as_literal));
+  device->reset(new XlaDevice(
+      options, attrs, device_ordinal, DeviceType(jit_device_name),
+      platform.ValueOrDie(), transfer_as_literal, shape_representation_fn));
   return Status::OK();
 }
 
-XlaDevice::Metadata::Metadata(int device_ordinal, se::Platform* platform,
-                              const DeviceType& device_type)
+XlaDevice::Metadata::Metadata(
+    int device_ordinal, se::Platform* platform, const DeviceType& device_type,
+    XlaCompiler::ShapeRepresentationFn shape_representation_fn)
     : device_ordinal_(device_ordinal),
       device_type_(device_type),
-      platform_(platform) {}
+      platform_(platform),
+      shape_representation_fn_(std::move(shape_representation_fn)) {}
 
 int XlaDevice::Metadata::device_ordinal() const { return device_ordinal_; }
 
@@ -170,17 +174,20 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const {
   return Status::OK();
 }
 
-XlaDevice::XlaDevice(const SessionOptions& options,
-                     const DeviceAttributes& attrs, int device_ordinal,
-                     const DeviceType& jit_device_name, se::Platform* platform,
-                     bool transfer_as_literal)
+XlaDevice::XlaDevice(
+    const SessionOptions& options, const DeviceAttributes& attrs,
+    int device_ordinal, const DeviceType& jit_device_name,
+    se::Platform* platform, bool transfer_as_literal,
+    const XlaCompiler::ShapeRepresentationFn& shape_representation_fn)
     : LocalDevice(options, attrs),
-      xla_metadata_(device_ordinal, platform, jit_device_name),
+      xla_metadata_(device_ordinal, platform, jit_device_name,
+                    shape_representation_fn),
       device_ordinal_(device_ordinal),
       jit_device_name_(jit_device_name),
       xla_allocator_(nullptr),
       platform_(platform),
-      transfer_as_literal_(transfer_as_literal) {
+      transfer_as_literal_(transfer_as_literal),
+      shape_representation_fn_(shape_representation_fn) {
   VLOG(1) << "Created XLA device " << jit_device_name;
 }
 
@@ -230,10 +237,10 @@ Status XlaDevice::CreateAndSetGpuDeviceInfo() {
     GetAllocator({});
     // XlaDevice owns both gpu_device_info_ and
     // gpu_device_info_->default_context.
-    gpu_device_info_ = absl::make_unique<GpuDeviceInfo>();
+    gpu_device_info_ = MakeUnique<GpuDeviceInfo>();
     gpu_device_info_->stream = stream;
-    gpu_device_info_->default_context =
-        new XlaDeviceContext(stream, client(), transfer_as_literal_);
+    gpu_device_info_->default_context = new XlaDeviceContext(
+        stream, client(), transfer_as_literal_, shape_representation_fn_);
     set_tensorflow_gpu_device_info(gpu_device_info_.get());
   }
 
@@ -247,7 +254,8 @@ Status XlaDevice::FillContextMap(const Graph* graph,
   TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
   // Call GetAllocator for the side-effect of ensuring the allocator is created.
   GetAllocator({});
-  auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_);
+  auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_,
+                                  shape_representation_fn_);
   for (Node* n : graph->nodes()) {
     VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name();
     ctx->Ref();
@@ -294,7 +302,8 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
     Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape());
     Notification n;
     TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
-    XlaTransferManager manager(stream, client(), transfer_as_literal_);
+    XlaTransferManager manager(stream, client(), transfer_as_literal_,
+                               shape_representation_fn_);
     manager.CopyCPUTensorToDevice(&parsed, this, &copy,
                                   [&n, &status](const Status& s) {
                                     status = s;
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 3ae87308cc7cffa916e178893df70a3f314b11b0..d5d345d43b16c43c7a202791b2604b39d29e8cdb 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -17,8 +17,7 @@ limitations under the License.
 // runtime.
 //
 // Operators assigned to an XlaDevice are compiled into XLA computations.
-// Tensors on an XlaDevice are thin wrappers around XLA GlobalDataHandles; state
-// is managed by XLA.
+// Tensors on an XlaDevice are thin wrappers around XLA ScopedShapedBuffers.
 //
 // XlaDevice is instantiated separately for each XLA backend (e.g., CPU or GPU),
 // under different names (e.g., XLA_CPU or XLA_GPU).
@@ -27,6 +26,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_
 
 #include "tensorflow/compiler/jit/xla_tensor.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
@@ -50,7 +50,8 @@ class XlaDevice : public LocalDevice {
   class Metadata {
    public:
     Metadata(int device_ordinal, se::Platform* platform,
-             const DeviceType& device_type);
+             const DeviceType& device_type,
+             XlaCompiler::ShapeRepresentationFn shape_representation_fn);
 
     // The index of the device on this host.
     int device_ordinal() const;
@@ -58,11 +59,15 @@ class XlaDevice : public LocalDevice {
     se::Platform* platform() const;
     xla::LocalClient* client() const;
     const DeviceType& jit_device_type() const;
+    const XlaCompiler::ShapeRepresentationFn& shape_representation_fn() const {
+      return shape_representation_fn_;
+    }
 
    private:
     const int device_ordinal_;
     const DeviceType device_type_;
     se::Platform* platform_;  // Not owned.
+    XlaCompiler::ShapeRepresentationFn shape_representation_fn_;
 
     TF_DISALLOW_COPY_AND_ASSIGN(Metadata);
   };
@@ -76,16 +81,19 @@ class XlaDevice : public LocalDevice {
   // 'transfer_as_literal' is true if device<->host transfers must be done using
   // XLA's TransferLiteral{To,From}Device interface. If false, we can use
   // ThenMemcpy instead.
-  static Status Create(const string& platform_name, const string& device_name,
-                       int device_ordinal, const string& jit_device_name,
-                       const SessionOptions& options, const string& name_prefix,
-                       const XlaOpRegistry::DeviceRegistration& registration,
-                       bool transfer_as_literal,
-                       std::unique_ptr<XlaDevice>* device);
+  static Status Create(
+      const string& platform_name, const string& device_name,
+      int device_ordinal, const string& jit_device_name,
+      const SessionOptions& options, const string& name_prefix,
+      const XlaOpRegistry::DeviceRegistration& registration,
+      bool transfer_as_literal,
+      const XlaCompiler::ShapeRepresentationFn& shape_representation_fn,
+      std::unique_ptr<XlaDevice>* device);
 
   XlaDevice(const SessionOptions& options, const DeviceAttributes& attrs,
             int device_ordinal, const DeviceType& jit_device_name,
-            se::Platform* platform, bool transfer_as_literal);
+            se::Platform* platform, bool transfer_as_literal,
+            const XlaCompiler::ShapeRepresentationFn& shape_representation_fn);
   ~XlaDevice() override;
 
   Allocator* GetAllocator(AllocatorAttributes attr) override;
@@ -116,8 +124,8 @@ class XlaDevice : public LocalDevice {
   // The name of the device that is used to compile Ops for this XlaDevice.
   DeviceType jit_device_name_;
   // Memory allocator associated with this device.
-  Allocator* xla_allocator_;                   // Not owned.
-  se::Platform* platform_;                     // Not owned.
+  Allocator* xla_allocator_;  // Not owned.
+  se::Platform* platform_;    // Not owned.
   // Stream associated with this device. Operations enqueued on this
   // stream are executed on the device. Operations include data
   // copying back and forth between CPU and the device, and
@@ -126,6 +134,7 @@ class XlaDevice : public LocalDevice {
   // Must we use XLA's transfer manager for correct host<->device transfers? if
   // false, we can use ThenMemcpy() instead.
   bool transfer_as_literal_;
+  XlaCompiler::ShapeRepresentationFn shape_representation_fn_;
 
   // If set, holds default device context (that we must Unref)
   // and its stream.
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index bf8c1886a022310eeaacdf69463f575a393dd8d0..ff30b62bad782f281bcd25275521ed8b0c4c0bfd 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -47,13 +47,14 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
 
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
 
-XlaTransferManager::XlaTransferManager(se::Stream* stream,
-                                       xla::LocalClient* client,
-                                       bool transfer_as_literal)
+XlaTransferManager::XlaTransferManager(
+    se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal,
+    XlaCompiler::ShapeRepresentationFn shape_representation_fn)
     : stream_(stream),
       client_(client),
       transfer_manager_(client->backend().transfer_manager()),
-      transfer_as_literal_(transfer_as_literal) {}
+      transfer_as_literal_(transfer_as_literal),
+      shape_representation_fn_(std::move(shape_representation_fn)) {}
 
 Status XlaTransferManager::TransferLiteralToDevice(
     const Tensor& host_tensor, Tensor* device_tensor) const {
@@ -76,7 +77,15 @@ Status XlaTransferManager::TransferLiteralFromDevice(
                       transfer_manager_->TransferLiteralFromDevice(
                           stream_->parent(), shaped_buffer));
   VLOG(1) << "Transfer from device as literal: " << literal->ToString();
-  return LiteralToHostTensor(*literal, host_tensor->dtype(), host_tensor);
+  Tensor tensor;
+  TF_RETURN_IF_ERROR(
+      LiteralToHostTensor(*literal, host_tensor->dtype(), &tensor));
+  // Reshape the tensor back to its declared shape.
+  if (!host_tensor->CopyFrom(tensor, device_tensor.shape())) {
+    return errors::Internal(
+        "Tensor::CopyFrom failed when copying from XLA device to CPU");
+  }
+  return Status::OK();
 }
 
 void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
@@ -96,9 +105,17 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
 
     XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
     CHECK(xla_tensor);
+
+    TensorShape shape;
+    if (shape_representation_fn_) {
+      shape = shape_representation_fn_(device_tensor->shape(),
+                                       device_tensor->dtype());
+    } else {
+      shape = device_tensor->shape();
+    }
     if (!xla_tensor->has_shaped_buffer()) {
       Status s = xla_tensor->AllocateShapedBuffer(
-          device_tensor->dtype(), device_tensor->shape(), client_,
+          device_tensor->dtype(), shape, client_,
           stream_->parent()->device_ordinal());
       if (!s.ok()) {
         done(s);
@@ -106,12 +123,18 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
       }
     }
 
-    se::DeviceMemoryBase dev_dst_ptr =
-        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
     Status status;
     if (transfer_as_literal_) {
-      status = TransferLiteralToDevice(*cpu_tensor, device_tensor);
+      Tensor reshaped_cpu_tensor;
+      if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) {
+        done(errors::Internal(
+            "Tensor::CopyFrom failed when copying from CPU to XLA device"));
+        return;
+      }
+      status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
     } else {
+      se::DeviceMemoryBase dev_dst_ptr =
+          XlaTensor::DeviceMemoryFromTensor(*device_tensor);
       stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
       // TODO(hpucha): Make this asynchronous.
       Status block_status = stream_->BlockHostUntilDone();
@@ -171,9 +194,11 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
   done(Status::OK());
 }
 
-XlaDeviceContext::XlaDeviceContext(se::Stream* stream, xla::LocalClient* client,
-                                   bool transfer_as_literal)
-    : manager_(stream, client, transfer_as_literal) {}
+XlaDeviceContext::XlaDeviceContext(
+    se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal,
+    XlaCompiler::ShapeRepresentationFn shape_representation_fn)
+    : manager_(stream, client, transfer_as_literal,
+               std::move(shape_representation_fn)) {}
 
 void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                              Device* device,
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index d7f5f1d208989256f8043d2e6d93cf9bd89333b2..9af9655868448ce5116db3611c5f88339135947e 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/jit/xla_tensor.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -45,8 +46,9 @@ class XlaDeviceAllocator : public Allocator {
 // Helper class for managing data transfers between host and XLA devices.
 class XlaTransferManager {
  public:
-  explicit XlaTransferManager(se::Stream* stream, xla::LocalClient* client,
-                              bool transfer_as_literal);
+  explicit XlaTransferManager(
+      se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal,
+      XlaCompiler::ShapeRepresentationFn shape_representation_fn);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                              Tensor* device_tensor, StatusCallback done) const;
@@ -69,7 +71,8 @@ class XlaTransferManager {
   // Transfer manager, for marshalling data to and from the device.
   xla::TransferManager* transfer_manager_;
   // True if we must use XLA's TransferManager for correct device transfers.
-  bool transfer_as_literal_;
+  const bool transfer_as_literal_;
+  const XlaCompiler::ShapeRepresentationFn shape_representation_fn_;
 };
 
 // DeviceContext for operators assigned to XlaDevice devices. The
@@ -77,8 +80,9 @@ class XlaTransferManager {
 // wraps the methods in XlaTransferManager.
 class XlaDeviceContext : public DeviceContext {
  public:
-  explicit XlaDeviceContext(se::Stream* stream, xla::LocalClient* client,
-                            bool transfer_as_literal);
+  explicit XlaDeviceContext(
+      se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal,
+      XlaCompiler::ShapeRepresentationFn shape_representation_fn);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                              Tensor* device_tensor,
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index 498d25cf566a91f68e5eb1ac312e17900471aeca..9c00a0682ccdc08e7bb09e32d32f01e87e7aaf8d 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/cast_op.h"
 #include "tensorflow/core/kernels/constant_op.h"
 #include "tensorflow/core/kernels/control_flow_ops.h"
+#include "tensorflow/core/kernels/identity_n_op.h"
 #include "tensorflow/core/kernels/identity_op.h"
 #include "tensorflow/core/kernels/no_op.h"
 #include "tensorflow/core/kernels/sendrecv_ops.h"
@@ -32,7 +33,7 @@ namespace tensorflow {
 
 // Dummy OpKernel, used for kernels assigned to an XLA device that should be
 // compiled. Should never be called at runtime since such ops should be
-// rewritten to a _XlaLaunch op. If it is called, it means the placer placed an
+// rewritten to a XlaLaunch op. If it is called, it means the placer placed an
 // operator on an XLA device but the compiler did not compile it.
 class XlaDeviceDummyOp : public OpKernel {
  public:
@@ -41,7 +42,7 @@ class XlaDeviceDummyOp : public OpKernel {
 };
 
 #define REGISTER_XLA_LAUNCH_KERNEL(DEVICE, KERNEL, TYPES) \
-  REGISTER_KERNEL_BUILDER(Name("_XlaLaunch")              \
+  REGISTER_KERNEL_BUILDER(Name("XlaLaunch")               \
                               .Device(DEVICE)             \
                               .HostMemory("constants")    \
                               .HostMemory("resources"),   \
@@ -63,6 +64,9 @@ class XlaDeviceDummyOp : public OpKernel {
       ConstantOp);                                                             \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("Identity").Device(DEVICE).TypeConstraint("T", TYPES), IdentityOp); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("IdentityN").Device(DEVICE).TypeConstraint("T", TYPES),             \
+      IdentityNOp);                                                            \
   REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE), PlaceholderOp);  \
   REGISTER_KERNEL_BUILDER(Name("PlaceholderV2").Device(DEVICE),                \
                           PlaceholderOp);                                      \
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index a8afbf9dcd736bb292b7c5f52c7cce2b47fb85b6..26842fbe5cc110fa9ce7a2767d245484fd67556d 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -48,7 +48,8 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options,
   Status status =
       XlaDevice::Create("CUDA", DEVICE_XLA_GPU, 0, DEVICE_GPU_XLA_JIT, options,
                         name_prefix, registration,
-                        /*transfer_as_literal=*/false, &device);
+                        /*transfer_as_literal=*/false,
+                        /*shape_representation_fn=*/{}, &device);
   if (!status.ok()) {
     // Treat failures as non-fatal; there might not be a GPU in the machine.
     VLOG(1) << "Failed to create XLA_GPU device: " << status;
diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc
index 9e098c46f422b436c722bb909dc58930ab7c0ef6..4146996f6346446e715ffb225882cfb20359dae1 100644
--- a/tensorflow/compiler/jit/xla_interpreter_device.cc
+++ b/tensorflow/compiler/jit/xla_interpreter_device.cc
@@ -48,10 +48,11 @@ Status XlaInterpreterDeviceFactory::CreateDevices(
   registration.compile_resource_ops = true;
 
   std::unique_ptr<XlaDevice> device;
-  TF_RETURN_IF_ERROR(XlaDevice::Create("Interpreter", DEVICE_XLA_INTERPRETER, 0,
-                                       DEVICE_INTERPRETER_XLA_JIT, options,
-                                       name_prefix, registration,
-                                       /*transfer_as_literal=*/false, &device));
+  TF_RETURN_IF_ERROR(XlaDevice::Create(
+      "Interpreter", DEVICE_XLA_INTERPRETER, 0, DEVICE_INTERPRETER_XLA_JIT,
+      options, name_prefix, registration,
+      /*transfer_as_literal=*/false,
+      /*shape_representation_fn=*/{}, &device));
   devices->push_back(device.release());
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 33e53612b91315349cc7ac276021150d701ccdf3..d0c7a9365125708b2af43f87c7617d8d84050a61 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -38,14 +38,13 @@ using xla::ScopedShapedBuffer;
 using xla::ShapedBuffer;
 }  // anonymous namespace
 
-std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
-                                                        int num_variables) {
+std::map<int, OptionalTensor> SnapshotResourceVariables(
+    OpKernelContext* ctx, const std::vector<int>& variables) {
   std::map<int, OptionalTensor> snapshot;
-  int first_variable = ctx->num_inputs() - num_variables;
-  for (int i = 0; i < num_variables; ++i) {
+  for (int i : variables) {
     Var* variable = nullptr;
-    ResourceHandle handle = HandleFromInput(ctx, first_variable + i);
-    OptionalTensor& tensor = snapshot[first_variable + i];
+    ResourceHandle handle = HandleFromInput(ctx, i);
+    OptionalTensor& tensor = snapshot[i];
     if (LookupResource(ctx, handle, &variable).ok()) {
       tf_shared_lock lock(*variable->mu());
       tensor.name = handle.name();
@@ -61,19 +60,22 @@ XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped)
 
 XlaAllocator::~XlaAllocator() {}
 
-xla::StatusOr<se::DeviceMemoryBase> XlaAllocator::Allocate(
+xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
-  void* data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size);
+  AllocationAttributes attrs;
+  attrs.no_retry_on_failure = !retry_on_failure;
+  void* data =
+      wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size, attrs);
   if (data == nullptr) {
     return errors::ResourceExhausted("Out of memory while trying to allocate ",
                                      size, " bytes.");
-  } else {
-    return se::DeviceMemoryBase(data, size);
   }
+  return xla::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
+                                 device_ordinal, this);
 }
 
-Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) {
-  wrapped_->DeallocateRaw(mem->opaque());
+Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
+  wrapped_->DeallocateRaw(mem.opaque());
   return Status::OK();
 }
 
@@ -112,10 +114,9 @@ ScopedShapedBuffer ExtractSubShapedBuffer(
 using internal::ExtractSubShapedBuffer;
 
 XlaComputationLaunchContext::XlaComputationLaunchContext(
-    int64 num_resource_args, xla::LocalClient* client,
-    xla::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors)
-    : num_resource_args_(num_resource_args),
-      client_(client),
+    xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator,
+    bool allocate_xla_tensors)
+    : client_(client),
       xla_allocator_(xla_allocator),
       allocate_xla_tensors_(allocate_xla_tensors) {}
 
@@ -194,11 +195,6 @@ void XlaComputationLaunchContext::PopulateOutputs(
 
         OP_REQUIRES_OK(
             ctx, ctx->allocate_output(i, const_tensor.shape(), &output_tensor));
-        if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) {
-          OP_REQUIRES_OK(ctx, xla_tensor->AllocateShapedBuffer(
-                                  const_tensor.dtype(), const_tensor.shape(),
-                                  client_, stream->parent()->device_ordinal()));
-        }
 
         Device* device = dynamic_cast<Device*>(ctx->device());
         OP_REQUIRES(ctx, device != nullptr,
@@ -240,7 +236,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
       } else {
         Tensor output_tensor = XlaTensorBuffer::MakeTensor(
             ctx->expected_output_dtype(i), shape, buffer, allocator);
-        output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num});
+        output.set_buffer(xla::OwningDeviceMemory(), {output_num});
         ctx->set_output(i, output_tensor);
       }
       ++output_num;
@@ -290,7 +286,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
     } else {
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           write.type, write.shape, buffer, allocator);
-      output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num});
+      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
       *variable->tensor() = output_tensor;
     }
     ++output_num;
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 38291b0bd429b2b4f7939b2ec84213380f23d8bc..4390701ccbd0bc3971413ddcd917c11019990087 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -22,6 +22,8 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
@@ -31,15 +33,17 @@ limitations under the License.
 namespace tensorflow {
 class XlaAllocator;
 
-// Takes a snapshot of the values of resource variable arguments, which are
-// the last `num_variables` arguments. We snapshot tensors that back
+// Takes a snapshot of the values of resource variable arguments, whose
+// indices are specified in `variables` argument. We snapshot tensors that back
 // resource variables since concurrent updates may modify the shape, and it is
 // important that the shapes used for compilation match the true shapes of the
 // buffers.
 //
-// Returns a map of TensorFlow argument index to resource variable.
-std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
-                                                        int num_variables);
+// Returns a map of TensorFlow argument index to resource variable. If a
+// resource variable is not initialized, the corresponding OptionalTensor
+// will have its `present` field set to false.
+std::map<int, OptionalTensor> SnapshotResourceVariables(
+    OpKernelContext* ctx, const std::vector<int>& variables);
 
 // Adapter class that wraps a Tensorflow allocator as an XLA allocator.
 // Assumes that the Tensorflow allocator permits asynchronous deallocation:
@@ -48,9 +52,9 @@ class XlaAllocator : public xla::DeviceMemoryAllocator {
  public:
   XlaAllocator(const se::Platform* platform, Allocator* wrapped);
   ~XlaAllocator() override;
-  xla::StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
-                                               bool retry_on_failure) override;
-  Status Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) override;
+  xla::StatusOr<xla::OwningDeviceMemory> Allocate(
+      int device_ordinal, uint64 size, bool retry_on_failure) override;
+  Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
 
   // The Tensorflow BFC allocator used on GPU allows host-side deallocation
   // before GPU execution takes place. Tensorflow uses the ordering of the main
@@ -72,7 +76,7 @@ class XlaComputationLaunchContext {
   // Create a new launch context. 'allocate_xla_tensors' is true if allocated
   // output tensors and variables are always XlaTensors. If false they are
   // assumed to be "normal" device pointers.
-  XlaComputationLaunchContext(int64 num_resource_args, xla::LocalClient* client,
+  XlaComputationLaunchContext(xla::LocalClient* client,
                               xla::DeviceMemoryAllocator* xla_allocator,
                               bool allocate_xla_tensors);
 
@@ -92,7 +96,6 @@ class XlaComputationLaunchContext {
   const std::vector<xla::ShapedBuffer*>& arguments() const { return arg_ptrs_; }
 
  private:
-  int64 num_resource_args_;
   xla::LocalClient* client_;
   xla::DeviceMemoryAllocator* xla_allocator_;
   bool allocate_xla_tensors_;
diff --git a/tensorflow/compiler/jit/xla_launch_util_test.cc b/tensorflow/compiler/jit/xla_launch_util_test.cc
index 27813efc0bc0aecdbea2dfce5ca27ba704ea45e2..a45932403ec1760d6b985d5357fd6d84fbf257a2 100644
--- a/tensorflow/compiler/jit/xla_launch_util_test.cc
+++ b/tensorflow/compiler/jit/xla_launch_util_test.cc
@@ -36,9 +36,9 @@ void BM_ExtractSubBuffer(int iters, int depth, int fan_out) {
   for (int i = 0; i < iters; ++i) {
     // Extract a buffer from approximately the middle of the first level of the
     // tree.
-    tensorflow::internal::ExtractSubShapedBuffer(&shaped_buffer,
-                                                 /*index=*/fan_out / 2,
-                                                 /*allocator=*/nullptr)
+    (void)tensorflow::internal::ExtractSubShapedBuffer(&shaped_buffer,
+                                                       /*index=*/fan_out / 2,
+                                                       /*allocator=*/nullptr)
         .release();
   }
 }
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index ce6456880bc1b3bc15ac0ef4bae35a83771098ef..a7211c9c7e281a8141d5671b345c628441b2359d 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -52,20 +52,22 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
       client->backend().transfer_manager()->HostShapeToDeviceShape(
           on_host_shape);
 
-  xla::ShapedBuffer buffer(on_host_shape, on_device_shape, client->platform(),
-                           device_ordinal);
-  for (auto& index_to_buffer : buffer.buffers()) {
+  xla::ScopedShapedBuffer shaped_buffer(on_host_shape, on_device_shape,
+                                        client->backend().memory_allocator(),
+                                        device_ordinal);
+  for (auto& index_to_buffer : shaped_buffer.buffers()) {
     xla::Shape subshape =
         xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
     uint64 size =
         client->backend().transfer_manager()->GetByteSizeRequirement(subshape);
-    TF_ASSIGN_OR_RETURN(index_to_buffer.second,
+    TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer,
                         client->backend().memory_allocator()->Allocate(
                             device_ordinal, size, /*retry_on_failure=*/false));
+    // Move our buffer into shaped_buffer, which takes ownership of it.
+    index_to_buffer.second = buffer.Forget();
   }
 
-  set_shaped_buffer(xla::ScopedShapedBuffer(
-      std::move(buffer), client->backend().memory_allocator()));
+  set_shaped_buffer(std::move(shaped_buffer));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
index 922a91897312096e4bb6ee2a1cc153e0039e2c7a..6b29c82ec11e39ad525663991e179443c2b6dca7 100644
--- a/tensorflow/compiler/jit/xla_tensor.h
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -54,7 +54,7 @@ class XlaTensor {
   // Some Tensors can have complex on-device shapes, including tuple shapes. To
   // manage the memory for these tensors a ShapedBuffer may be required.
 
-  // Return true if this TensorInfo contains a ShapedBuffer.
+  // Return true if this XlaTensor contains a ShapedBuffer.
   bool has_shaped_buffer() const { return shaped_buffer_ != nullptr; }
   // Return the contained ShapedBuffer.
   // REQUIRES: has_shaped_buffer()
@@ -62,7 +62,7 @@ class XlaTensor {
     CHECK(has_shaped_buffer());
     return *shaped_buffer_;
   }
-  // Mutates the TensorInfo to set the ShapedBuffer.
+  // Mutates the XlaTensor to set the ShapedBuffer.
   void set_shaped_buffer(xla::ScopedShapedBuffer shaped_buffer) {
     shaped_buffer_ =
         xla::MakeUnique<xla::ScopedShapedBuffer>(std::move(shaped_buffer));
@@ -72,7 +72,7 @@ class XlaTensor {
   // in on-demand mode to avoid re-copying values from the device if we know the
   // host value already.
 
-  // Return true if this TensorInfo contains a host tensor.
+  // Return true if this XlaTensor contains a host tensor.
   bool has_host_tensor() const { return host_tensor_ != nullptr; }
   // Return the contained host tensor.
   // REQUIRES: has_host_tensor()
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index a94b298f87832057c6ec86a1ea250a54ed1b4ee0..4c291d2383163e5def54657186c2190c023832fc 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -42,7 +42,7 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform",
         "//tensorflow/python:random_seed",
         "//tensorflow/python:session",
@@ -58,7 +58,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
@@ -72,7 +72,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
@@ -93,7 +93,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
@@ -111,7 +111,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:bitwise_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:math_ops_gen",
         "//tensorflow/python:nn_ops",
@@ -127,7 +127,7 @@ tf_xla_py_test(
     tags = ["optonly"],
     deps = [
         ":xla_test",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
     ],
@@ -141,7 +141,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
@@ -156,7 +156,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
@@ -170,7 +170,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
@@ -184,7 +184,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:array_ops_gen",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:gradient_checker",
         "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
@@ -196,9 +196,11 @@ tf_xla_py_test(
     name = "oom_test",
     size = "medium",
     srcs = ["oom_test.py"],
+    # TODO(b/80081500): Re-enable on GPU. Disabled on 2018-05-21.
     disabled_backends = [
         "cpu",
         "cpu_ondemand",
+        "gpu",
     ],
     tags = [
         # Allocates very large amounts of memory and does not work under TSAN.
@@ -209,7 +211,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:array_ops_gen",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:gradient_checker",
         "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
@@ -225,7 +227,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
@@ -241,7 +243,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
@@ -263,7 +265,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
@@ -291,7 +293,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -300,10 +302,14 @@ tf_xla_py_test(
     name = "extract_image_patches_op_test",
     size = "small",
     srcs = ["extract_image_patches_op_test.py"],
+    tags = [
+        "manual",
+        "notap",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -322,8 +328,12 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:function",
     ],
 )
 
@@ -338,7 +348,7 @@ tf_xla_py_test(
         "//tensorflow/contrib/signal:signal_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:extra_py_tests_deps",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:spectral_ops",
     ],
@@ -352,7 +362,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -364,7 +374,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
@@ -380,7 +390,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -395,12 +405,27 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:platform_test",
     ],
 )
 
+tf_xla_py_test(
+    name = "listdiff_op_test",
+    size = "small",
+    srcs = ["listdiff_op_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform_test",
+        "@six_archive//:six",
+    ],
+)
+
 tf_xla_py_test(
     name = "lrn_ops_test",
     size = "medium",
@@ -408,7 +433,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops_gen",
         "//tensorflow/python:platform_test",
@@ -423,7 +448,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -435,7 +460,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
@@ -449,7 +474,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
@@ -462,7 +487,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -475,7 +500,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
         "//tensorflow/python:platform_test",
@@ -490,7 +515,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
         "//tensorflow/python:platform_test",
@@ -507,7 +532,7 @@ tf_xla_py_test(
     ],
     deps = [
         ":xla_test",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
     ],
@@ -522,7 +547,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
@@ -538,7 +563,7 @@ tf_xla_py_test(
         "//tensorflow/compiler/tf2xla/python:xla",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
@@ -551,7 +576,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
     ],
 )
 
@@ -563,7 +588,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -575,7 +600,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
@@ -590,7 +615,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
@@ -603,7 +628,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:math_ops_gen",
         "//tensorflow/python:platform_test",
@@ -618,7 +643,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
@@ -634,7 +659,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -647,7 +672,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/contrib/stateless",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -661,7 +686,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:math_ops_gen",
         "//tensorflow/python:nn_ops",
@@ -680,7 +705,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
@@ -693,7 +718,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
@@ -707,7 +732,7 @@ tf_xla_py_test(
     srcs = ["fused_batchnorm_test.py"],
     deps = [
         ":xla_test",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:math_ops_gen",
         "//tensorflow/python:nn",
@@ -726,7 +751,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:math_ops_gen",
         "//tensorflow/python:nn_ops",
@@ -745,7 +770,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/compiler/tf2xla/python:xla",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
     ],
@@ -760,7 +785,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -772,7 +797,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -785,21 +810,34 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
 
-cuda_py_test(
+tf_xla_py_test(
     name = "xla_device_test",
     size = "small",
     srcs = ["xla_device_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "xla_device_gpu_test",
+    size = "small",
+    srcs = ["xla_device_gpu_test.py"],
     additional_deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
     ],
 )
@@ -816,11 +854,22 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
-        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "dense_layer_test",
+    size = "small",
+    srcs = ["dense_layer_test.py"],
+    additional_deps = [
+        "//tensorflow/contrib/compiler:compiler_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:layers",
         "//tensorflow/python:variables",
     ],
 )
@@ -864,7 +913,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:variables",
@@ -879,7 +928,7 @@ cuda_py_test(
         ":xla_test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:gradients",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
@@ -917,7 +966,7 @@ tf_xla_py_test(
     srcs = ["fake_quant_ops_test.py"],
     deps = [
         ":xla_test",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -929,7 +978,7 @@ tf_xla_py_test(
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
     ],
 )
diff --git a/tensorflow/compiler/tests/argminmax_test.py b/tensorflow/compiler/tests/argminmax_test.py
index ec547e16cd9c91a1e25bc963b9a3cafddf7326cd..9d3a889b1f54c813e881bb03b5275f809af1b3c8 100644
--- a/tensorflow/compiler/tests/argminmax_test.py
+++ b/tensorflow/compiler/tests/argminmax_test.py
@@ -29,51 +29,70 @@ from tensorflow.python.platform import test
 
 class ArgMinMaxTest(xla_test.XLATestCase):
 
-  def _assertOpOutputMatchesExpected(self, op, inp, expected):
-    """Verifies that 'op' produces 'expected' when fed input 'inp' .
+  def _assertOpOutputMatchesExpected(self, op, axis, output_type, op_input,
+                                     expected):
+    """Verifies that 'op' produces 'expected' when fed input 'op_input' .
 
     Args:
-      op: operator to test
-      inp: numpy input array to use as input to 'op'.
+      op: argmin or argmax operator to test.
+      axis: integer axis to reduce across.
+      output_type: numpy datatype of the output to produce.
+      op_input: numpy input array to use as input to 'op'.
       expected: numpy array representing the expected output of 'op'.
     """
     with self.test_session() as session:
       with self.test_scope():
         pinp = array_ops.placeholder(
-            dtypes.as_dtype(inp.dtype), inp.shape, name="a")
-        output = op(pinp)
-      result = session.run(output, {pinp: inp})
+            dtypes.as_dtype(op_input.dtype), op_input.shape, name="a")
+        output = op(pinp, axis=axis, output_type=output_type)
+      result = session.run(output, {pinp: op_input})
       self.assertAllEqual(result, expected)
 
   def testArgMinMax(self):
     # Complex numbers do not support argmin/argmax.
     minmax_types = set(self.numeric_types) - set(self.complex_types)
     for dtype in minmax_types:
-      self._assertOpOutputMatchesExpected(
-          lambda x: math_ops.argmax(x, axis=0, output_type=dtypes.int32),
-          np.array([1, 10, 27, 3, 3, 4], dtype=dtype),
-          expected=np.int32(2))
-      self._assertOpOutputMatchesExpected(
-          lambda x: math_ops.argmax(x, axis=0, output_type=dtypes.int32),
-          np.array([[4, 1, 7], [3, 2, 4]], dtype=dtype),
-          expected=np.array([0, 1, 0], dtype=np.int32))
-      self._assertOpOutputMatchesExpected(
-          lambda x: math_ops.argmax(x, axis=1, output_type=dtypes.int32),
-          np.array([[4, 1], [3, 2]], dtype=dtype),
-          expected=np.array([0, 0], dtype=np.int32))
+      # output_type is a numpy data type that is used to specify the desired
+      # output type of the op as well as to convert the Python number to the
+      # array scalar of the type.
+      for output_type in self.int_types:
+        self._assertOpOutputMatchesExpected(
+            math_ops.argmax,
+            axis=0,
+            output_type=output_type,
+            op_input=np.array([1, 10, 27, 3, 3, 4], dtype=dtype),
+            expected=output_type(2))
+        self._assertOpOutputMatchesExpected(
+            math_ops.argmax,
+            axis=0,
+            output_type=output_type,
+            op_input=np.array([[4, 1, 7], [3, 2, 4]], dtype=dtype),
+            expected=np.array([0, 1, 0], dtype=output_type))
+        self._assertOpOutputMatchesExpected(
+            math_ops.argmax,
+            axis=1,
+            output_type=output_type,
+            op_input=np.array([[4, 1], [3, 2]], dtype=dtype),
+            expected=np.array([0, 0], dtype=output_type))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: math_ops.argmin(x, axis=0, output_type=dtypes.int32),
-          np.array([3, 10, 27, 3, 2, 4], dtype=dtype),
-          expected=np.int32(4))
-      self._assertOpOutputMatchesExpected(
-          lambda x: math_ops.argmin(x, axis=0, output_type=dtypes.int32),
-          np.array([[4, 1, 7], [3, 2, 4]], dtype=dtype),
-          expected=np.array([1, 0, 1], dtype=np.int32))
-      self._assertOpOutputMatchesExpected(
-          lambda x: math_ops.argmin(x, axis=1, output_type=dtypes.int32),
-          np.array([[4, 1], [3, 2]], dtype=dtype),
-          expected=np.array([1, 1], dtype=np.int32))
+        self._assertOpOutputMatchesExpected(
+            math_ops.argmin,
+            axis=0,
+            output_type=output_type,
+            op_input=np.array([3, 10, 27, 3, 2, 4], dtype=dtype),
+            expected=output_type(4))
+        self._assertOpOutputMatchesExpected(
+            math_ops.argmin,
+            axis=0,
+            output_type=output_type,
+            op_input=np.array([[4, 1, 7], [3, 2, 4]], dtype=dtype),
+            expected=np.array([1, 0, 1], dtype=output_type))
+        self._assertOpOutputMatchesExpected(
+            math_ops.argmin,
+            axis=1,
+            output_type=output_type,
+            op_input=np.array([[4, 1], [3, 2]], dtype=dtype),
+            expected=np.array([1, 1], dtype=output_type))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..865f60ccab46ec6829e49409508303052944e13b
--- /dev/null
+++ b/tensorflow/compiler/tests/dense_layer_test.py
@@ -0,0 +1,135 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DenseLayer JIT compilation on the CPU and GPU devices."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+
+from tensorflow.contrib.compiler import jit
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.layers import layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+jit_scope = jit.experimental_jit_scope
+
+
+def GetRunMetadataLabels(run_metadata):
+  """Returns all labels in run_metadata."""
+  labels = []
+  for dev_stats in run_metadata.step_stats.dev_stats:
+    for node_stats in dev_stats.node_stats:
+      labels.append(node_stats.timeline_label)
+  return labels
+
+
+def InLabels(labels, substr):
+  """Returns true iff one of the labels contains substr."""
+  return any([substr in x for x in labels])
+
+
+def XlaLaunchOpCount(labels):
+  """Count how many XlaLaunch labels are present."""
+  return sum("XlaLaunch(" in x for x in labels)
+
+
+class DenseLayerTest(test.TestCase):
+
+  def testDenseLayerAutoJit(self):
+    """Tests dense layer compilation in auto-jit mode.
+
+    Dense layer should be compiled into a single XlaLaunch op in auto-jit mode.
+    """
+
+    os.environ["TF_XLA_FLAGS"] = ("--tf_xla_cpu_global_jit")
+    config = config_pb2.ConfigProto()
+    config.graph_options.optimizer_options.global_jit_level = (
+        config_pb2.OptimizerOptions.ON_1)
+
+    with self.test_session(config=config) as sess:
+      x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32)
+      y = layers.dense(x, 3)
+
+      sess.run(variables.initialize_all_variables())
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(
+          y, {x: np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])},
+          run_metadata=run_metadata,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+
+    labels = GetRunMetadataLabels(run_metadata)
+    self.assertEqual(1, XlaLaunchOpCount(labels))
+    self.assertFalse(InLabels(labels, "ListDiff"))
+
+  def testDenseLayerJitScopeDefinedShape(self):
+    """Tests that the dense layer node is properly compiled in jit scope.
+
+    Dense layer with static shape input tensor should be compiled into a single
+    XlaLaunch op by XLA.
+    """
+
+    with self.test_session() as sess:
+      x = array_ops.placeholder(shape=[2, 2, 3], dtype=np.float32)
+      with jit_scope():
+        y = layers.dense(x, 3)
+
+      sess.run(variables.initialize_all_variables())
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(
+          y, {x: np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])},
+          run_metadata=run_metadata,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+
+    labels = GetRunMetadataLabels(run_metadata)
+    self.assertEqual(1, XlaLaunchOpCount(labels))
+    # No need to check whether ListDiff is compiled or not because ListDiff op
+    # is not used when input tensor shape is fully defined.
+
+  def testDenseLayerJitScopeUndefinedShape(self):
+    """Tests that the dense layer node is properly compiled in jit scope.
+
+    Dense layer uses shape op to get shape of input tensor if its shape is not
+    fully defined. XLA does not cluster shape op with other operators. But in
+    experimental_jit_scope, XLA is forced to compile shape op into its own
+    cluster, causing dense layer to be split into TWO XlaLaunch ops.
+    """
+
+    with self.test_session() as sess:
+      x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32)
+      with jit_scope():
+        y = layers.dense(x, 3)
+
+      sess.run(variables.initialize_all_variables())
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(
+          y, {x: np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])},
+          run_metadata=run_metadata,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+
+    labels = GetRunMetadataLabels(run_metadata)
+    self.assertEqual(2, XlaLaunchOpCount(labels))
+    self.assertFalse(InLabels(labels, "ListDiff"))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
index bdd0185dfe4abe9d9acecc5381ff82c54b8c0705..52d8d6d295c428f2c3466ef2963223cc978b4277 100644
--- a/tensorflow/compiler/tests/eager_test.py
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -24,10 +24,16 @@ from tensorflow.compiler.tests.xla_test import XLATestCase
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.layers import convolutional
+from tensorflow.python.layers import pooling
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import googletest
 
@@ -43,7 +49,7 @@ class EagerTest(XLATestCase):
 
   def testExecuteListOutputLen0(self):
     with self.test_scope():
-      empty = constant_op.constant([], dtype=dtypes.int32)
+      empty = constant_op.constant([], dtype=dtypes.float32)
       result = array_ops.unstack(empty, 0)
       self.assertTrue(isinstance(result, list))
       self.assertEqual(0, len(result))
@@ -51,7 +57,7 @@ class EagerTest(XLATestCase):
   def testExecuteListOutputLen1(self):
     with self.test_scope():
       split_dim = constant_op.constant(1)
-      value = constant_op.constant([[0, 1, 2], [3, 4, 5]])
+      value = constant_op.constant([[0., 1., 2.], [3., 4., 5.]])
       result = array_ops.split(value, 1, axis=split_dim)
       self.assertTrue(isinstance(result, list))
       self.assertEqual(1, len(result))
@@ -60,7 +66,7 @@ class EagerTest(XLATestCase):
   def testExecuteListOutputLen3(self):
     with self.test_scope():
       split_dim = constant_op.constant(1)
-      value = constant_op.constant([[0, 1, 2], [3, 4, 5]])
+      value = constant_op.constant([[0., 1., 2.], [3., 4., 5.]])
       result = array_ops.split(value, 3, axis=split_dim)
       self.assertTrue(isinstance(result, list))
       self.assertEqual(3, len(result))
@@ -131,7 +137,173 @@ class EagerTest(XLATestCase):
     self.assertEqual(2., grads[0][0].numpy())
 
 
-if __name__ == "__main__":
+class EagerFunctionTest(XLATestCase):
+
+  def testBasic(self):
+    with self.test_scope():
+      matmul = function.defun(math_ops.matmul, compiled=True)
+      t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+      sq = matmul(t, t, transpose_a=True)
+      self.assertAllEqual(sq.numpy().reshape(-1), [10, 14, 14, 20])
+
+  def testConv(self):
+    if 'GPU' in self.device:
+      # TODO(b/32333178)
+      self.skipTest('Current implementation of RandomStandardNormal kernel '
+                    'is very slow on GPU, and has been blacklisted.')
+    with self.test_scope():
+      data_format = 'channels_last'
+      conv = convolutional.Conv2D(
+          filters=1, kernel_size=2, padding='VALID',
+          data_format=data_format, activation=nn_ops.relu,
+          kernel_initializer=init_ops.ones_initializer(),
+          bias_initializer=init_ops.zeros_initializer())
+      pool = pooling.MaxPooling2D(2, 2, data_format=data_format)
+
+      def model(x):
+        x = conv(x)
+        return pool(x)
+      model = function.defun(model, compiled=True)
+
+      x = array_ops.ones([1, 4, 4, 1])
+      y = model(x)
+      self.assertAllEqual(y.numpy(), [[[[4.]]]])
+
+  def testReadVariable(self):
+    with self.test_scope():
+      v = resource_variable_ops.ResourceVariable(1.0)
+
+      @function.defun(compiled=True)
+      def f():
+        return v.read_value()
+
+      var = f()
+      self.assertEqual(1.0, var.numpy())
+
+  def testUpdateVariable(self):
+    with self.test_scope():
+      v = resource_variable_ops.ResourceVariable(1.0)
+
+      def f(v):
+        v.assign_add(1.0)
+        return v
+
+      f = function.defun(f, compiled=True)
+
+      var = f(v)
+      self.assertEqual(2.0, var.numpy())
+
+  def testAllArgumentKinds(self):
+    """Test a complex function that takes different argument kinds.
+
+    tf2xla machinery that translates, compiles, and runs defuns
+    classifies arguments into: compile-time constants, regular tensors,
+    and resources. This test creates a function with a mix of all these
+    kinds. Moreover, the order of function arguments is intentionally mixed up.
+
+    This also tests the case when the same argument is a compile-time constant
+    as well as used in an operation that normally expects its inputs to be
+    in device memory - addition in this case.
+    """
+    with self.test_scope():
+      def foo(c1, r1, v1, c2, v2, r2):
+        # c1 and c2 are compile-time constants
+        # r1 and r2 are regular tensors
+        # v1 and v2 are resource variables
+        a = c1 + r1
+        b = math_ops.cast(c2, dtypes.float32) + v2
+        c = array_ops.slice(v1, c1, c2)
+        d = r2 * v2
+        return a, b, c, d
+
+      foo = function.defun(foo, compiled=True)
+
+      c1 = [0, 0]
+      c2 = array_ops.ones([2], dtype=dtypes.int32)
+
+      r1 = array_ops.ones([2])
+      r2 = [[2., 2.], [3., 3.]]
+
+      v1 = resource_variable_ops.ResourceVariable([[1., 2.], [3., 4.]])
+      v2 = resource_variable_ops.ResourceVariable([[10., 20.], [30., 40.]])
+
+      a, b, c, d = foo(c1, r1, v1, c2, v2, r2)
+
+      self.assertAllEqual([1, 1], a.numpy())
+      self.assertAllEqual([[11., 21.], [31., 41.]], b.numpy())
+      self.assertAllEqual([[1.]], c.numpy())
+      self.assertAllEqual([[20., 40.], [90., 120.]], d.numpy())
+
+  def testDefunInGradientTape(self):
+    with self.test_scope():
+      v0 = resource_variable_ops.ResourceVariable(5.0)
+
+      @function.defun(compiled=True)
+      def f(x):
+        x = v0 * v0 * x
+        return x
+
+      x = constant_op.constant(3.0)
+      with backprop.GradientTape() as tape:
+        y = f(x)
+      dy = tape.gradient(y, v0)
+
+    self.assertEqual(75, y.numpy())
+    self.assertEqual(30, dy.numpy())
+
+
+class ExcessivePaddingTest(XLATestCase):
+  """Test that eager execution works with TPU flattened tensors.
+
+  Tensors that would normally be excessively padded when written
+  to TPU memory are reshaped to 1-D flat tensors.
+
+  This test case verifies that such tensors work with eager execution.
+
+  The flattening currently only happens on TPU, but tests should work
+  fine with all backends as flattening is transparent.
+  """
+
+  def testFromConstant(self):
+    with self.test_scope():
+      # Create constant of shape [100, 2, 1]. This tensor would be
+      # excessively padded on TPU.
+      tensor = constant_op.constant(100 * [[[10.0], [2.0]]])
+      # Use reduce_sum since it requires correctly working with
+      # a particular dimension.
+      reduced = math_ops.reduce_sum(tensor, axis=1)
+      self.assertAllEqual(100 * [[12.0]], reduced)
+
+  def testFromOperation(self):
+    with self.test_scope():
+      tensor = array_ops.ones([3, 100, 2, 2])
+      reduced = math_ops.reduce_sum(tensor, axis=[0, 2, 3])
+      self.assertAllEqual(100 * [12.0], reduced)
+
+  def testAsFunctionInput(self):
+    with self.test_scope():
+
+      @function.defun(compiled=True)
+      def f(x):
+        return math_ops.reduce_sum(x, axis=2)
+
+      tensor = constant_op.constant(100 * [[[10.0, 2.0]]])
+      reduced = f(tensor)
+      self.assertAllEqual(100 * [[12.0]], reduced)
+
+  def testAsFunctionOutput(self):
+    with self.test_scope():
+
+      @function.defun(compiled=True)
+      def f(x):
+        return x * constant_op.constant(100 * [[[10.0, 2.0]]])
+
+      y = f(3)
+      reduced = math_ops.reduce_sum(y, axis=2)
+      self.assertAllEqual(100 * [[36.0]], reduced)
+
+
+if __name__ == '__main__':
   ops.enable_eager_execution(
       config=config_pb2.ConfigProto(log_device_placement=True))
   googletest.main()
diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py
index fbc3c994d163a504351fcccd1ba71a0997e6516f..8a3f4b0bdc7a61d6cfa2ba7474ce8579e293a5c7 100644
--- a/tensorflow/compiler/tests/function_test.py
+++ b/tensorflow/compiler/tests/function_test.py
@@ -24,12 +24,10 @@ from tensorflow.compiler.tests.xla_test import XLATestCase
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import googletest
 
 
-@test_util.with_c_api
 class FunctionTest(XLATestCase):
 
   def testFunction(self):
diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py
index 1ad83d80409734efd1f5a0a9fc39f5b7d064d54b..4b0043b6b4c7fbf57ec1507b84adf18daaea9363 100644
--- a/tensorflow/compiler/tests/jit_test.py
+++ b/tensorflow/compiler/tests/jit_test.py
@@ -29,13 +29,11 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
-from tensorflow.python.layers import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 jit_scope = jit.experimental_jit_scope
@@ -80,10 +78,10 @@ def InLabels(labels, substr):
 
 
 def MetadataHasXlaLaunch(run_metadata):
-  """Returns true if there is a _XlaLaunch kernel in run_metadata's timeline."""
+  """Returns true if there is a XlaLaunch kernel in run_metadata's timeline."""
 
   # TODO(phawkins): find a less hacky way to test whether a kernel ran.
-  return InLabels(RunMetadataLabels(run_metadata), "_XlaLaunch")
+  return InLabels(RunMetadataLabels(run_metadata), "XlaLaunch")
 
 
 class JitLaunchTest(test.TestCase):
@@ -92,8 +90,8 @@ class JitLaunchTest(test.TestCase):
   # Verifies that the outputs match and that XLA was invoked. 'fn' must take
   # the same number of tensors as arguments that are in 'args', and must return
   # a tuple of output tensors.
-  # If 'require_kernel_launch' is True, then we verify that a _XlaLaunch node
-  # actually ran. However, it is sometimes possible for _XlaLaunch ops to be
+  # If 'require_kernel_launch' is True, then we verify that a XlaLaunch node
+  # actually ran. However, it is sometimes possible for XlaLaunch ops to be
   # constant-folded away, so the check is optional.
   def _compare(self, fn, args, require_kernel_launch=True, noinline=None):
     with session_lib.Session(config=NoRewriteSessionConfig()) as sess:
@@ -443,31 +441,14 @@ class XlaCompilationTest(test.TestCase):
     self.assertFalse(InLabels(labels, "Log"))
     self.assertTrue(InLabels(labels, "Reciprocal"))
     self.assertTrue(InLabels(labels, "Mul"))
-    self.assertFalse(InLabels(labels, "_XlaLaunch"))
+    self.assertFalse(InLabels(labels, "XlaLaunch"))
 
-    # Compile the backprop. One _XlaLaunch.
+    # Compile the backprop. One XlaLaunch.
     labels = _Run(compiled=True)
     self.assertFalse(InLabels(labels, "Log"))
     self.assertFalse(InLabels(labels, "Reciprocal"))
     self.assertFalse(InLabels(labels, "Mul"))
-    self.assertTrue(InLabels(labels, "_XlaLaunch"))
-
-  def testDenseLayer(self):
-    """Tests that the dense layer node is properly compiled."""
-
-    with self.test_session(config=NoRewriteSessionConfig()) as sess:
-      x = array_ops.placeholder(shape=[2, 3], dtype=np.float32)
-      with jit_scope():
-        y = layers.dense(x, 3)
-
-      sess.run(variables.initialize_all_variables())
-      run_metadata = config_pb2.RunMetadata()
-      sess.run(y, {x: np.array([[1, 2, 3], [4, 5, 6]])},
-               run_metadata=run_metadata,
-               options=config_pb2.RunOptions(
-                   trace_level=config_pb2.RunOptions.FULL_TRACE))
-
-    self.assert_(MetadataHasXlaLaunch(run_metadata))
+    self.assertTrue(InLabels(labels, "XlaLaunch"))
 
 
 class ElementWiseFusionTest(test.TestCase):
@@ -501,7 +482,7 @@ class ElementWiseFusionTest(test.TestCase):
               trace_level=config_pb2.RunOptions.FULL_TRACE))
 
       labels = RunMetadataLabels(run_metadata)
-      count = sum("_XlaLaunch(" in x for x in labels)
+      count = sum("XlaLaunch(" in x for x in labels)
 
       return output, count
 
diff --git a/tensorflow/compiler/tests/listdiff_op_test.py b/tensorflow/compiler/tests/listdiff_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..45a04f0cf56e88946b946bedacb25ce6da3121b4
--- /dev/null
+++ b/tensorflow/compiler/tests/listdiff_op_test.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for XLA listdiff operator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ListDiffTest(xla_test.XLATestCase):
+
+  def _testListDiff(self, x, y, out, idx):
+    for dtype in [dtypes.int32, dtypes.int64]:
+      for index_dtype in [dtypes.int32, dtypes.int64]:
+        with self.test_session() as sess:
+          x_tensor = ops.convert_to_tensor(x, dtype=dtype)
+          y_tensor = ops.convert_to_tensor(y, dtype=dtype)
+          with self.test_scope():
+            out_tensor, idx_tensor = array_ops.listdiff(
+                x_tensor, y_tensor, out_idx=index_dtype)
+            tf_out, tf_idx = sess.run([out_tensor, idx_tensor])
+        self.assertAllEqual(out, tf_out)
+        self.assertAllEqual(idx, tf_idx)
+        self.assertEqual(1, out_tensor.get_shape().ndims)
+        self.assertEqual(1, idx_tensor.get_shape().ndims)
+
+  def testBasic1(self):
+    self._testListDiff(x=[1, 2, 3, 4], y=[1, 2], out=[3, 4], idx=[2, 3])
+
+  def testBasic2(self):
+    self._testListDiff(x=[1, 2, 3, 4], y=[2], out=[1, 3, 4], idx=[0, 2, 3])
+
+  def testBasic3(self):
+    self._testListDiff(x=[1, 4, 3, 2], y=[4, 2], out=[1, 3], idx=[0, 2])
+
+  def testDuplicates(self):
+    self._testListDiff(x=[1, 2, 4, 3, 2, 3, 3, 1],
+                       y=[4, 2],
+                       out=[1, 3, 3, 3, 1],
+                       idx=[0, 3, 5, 6, 7])
+
+  def testRandom(self):
+    num_random_tests = 10
+    int_low = -7
+    int_high = 8
+    max_size = 50
+    for _ in xrange(num_random_tests):
+      x_size = np.random.randint(max_size + 1)
+      x = np.random.randint(int_low, int_high, size=x_size)
+      y_size = np.random.randint(max_size + 1)
+      y = np.random.randint(int_low, int_high, size=y_size)
+      out_idx = [(entry, pos) for pos, entry in enumerate(x) if entry not in y]
+      if out_idx:
+        out, idx = map(list, zip(*out_idx))
+      else:
+        out = []
+        idx = []
+      self._testListDiff(list(x), list(y), out, idx)
+
+  def testFullyOverlapping(self):
+    self._testListDiff(x=[1, 2, 3, 4], y=[1, 2, 3, 4], out=[], idx=[])
+
+  def testNonOverlapping(self):
+    self._testListDiff(x=[1, 2, 3, 4],
+                       y=[5, 6],
+                       out=[1, 2, 3, 4],
+                       idx=[0, 1, 2, 3])
+
+  def testEmptyX(self):
+    self._testListDiff(x=[], y=[1, 2], out=[], idx=[])
+
+  def testEmptyY(self):
+    self._testListDiff(x=[1, 2, 3, 4], y=[], out=[1, 2, 3, 4], idx=[0, 1, 2, 3])
+
+  def testEmptyXY(self):
+    self._testListDiff(x=[], y=[], out=[], idx=[])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index e53efc3091d8935e745122af29abd7b8063b1d01..16f293891d56d78885dd515bb7b9899faf0690f7 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -619,8 +619,8 @@ std::vector<int64> OpTest::ImageDims(TensorFormat format, int batch,
         dims.push_back(dim);
       }
       break;
-    case FORMAT_NCHW_VECT_C:
-      LOG(FATAL) << "FORMAT_NCHW_VECT_C not supported.";
+    default:
+      LOG(FATAL) << "Tensor format " << ToString(format) << " not supported.";
   }
   return dims;
 }
diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py
index 4336ebdbd184a081619f0a6951dd4514735c6eb6..b6f8390a45d43bf7666b90e14cc6ff2f3f61947e 100644
--- a/tensorflow/compiler/tests/stateless_random_ops_test.py
+++ b/tensorflow/compiler/tests/stateless_random_ops_test.py
@@ -86,6 +86,15 @@ class StatelessRandomOpsTest(XLATestCase):
         # seed were not fixed.
         self.assertTrue(self._chi_squared(y, 10) < 16.92)
 
+  def testRandomNormalIsFinite(self):
+    with self.test_session() as sess, self.test_scope():
+      for dtype in self._random_types():
+        seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+        x = stateless.stateless_random_uniform(
+            shape=[10000], seed=seed_t, dtype=dtype)
+        y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
+        self.assertTrue(np.all(np.isfinite(y)))
+
   def _normal_cdf(self, x):
     """Cumulative distribution function for a standard normal distribution."""
     return 0.5 + 0.5 * np.vectorize(math.erf)(x / math.sqrt(2))
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index ba79f393a8f9b24ac506d2130957c38ecd442509..689a4a1f4e02f5dd48f64dc94afd0fcb50df8b5b 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -209,7 +209,8 @@ class UnaryOpsTest(XLATestCase):
       self._assertOpOutputMatchesExpected(
           math_ops.expm1,
           np.array([[-1, 1]], dtype=dtype),
-          expected=np.array([[-0.63212056, 1.71828183]], dtype=dtype))
+          expected=np.array([[-0.63212056, 1.71828183]], dtype=dtype),
+          rtol=1e-5)
 
       self._assertOpOutputMatchesExpected(
           math_ops.floor,
@@ -251,12 +252,12 @@ class UnaryOpsTest(XLATestCase):
           np.array([[1, 2]], dtype=dtype),
           expected=np.array([[0.540297, -0.41614]], dtype=dtype))
 
-      # TODO(b/34703906): improve log1p implementation and make tolerance
-      # tighter.
       self._assertOpOutputMatchesExpected(
           math_ops.log1p,
           np.array([[1e-14, 1e-15, 0.6]], dtype=dtype),
-          expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]], dtype=dtype)))
+          expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]], dtype=dtype)),
+          rtol=1e-4,
+          atol=1e-6)
 
       self._assertOpOutputMatchesExpected(
           math_ops.rint,
@@ -333,13 +334,19 @@ class UnaryOpsTest(XLATestCase):
 
       self._assertOpOutputMatchesExpected(
           nn_ops.elu,
-          np.array([[-1, 0, 1]], dtype=dtype),
-          expected=np.array([[-0.63212056, 0, 1]], dtype=dtype))
+          np.array([[-1, 0, 1, -1e-6]], dtype=dtype),
+          expected=np.array([[-0.63212056, 0, 1, -9.999995e-07]], dtype=dtype),
+          rtol=1e-5,
+          atol=1e-6)
 
       self._assertOpOutputMatchesExpected(
           nn_ops.selu,
-          np.array([[-1, 0, 1]], dtype=dtype),
-          expected=np.array([[-1.11133074, 0., 1.05070099]], dtype=dtype))
+          np.array([[-1, 0, 1, -1e-5]], dtype=dtype),
+          expected=np.array(
+              [[-1.11133074, 0., 1.05070099, -1.758090550379974e-05]],
+              dtype=dtype),
+          rtol=1e-5,
+          atol=1e-6)
 
       self._assertOpOutputMatchesExpected(
           nn_ops.relu,
@@ -419,7 +426,9 @@ class UnaryOpsTest(XLATestCase):
       self._assertOpOutputMatchesExpected(
           math_ops.expm1,
           np.array([[-1 + 2j, 3j, 2 - 3j]], dtype=dtype),
-          expected=np.expm1(np.array([[-1 + 2j, 3j, 2 - 3j]], dtype=dtype)))
+          expected=np.expm1(np.array([[-1 + 2j, 3j, 2 - 3j]], dtype=dtype)),
+          rtol=1e-6,
+          atol=1e-6)
 
       self._assertOpOutputMatchesExpected(
           math_ops.reciprocal,
@@ -441,13 +450,13 @@ class UnaryOpsTest(XLATestCase):
           np.array([[5j, 3 - 2j]], dtype=dtype),
           expected=np.cos(np.array([[5j, 3 - 2j]], dtype=dtype)))
 
-      # TODO(b/34703906): improve log1p implementation and make tolerance
-      # tighter.
       self._assertOpOutputMatchesExpected(
           math_ops.log1p,
           np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype),
           expected=np.log1p(
-              np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype)))
+              np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype)),
+          rtol=1e-4,
+          atol=1e-6)
 
       val = np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)
       self._assertOpOutputMatchesExpected(
@@ -789,7 +798,9 @@ class UnaryOpsTest(XLATestCase):
     zero = np.asarray(0).astype(dtype)
     expected = np.logaddexp(zero, features)
     self._assertOpOutputMatchesExpected(
-        nn_ops.softplus, features, expected=expected)
+        nn_ops.softplus, features, expected=expected,
+        rtol=1e-6,
+        atol=9.1e-6)
 
   def testSoftplus(self):
     for dtype in self.float_types:
diff --git a/tensorflow/compiler/tests/xla_device_gpu_test.py b/tensorflow/compiler/tests/xla_device_gpu_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e30ebd55d09fe00449fb67b92a8325f5809d89a
--- /dev/null
+++ b/tensorflow/compiler/tests/xla_device_gpu_test.py
@@ -0,0 +1,48 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test cases for XLA devices."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class XlaDeviceGpuTest(test.TestCase):
+
+  def testCopiesToAndFromGpuWork(self):
+    """Tests that copies between GPU and XLA devices work."""
+    if not test.is_gpu_available():
+      return
+
+    with session_lib.Session() as sess:
+      x = array_ops.placeholder(dtypes.float32, [2])
+      with ops.device("GPU"):
+        y = x * 2
+      with ops.device("device:XLA_CPU:0"):
+        z = y * y
+      with ops.device("GPU"):
+        w = y + z
+      result = sess.run(w, {x: [1.5, 0.5]})
+    self.assertAllClose(result, [12., 2.], rtol=1e-3)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py
index f5c228f8305d740b994dadc34c93b4e0ae32d785..b707bd0963d71d7c4b43b8d42752b4c50e9bbf7c 100644
--- a/tensorflow/compiler/tests/xla_device_test.py
+++ b/tensorflow/compiler/tests/xla_device_test.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,30 +18,33 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.framework import dtypes
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class XlaDeviceTest(test.TestCase):
+class XlaDeviceTest(XLATestCase):
 
   def testCopies(self):
-    """Tests that copies between GPU and XLA devices work."""
-    if not test.is_gpu_available():
-      return
-
-    with session_lib.Session() as sess:
-      x = array_ops.placeholder(dtypes.float32, [2])
-      with ops.device("GPU"):
-        y = x * 2
-      with ops.device("device:XLA_CPU:0"):
-        z = y * y
-      with ops.device("GPU"):
-        w = y + z
-      result = sess.run(w, {x: [1.5, 0.5]})
-    self.assertAllClose(result, [12., 2.], rtol=1e-3)
+    """Tests that copies onto and off XLA devices work."""
+    shapes = [[0], [1], [1, 0], [1024, 0], [1024, 1], [3, 777], [777, 3],
+              [16384, 1], [1, 16384], [1, 20000, 1, 1]]
+    for dtype in self.numeric_types:
+      for shape in shapes:
+        with self.test_session() as sess:
+          with ops.device("CPU"):
+            x = array_ops.placeholder(dtype, shape)
+          with self.test_scope():
+            y = x + x
+          with ops.device("CPU"):
+            z = array_ops.identity(y)
+
+          inputs = np.random.randint(-100, 100, shape).astype(dtype)
+          result = sess.run(z, {x: inputs})
+        self.assertAllCloseAccordingToType(result, inputs + inputs)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 4fca51f54d320e843343f80d7df1177f80f1d99f..cd57452302fcbde37d79ce760a80615a76d7ad8c 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -325,6 +325,7 @@ tf_cc_test(
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:cpu_plugin",
diff --git a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD
index 4f8bb8ad743afe69a6544c2ae0dc7309891b2df3..ea8d1b3d14939d4f4fba598318200f71c2eb0270 100644
--- a/tensorflow/compiler/tf2xla/cc/BUILD
+++ b/tensorflow/compiler/tf2xla/cc/BUILD
@@ -27,3 +27,25 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
     ],
 )
+
+tf_gen_op_wrapper_cc(
+    name = "xla_jit_op_gen",
+    out_ops_file = "ops/xla_jit_op",
+    deps = ["//tensorflow/compiler/jit/ops:xla_ops"],
+)
+
+cc_library(
+    name = "xla_jit_ops",
+    srcs = ["ops/xla_jit_op.cc"],
+    hdrs = ["ops/xla_jit_op.h"],
+    deps = [
+        "//tensorflow/cc:const_op",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/compiler/jit/ops:xla_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 8d1f2684909e876fe5521ba6a63d745c7d3956e0..42585ad4d8a17d71146e48b69f9fa56f9ff24c3e 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -282,7 +282,58 @@ Status BuildLoopBody(const Graph& graph, Frame* frame,
   return Status::OK();
 }
 
-Status FunctionalizeLoop(Graph* graph, Frame* frame,
+// Copy the FunctionDef of given function from lookup_library to library, if
+// it can be found in lookup_library but is missing from library.
+Status AddMissingFunctionByName(const string& function_name,
+                                const FunctionLibraryDefinition* lookup_library,
+                                FunctionLibraryDefinition* library) {
+  if (!library->Find(function_name) && lookup_library->Find(function_name)) {
+    return library->AddFunctionDef(*lookup_library->Find(function_name));
+  }
+  return Status::OK();
+}
+
+// Iterate over all functions that the given fdef refers to. Copy the missing
+// FunctionDefs from lookup_library to library.
+Status AddMissingFunctionDef(const FunctionDef& fdef,
+                             const FunctionLibraryDefinition* lookup_library,
+                             FunctionLibraryDefinition* library) {
+  TF_RET_CHECK(lookup_library);
+  for (const NodeDef& node : fdef.node_def()) {
+    if (library->Find(node.op())) {
+      continue;
+    }
+    // The function refered by 'SymbolicGradient' node is specified in its
+    // attribute 'f'.
+    if (node.op() == FunctionLibraryDefinition::kGradientOp) {
+      const AttrValue* attr =
+          AttrSlice(&node.attr()).Find(FunctionLibraryDefinition::kFuncAttr);
+      if (!attr) {
+        return errors::InvalidArgument("SymbolicGradient is missing attr: f");
+      }
+      const string& func_name = attr->func().name();
+      TF_RETURN_IF_ERROR(
+          AddMissingFunctionByName(func_name, lookup_library, library));
+      // Copy the user-defined gradient function if it exists.
+      const string grad_name = lookup_library->FindGradient(func_name);
+      if (!grad_name.empty() && library->FindGradient(func_name).empty()) {
+        TF_RETURN_IF_ERROR(
+            AddMissingFunctionByName(grad_name, lookup_library, library));
+        GradientDef grad_def;
+        grad_def.set_function_name(func_name);
+        grad_def.set_gradient_func(grad_name);
+        TF_RETURN_IF_ERROR(library->AddGradientDef(grad_def));
+      }
+    } else if (lookup_library->Find(node.op())) {
+      TF_RETURN_IF_ERROR(
+          library->AddFunctionDef(*lookup_library->Find(node.op())));
+    }
+  }
+  return Status::OK();
+}
+
+Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
+                         Graph* graph, Frame* frame,
                          FunctionLibraryDefinition* library) {
   VLOG(2) << "Frame " << frame->name << " before: "
           << dump_graph::DumpGraphToFile("functionalize_before", *graph,
@@ -489,6 +540,14 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
 
   TF_RETURN_IF_ERROR(library->AddFunctionDef(cond_fdef));
   TF_RETURN_IF_ERROR(library->AddFunctionDef(body_fdef));
+  if (lookup_library) {
+    // Copy missing FunctionDefs from lookup_library to library to make library
+    // self-contained.
+    TF_RETURN_IF_ERROR(
+        AddMissingFunctionDef(cond_fdef, lookup_library, library));
+    TF_RETURN_IF_ERROR(
+        AddMissingFunctionDef(body_fdef, lookup_library, library));
+  }
 
   // Builds a While operator.
   NodeDef while_def;
@@ -1365,6 +1424,12 @@ Status FunctionalizeCond::Functionalize(Graph* graph,
 // functional equivalents.
 Status FunctionalizeControlFlow(Graph* graph,
                                 FunctionLibraryDefinition* library) {
+  return FunctionalizeControlFlow(/*lookup_library=*/nullptr, graph, library);
+}
+
+Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
+                                Graph* graph,
+                                FunctionLibraryDefinition* library) {
   VLOG(2) << "FunctionalizeControlFlow (initial): "
           << dump_graph::DumpGraphToFile("functionalize_initial", *graph,
                                          library);
@@ -1434,7 +1499,8 @@ Status FunctionalizeControlFlow(Graph* graph,
       continue;
     }
 
-    TF_RETURN_IF_ERROR(FunctionalizeLoop(graph, frame, library));
+    TF_RETURN_IF_ERROR(
+        FunctionalizeLoop(lookup_library, graph, frame, library));
 
     // If the parent has no remaining children, add it to the worklist.
     --frame->parent->num_children;
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
index 4d4ee3054c2914bb614bf75f7a51be8f6292683e..d941041d15532446d1413f16fe64602bfb1a7daa 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.h
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
@@ -22,9 +22,13 @@ limitations under the License.
 namespace tensorflow {
 
 // Transformation that converts tf.while_loop() loops into functional While
-// operators, suitable for XLA compilation.
+// operators, suitable for XLA compilation. If lookup_library is provided, use
+// it to make the library for control flow self-contained.
 Status FunctionalizeControlFlow(Graph* graph,
                                 FunctionLibraryDefinition* library);
+Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
+                                Graph* graph,
+                                FunctionLibraryDefinition* library);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index e494f42e8ed254ac0c7c7a23a13728d3f015e9d3..14977a908ae2b0ff7e13b634c41b6d331b4b8a36 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -299,6 +299,131 @@ TEST(FunctionalizeControlFlow, OneLoopVar) {
   }
 }
 
+// @function.Defun(noinline=True)
+// def increment_fn(x):
+//   return [x + 1]
+// Define the above function, and add it to the given graph. It's used as the
+// while loop body in NoinlineLoopBody test.
+Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
+  FunctionDef fdef = FunctionDefHelper::Create(
+      "increment_fn", {"x:int32"}, {"add:int32"}, {},
+      {
+          {{"add/y"}, "Const", {}, {{"dtype", DT_INT32}}},
+          {{"add_0"}, "Add", {"x", "add/y:output:0"}, {{"T", DT_INT32}}},
+      },
+      {{"add", "add_0:z:0"}});
+  (*fdef.mutable_attr())["_noinline"].set_b(true);
+  FunctionDefLibrary fdef_lib;
+  *(fdef_lib.add_function()) = fdef;
+  TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdef_lib));
+  NodeDef increment_fn;
+  increment_fn.set_name(node_name);
+  increment_fn.set_op("increment_fn");
+  *increment_fn.add_input() = "while/Identity";
+  *increment_fn.add_input() = "^while/Identity";
+  Status status;
+  graph->AddNode(increment_fn, &status);
+  return status;
+}
+
+// Graph:
+// x = array_ops.placeholder(dtypes.int32)
+// y = control_flow_ops.while_loop(lambda i: i < 10, increment_fn, [x])
+TEST(FunctionalizeControlFlow, NoinlineLoopBody) {
+  const string& noinline_node_name = "while/increment_fn";
+  Graph graph(OpRegistry::Global());
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto dummy = ops::Placeholder(scope.WithOpName("Dummy"), DT_INT32);
+    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+    auto enter = ops::internal::Enter(scope.WithOpName("while/Enter"), source,
+                                      "while/while_context");
+    auto merge = ops::Merge(scope.WithOpName("while/Merge"),
+                            std::initializer_list<Input>{enter, dummy});
+    auto ten = ops::Const<int32>(
+        scope.WithOpName("while/Less/y").WithControlDependencies(merge.output),
+        10);
+    auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten);
+    auto loop_cond = ops::LoopCond(scope.WithOpName("while/LoopCond"), less);
+    auto switch_ =
+        ops::Switch(scope.WithOpName("while/Switch"), merge.output, loop_cond);
+    auto exit = ops::internal::Exit(scope.WithOpName("while/Exit"),
+                                    switch_.output_false);
+    auto identity =
+        ops::Identity(scope.WithOpName("while/Identity"), switch_.output_true);
+
+    TF_ASSERT_OK(AddNoinlineFunctionToGraph(noinline_node_name, scope.graph()));
+
+    NodeDef next_iter;
+    next_iter.set_name("while/NextIteration");
+    next_iter.set_op("NextIteration");
+    *next_iter.add_input() = noinline_node_name;
+    (*next_iter.mutable_attr())["T"].set_type(DT_INT32);
+
+    Status status;
+    Node* n = scope.graph()->AddNode(next_iter, &status);
+    TF_ASSERT_OK(status);
+
+    // Remove the dummy node and add the loop backedge.
+    scope.graph()->RemoveNode(dummy.node());
+    scope.graph()->AddEdge(n, 0, merge.output.node(), 1);
+    TF_ASSERT_OK(scope.ToGraph(&graph));
+  }
+
+  FunctionLibraryDefinition lookup_lib(graph.flib_def());
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  // Function increment_fn will be copied from lookup_lib to library.
+  TF_ASSERT_OK(FunctionalizeControlFlow(&lookup_lib, &graph, &library));
+
+  GraphDef graph_def;
+  graph.ToGraphDef(&graph_def);
+
+  NameAttrList cond_fn, body_fn;
+  TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+  // Outer graph
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+    auto while_op =
+        ops::XlaWhile(scope.WithOpName("while/LoopCond"),
+                      std::initializer_list<Input>{source}, cond_fn, body_fn);
+    GraphDef expected;
+    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, graph_def);
+  }
+
+  // Body graph.
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    TF_ASSERT_OK(AddNoinlineFunctionToGraph(noinline_node_name, scope.graph()));
+    auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
+    NodeDef retval;
+    retval.set_name("_retval0_RetVal");
+    retval.set_op(FunctionLibraryDefinition::kRetOp);
+    *retval.add_input() = noinline_node_name;
+    (*retval.mutable_attr())["T"].set_type(DT_INT32);
+    (*retval.mutable_attr())["index"].set_i(0);
+    Status status;
+    scope.graph()->AddNode(retval, &status);
+    TF_ASSERT_OK(status);
+
+    GraphDef expected;
+    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+
+    InstantiationResultForTest result;
+    // Verify that increment_fn has been copied to library.
+    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+    // Ignore the function library when comparing the graphs.
+    expected.clear_library();
+    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+  }
+}
+
 // Tests functionalizing OneLoopVar where the loop value is not used post the
 // loop.
 // Graph:
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index 8115a26210a8e9e95e851f350e34dcdfa2519a64..212f6f3966149ca0b2d2e012b19300e1f488f996 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -208,10 +208,11 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
   TF_RETURN_IF_ERROR(
       PrepareArguments(&xla_op_context, graph.get(), expressions, &arguments));
 
+  XlaCompiler::CompileOptions compile_options;
+  compile_options.is_entry_computation = false;
   XlaCompiler::CompilationResult result;
-
-  TF_RETURN_IF_ERROR(compiler->CompileFunction(XlaCompiler::CompileOptions(),
-                                               func, arguments, &result));
+  TF_RETURN_IF_ERROR(
+      compiler->CompileFunction(compile_options, func, arguments, &result));
 
   TF_RET_CHECK(arguments.size() == expressions.size());
 
@@ -229,11 +230,14 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
   auto output_handle = b->Call(*result.computation, handles);
   // The output handle of `Call` computation is a tuple type. Unzip it so
   // that it can fit into future computations.
+  int computation_output = 0;
   for (int64 i = 0; i < n->num_outputs(); ++i) {
     if (result.outputs[i].is_constant) {
       xla_op_context.SetConstantOutput(i, result.outputs[i].constant_value);
     } else {
-      xla_op_context.SetOutput(i, b->GetTupleElement(output_handle, i));
+      xla_op_context.SetOutput(
+          i, b->GetTupleElement(output_handle, computation_output));
+      ++computation_output;
     }
   }
   return b->first_error();
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 85ab4c41bf6a754236066260819f103970e603ae..e6da157c111ad9167bf7b1e743d9afbb8fb2ad03 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -45,6 +45,7 @@ tf_kernel_library(
         "image_resize_ops.cc",
         "index_ops.cc",
         "l2loss_op.cc",
+        "listdiff_op.cc",
         "lrn_ops.cc",
         "matmul_op.cc",
         "matrix_band_part_op.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/elu_op.cc b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
index ed7462c16615f7f63a174e29843c2a1675c17058..493781a1e68b8906f1a7e018e5710130e2eb08b5 100644
--- a/tensorflow/compiler/tf2xla/kernels/elu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
@@ -34,9 +34,8 @@ class EluOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaBuilder* b = ctx->builder();
     const auto zero = XlaHelpers::Zero(b, input_type(0));
-    const auto one = XlaHelpers::One(b, input_type(0));
     const auto pred = b->Gt(ctx->Input(0), zero);
-    const auto expm1 = b->Sub(b->Exp(ctx->Input(0)), one);
+    const auto expm1 = b->Expm1(ctx->Input(0));
     ctx->SetOutput(0, b->Select(pred, ctx->Input(0), expm1));
   }
 };
@@ -68,13 +67,12 @@ class SeluOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaBuilder* b = ctx->builder();
     const auto zero = XlaHelpers::Zero(b, input_type(0));
-    const auto one = XlaHelpers::One(b, input_type(0));
     const auto scale = XlaHelpers::FloatLiteral(b, input_type(0),
             1.0507009873554804934193349852946);
     const auto scale_alpha = XlaHelpers::FloatLiteral(b, input_type(0),
             1.7580993408473768599402175208123);
     const auto pred = b->Gt(ctx->Input(0), zero);
-    const auto expm1 = b->Sub(b->Exp(ctx->Input(0)), one);
+    const auto expm1 = b->Expm1(ctx->Input(0));
     ctx->SetOutput(0, b->Select(pred, b->Mul(scale, ctx->Input(0)),
                                       b->Mul(scale_alpha, expm1)));
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0388b4c830702ea00ec69fc42c6468326c88cf38
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// XLA-specific ListDiff Op. This only supports constant DT_INT32 and DT_INT64
+// input.
+
+#include <unordered_set>
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace {
+
+constexpr std::array<DataType, 2> kListDiffTypes = {DT_INT32, DT_INT64};
+
+// ListDiffOp is an XLA kernel that supports constant-only x and y input.
+class ListDiffOp : public XlaOpKernel {
+ public:
+  explicit ListDiffOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(context->InputShape(0)),
+                errors::InvalidArgument("ListDiff expects x as a vector, not ",
+                                        context->InputShape(0).DebugString()));
+
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(context->InputShape(1)),
+                errors::InvalidArgument("ListDiff expects y as a vector, not ",
+                                        context->InputShape(1).DebugString()));
+
+    DataType val_type = context->expected_output_dtype(0);
+    DataType idx_type = context->expected_output_dtype(1);
+
+    Status status;
+    switch (val_type) {
+      case DT_INT32:
+        status = ListDiffWithIndexType<int32>(context, idx_type);
+        break;
+      case DT_INT64:
+        status = ListDiffWithIndexType<int64>(context, idx_type);
+        break;
+      default:
+        // This should never happen since we restrict this kernel to only match
+        // inputs with supported Tensor datatype.
+        status = errors::InvalidArgument("ListDiff expects x and y as either ",
+                                         "int32 or int64, not ",
+                                         DataTypeString(val_type));
+    }
+    OP_REQUIRES_OK(context, status);
+  }
+
+ private:
+  template <typename Tval, typename Tidx>
+  Status ListDiff(XlaOpKernelContext* context) {
+    std::vector<int64> x_input, y_input;
+    TF_RETURN_IF_ERROR(context->ConstantInputAsIntVector(0, &x_input));
+    TF_RETURN_IF_ERROR(context->ConstantInputAsIntVector(1, &y_input));
+
+    std::unordered_set<Tval> y_input_set;
+    y_input_set.reserve(y_input.size());
+    for (auto y : y_input) {
+      y_input_set.insert(y);
+    }
+
+    std::vector<Tval> val_output;
+    std::vector<Tidx> idx_output;
+    auto x_size = x_input.size();
+    for (Tidx i = 0; i < x_size; ++i) {
+      if (y_input_set.count(x_input[i]) > 0) {
+        continue;
+      }
+      val_output.push_back(x_input[i]);
+      idx_output.push_back(i);
+    }
+
+    context->SetOutput(0, context->builder()->ConstantR1<Tval>(val_output));
+    context->SetOutput(1, context->builder()->ConstantR1<Tidx>(idx_output));
+    return Status::OK();
+  }
+
+  template <typename Tval>
+  Status ListDiffWithIndexType(XlaOpKernelContext* context, DataType idx_type) {
+    switch (idx_type) {
+      case DT_INT32:
+        return ListDiff<Tval, int32>(context);
+      case DT_INT64:
+        return ListDiff<Tval, int64>(context);
+      default:
+        return errors::InvalidArgument(
+            "ListDiff expects idx_out as either int32 or int64, not ",
+            DataTypeString(idx_type));
+    }
+  }
+};
+
+REGISTER_XLA_OP(Name("ListDiff")
+                    .TypeConstraint("T", kListDiffTypes)
+                    .CompileTimeConstInput("x")
+                    .CompileTimeConstInput("y"),
+                ListDiffOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
index 70547290eaed169599764a5d66185dde85345863..a711278638444be01fb865561957702368b75114 100644
--- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
@@ -55,18 +55,33 @@ class RetvalOp : public XlaOpKernel {
       }
 
       XlaContext& tc = XlaContext::Get(ctx);
-      if (input_shape.num_elements() == 0 || is_constant.ValueOrDie()) {
+      if (tc.resolve_compile_time_constants() &&
+          (input_shape.num_elements() == 0 || is_constant.ValueOrDie())) {
         xla::Literal literal;
         OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &literal));
         OP_REQUIRES_OK(ctx, tc.AddConstRetval(index_, dtype_, literal));
       } else {
-        // The core from which a return value is returned depends on the core
-        // assignment of the input to the retval .Since we can't change the core
-        // assignment of <input> as this point, create a tuple/get-tuple-element
-        // combination so that the core will be set on them.
-        auto tuple_elem =
-            ctx->builder()->GetTupleElement(ctx->builder()->Tuple({input}), 0);
-        tc.AddRetval(index_, dtype_, tuple_elem);
+        TensorShape shape = ctx->InputShape(0);
+        TensorShape representation_shape =
+            tc.is_entry_computation()
+                ? tc.RepresentationShape(shape, ctx->input_type(0))
+                : shape;
+
+        xla::XlaOp output = input;
+        if (tc.is_entry_computation()) {
+          output =
+              ctx->builder()->Reshape(input, representation_shape.dim_sizes());
+        } else {
+          // The core from which a return value is returned depends on the
+          // device assignment of the input to the retval. Since we can't change
+          // the device assignment of "input" at this point, we must always
+          // introduce an operator here, even if the shape does not change.
+          // TODO(b/76097077): propagate device assignments onto arguments and
+          // return values of functions, and then reshape unconditionally.
+          output = ctx->builder()->GetTupleElement(
+              ctx->builder()->Tuple({output}), 0);
+        }
+        tc.AddRetval(index_, dtype_, shape, output);
       }
     }
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
index 0ed4c4707df71cf5f56ccfe0af506916f04bcdb5..5d1c05268493f4f6404c40a4092a71f1e5b3f3b9 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
@@ -106,20 +106,40 @@ class ReverseSequenceOp : public XlaOpKernel {
           seq_lens, body_builder->Reshape(i, {1}), {1});
 
       // Indices is the offset of the batch element in the input.
-      auto indices = body_builder->Broadcast(
+      auto batch_element_indices = body_builder->Broadcast(
           XlaHelpers::Zero(body_builder.get(), seq_lens_type),
           {input_shape.dims()});
-      indices = body_builder->DynamicUpdateSlice(
-          indices, body_builder->Reshape(i, {1}),
+      batch_element_indices = body_builder->DynamicUpdateSlice(
+          batch_element_indices, body_builder->Reshape(i, {1}),
           body_builder->Reshape(
               XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type,
                                          batch_dim_),
               {1}));
 
-      // slice_indices is the offset of the start of the reversed sequence in
-      // the input.
-      auto slice_indices = body_builder->DynamicUpdateSlice(
-          indices,
+      // Slice out the current batch element and pad it out in the sequence
+      // dimension.
+      TensorShape slice_shape = input_shape;
+      slice_shape.set_dim(batch_dim_, 1);
+      slice_shape.set_dim(seq_dim_, max_seq_len);
+      auto slice = body_builder->DynamicSlice(output, batch_element_indices,
+                                              slice_shape.dim_sizes());
+      auto padding_config = xla::MakeNoPaddingConfig(slice_shape.dims());
+      padding_config.mutable_dimensions(seq_dim_)->set_edge_padding_high(
+          slice_shape.dim_size(seq_dim_));
+      slice = body_builder->Pad(
+          slice, XlaHelpers::Zero(body_builder.get(), input_type),
+          padding_config);
+
+      // Now slice out the reversed sequence from its actual start.
+      // sequence_start_indices is the offset of the start of the reversed
+      // sequence in the input. The slice will go into the padding, however, we
+      // will mask off these elements and replace them with elements from the
+      // original input so their values do not matter.
+      auto sequence_start_indices = body_builder->Broadcast(
+          XlaHelpers::Zero(body_builder.get(), seq_lens_type),
+          {slice_shape.dims()});
+      sequence_start_indices = body_builder->DynamicUpdateSlice(
+          sequence_start_indices,
           body_builder->Sub(XlaHelpers::IntegerLiteral(
                                 body_builder.get(), seq_lens_type, max_seq_len),
                             seq_len),
@@ -127,18 +147,12 @@ class ReverseSequenceOp : public XlaOpKernel {
               XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type,
                                          seq_dim_),
               {1}));
-
-      // Slice out the reversed sequence. The slice will overflow the end of the
-      // sequence, and the contents of the overflow are implementation-defined.
-      // However, we will mask off these elements and replace them with elements
-      // from the original input so their values do not matter.
-      TensorShape slice_shape = input_shape;
-      slice_shape.set_dim(batch_dim_, 1);
-      auto slice = body_builder->DynamicSlice(output, slice_indices,
-                                              slice_shape.dim_sizes());
+      slice = body_builder->DynamicSlice(slice, sequence_start_indices,
+                                         slice_shape.dim_sizes());
 
       // Shift the reversed sequence to the left.
-      output = body_builder->DynamicUpdateSlice(output, slice, indices);
+      output = body_builder->DynamicUpdateSlice(output, slice,
+                                                batch_element_indices);
 
       body_builder->Tuple(
           {body_builder->Add(
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index 6340c225185e68df638747def5b4fda3ef4c28ac..a99d4ddc7c4956f7144512a9bdf6f4c2eb0f944f 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -255,7 +255,8 @@ class StatelessRandomNormalOp : public XlaOpKernel {
                                         seed_shape.DebugString()));
     xla::XlaOp seed = ctx->Input(1);
     xla::XlaBuilder* builder = ctx->builder();
-    auto uniform = RandomUniform(builder, seed, shape, -1.0, 1.0);
+    auto uniform =
+        RandomUniform(builder, seed, shape, std::nextafter(-1.0f, 0.0f), 1.0);
     // Convert uniform distribution to normal distribution by computing
     // sqrt(2) * erfinv(x)
     auto normal = builder->Mul(builder->ConstantR0<float>(std::sqrt(2.0)),
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index a4f50f52ebe8b1ed7df862996d64e135ea1d0ac5..71a9fd051bfc8db09738a4bfe8ddde447895ecf0 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -100,8 +100,7 @@ XLAJIT_MAKE_UNARY(Cosh,
 XLAJIT_MAKE_UNARY(Sin, b->Sin(x));
 XLAJIT_MAKE_UNARY(Exp, b->Exp(x));
 
-// TODO(b/34703906): use a more accurate implementation of expm1.
-XLAJIT_MAKE_UNARY(Expm1, b->Sub(b->Exp(x), XlaHelpers::One(b, input_type(0))));
+XLAJIT_MAKE_UNARY(Expm1, b->Expm1(x));
 
 XLAJIT_MAKE_UNARY(Floor, b->Floor(x));
 XLAJIT_MAKE_UNARY(IsFinite, b->IsFinite(x));
@@ -115,8 +114,7 @@ XLAJIT_MAKE_UNARY(Inv, b->Div(XlaHelpers::One(b, input_type(0)), x));
 XLAJIT_MAKE_UNARY(Reciprocal, b->Div(XlaHelpers::One(b, input_type(0)), x));
 XLAJIT_MAKE_UNARY(Log, b->Log(x));
 
-// TODO(b/34703906): use a more accurate implementation of log1p.
-XLAJIT_MAKE_UNARY(Log1p, b->Log(b->Add(XlaHelpers::One(b, input_type(0)), x)));
+XLAJIT_MAKE_UNARY(Log1p, b->Log1p(x));
 
 XLAJIT_MAKE_UNARY(Invert, b->Not(x));
 XLAJIT_MAKE_UNARY(LogicalNot, b->Not(x));
@@ -160,24 +158,17 @@ XLAJIT_MAKE_UNARY(Sinh,
                   b->Mul(b->Sub(b->Exp(x), b->Exp(b->Neg(x))),
                          XlaHelpers::FloatLiteral(b, input_type(0), 0.5)));
 
-static xla::XlaOp Softplus(xla::XlaBuilder* b, DataType dtype,
-                           const xla::XlaOp& features) {
-  xla::XlaOp threshold = b->Add(b->Log(XlaHelpers::Epsilon(b, dtype)),
-                                XlaHelpers::FloatLiteral(b, dtype, 2.0));
-  // Value above which exp(x) may overflow, but softplus(x) == x
-  // is within machine epsilon.
-  xla::XlaOp too_large = b->Gt(features, b->Neg(threshold));
-  // Value below which exp(x) may underflow, but softplus(x) == exp(x)
-  // is within machine epsilon.
-  xla::XlaOp too_small = b->Lt(features, threshold);
-  xla::XlaOp features_exp = b->Exp(features);
-  xla::XlaOp output = b->Select(
-      too_large, features,
-      b->Select(too_small, features_exp,
-                b->Log(b->Add(features_exp, XlaHelpers::One(b, dtype)))));
-  return output;
-}
-XLAJIT_MAKE_UNARY(Softplus, Softplus(b, input_type(0), x));
+// softplus(x) = log(1 + exp(x))
+//
+// This is not numerically stable when x is large, it can easily overflow.
+// However, we can compute it as LogSumExp(x, 0):
+//   max(x, 0) + log(exp(x - max(x, 0)) + exp(0 - max(x, 0)))
+//
+// This is equivalent to:
+//   max(x, 0) + log1p(exp(-abs(x)))
+XLAJIT_MAKE_UNARY(Softplus,
+                  b->Add(b->Max(x, XlaHelpers::Zero(b, input_type(0))),
+                         b->Log1p(b->Exp(b->Neg(b->Abs(x))))));
 
 // softsign(x) = x / (abs(x) + 1)
 XLAJIT_MAKE_UNARY(Softsign,
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 04ad3694a0c0df9d43c706d428c3b8715e5ff8ca..ee7f5d510ab7a3ce7d3bbe843c5fefd362f79b7b 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -80,7 +80,6 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -141,7 +140,6 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index 83e73827862ca26a1a51bed72ab87768854c1e71..3f1384bc864abd882ebba2b90acbe0b1e664687a 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -214,7 +214,7 @@ xla::StatusOr<xla::XlaOp> Cholesky(xla::XlaBuilder* builder, xla::XlaOp a,
                                           /*lower=*/true,
                                           /*transpose_a=*/true,
                                           /*conjugate_a=*/false,
-                                          /*block_size=*/8));
+                                          /*block_size=*/block_size));
       TF_ASSIGN_OR_RETURN(
           l, UpdateSliceInMinorDims(builder, l, update, {i + k, i}));
     }
diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc
index 2c3cd658e0462368ac0b51938979b7a6815a7574..43e1c1e9fecec1c71db1509757251cb5d903ca49 100644
--- a/tensorflow/compiler/tf2xla/literal_util.cc
+++ b/tensorflow/compiler/tf2xla/literal_util.cc
@@ -40,7 +40,7 @@ Status HostTensorToLiteral(const Tensor& host_tensor, xla::Literal* literal) {
   return Status::OK();
 }
 
-Status CopyLiteralToHostTensor(const xla::Literal& literal,
+Status CopyLiteralToHostTensor(const xla::LiteralSlice& literal,
                                Tensor* host_tensor) {
   TF_RET_CHECK(xla::ShapeUtil::IsArray(literal.shape()) &&
                xla::ShapeUtil::ElementsIn(literal.shape()) ==
@@ -63,8 +63,8 @@ Status CopyLiteralToHostTensor(const xla::Literal& literal,
   return Status::OK();
 }
 
-Status LiteralToHostTensor(const xla::Literal& literal, DataType target_type,
-                           Tensor* host_tensor) {
+Status LiteralToHostTensor(const xla::LiteralSlice& literal,
+                           DataType target_type, Tensor* host_tensor) {
   TensorShape shape;
   TF_RETURN_IF_ERROR(XLAShapeToTensorShape(literal.shape(), &shape));
   *host_tensor = Tensor(target_type, shape);
diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h
index f283b0236811f8d52e8fe2982a74c11c92cd20d8..220bec15538c36fa30abef9e729b64dbbb9f72b3 100644
--- a/tensorflow/compiler/tf2xla/literal_util.h
+++ b/tensorflow/compiler/tf2xla/literal_util.h
@@ -36,13 +36,13 @@ Status HostTensorToLiteral(const Tensor& host_tensor, xla::Literal* literal);
 // derivable from the type of <literal>, because multiple tensorflow types map
 // to the same XLA type (e.g. INT32 and QINT32 both map to INT32 in
 // XLA).
-Status LiteralToHostTensor(const xla::Literal& literal, DataType target_type,
-                           Tensor* host_tensor);
+Status LiteralToHostTensor(const xla::LiteralSlice& literal,
+                           DataType target_type, Tensor* host_tensor);
 
 // Copies the contents of 'literal' to a previously allocated tensor
 // 'host_tensor'. The tensor and the literal must have the same number of
 // elements and the same type.
-Status CopyLiteralToHostTensor(const xla::Literal& literal,
+Status CopyLiteralToHostTensor(const xla::LiteralSlice& literal,
                                Tensor* host_tensor);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 3d1946c332b0f903b710a19fbb79fc9923e89c43..f7098917b191058c53a1d6a5923e80e5e8319d72 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -15,10 +15,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 
-#include <deque>
 #include <numeric>
+#include <vector>
 
-#include "tensorflow/compiler/tf2xla/const_analysis.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/functionalize_control_flow.h"
 #include "tensorflow/compiler/tf2xla/graph_compiler.h"
@@ -28,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
-#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/executor.h"
@@ -40,7 +38,6 @@ limitations under the License.
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
 namespace {
@@ -110,10 +107,10 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options)
   local_flib_runtime_ = local_pflr_->GetFLR(device_->name());
   flib_runtime_ = pflr_->GetFLR(device_->name());
 
-  // The default variable representation shape is the identity function.
-  if (!options_.variable_representation_shape_fn) {
-    options_.variable_representation_shape_fn =
-        [](const TensorShape& shape, DataType type) { return shape; };
+  // The default shape representation function is the identity.
+  if (!options_.shape_representation_fn) {
+    options_.shape_representation_fn = [](const TensorShape& shape,
+                                          DataType type) { return shape; };
   }
 }
 
@@ -230,20 +227,25 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
 
 // Computes the XLA shape for argument 'arg'.
 Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
+                                        bool is_entry_computation,
                                         xla::Shape* xla_shape) {
   switch (arg.kind) {
     case XlaCompiler::Argument::kConstant:
-      return TensorShapeToXLAShape(arg.type, arg.constant_value.shape(),
-                                   xla_shape);
-    case XlaCompiler::Argument::kParameter:
-      return TensorShapeToXLAShape(arg.type, arg.shape, xla_shape);
+      LOG(FATAL) << "Unreachable case";
+    case XlaCompiler::Argument::kParameter: {
+      TensorShape shape =
+          is_entry_computation
+              ? options_.shape_representation_fn(arg.shape, arg.type)
+              : arg.shape;
+      return TensorShapeToXLAShape(arg.type, shape, xla_shape);
+    }
     case XlaCompiler::Argument::kResource: {
       TF_RET_CHECK(arg.initialized);
 
       switch (arg.resource_kind) {
         case XlaResource::kVariable: {
           TensorShape representation_shape =
-              options_.variable_representation_shape_fn(arg.shape, arg.type);
+              options_.shape_representation_fn(arg.shape, arg.type);
           return TensorShapeToXLAShape(arg.type, representation_shape,
                                        xla_shape);
         }
@@ -337,16 +339,25 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr<Graph> graph,
 Status BuildComputation(
     const std::vector<XlaCompiler::Argument>& args,
     const std::vector<int>& arg_cores,
-    const std::vector<XlaExpression>& retvals,
+    const std::vector<XlaContext::Retval>& retvals,
     const std::vector<std::unique_ptr<XlaResource>>& resources,
     bool return_updated_values_for_all_resources, xla::XlaBuilder* builder,
     xla::XlaComputation* computation, int* num_computation_outputs,
     int* num_nonconst_outputs,
+    std::vector<XlaCompiler::OutputDescription>* outputs,
     std::vector<XlaCompiler::ResourceUpdate>* resource_updates) {
   std::vector<xla::XlaOp> elems;
   elems.reserve(retvals.size());
-  for (const XlaExpression& retval : retvals) {
-    if (!retval.has_constant_value()) {
+  for (int i = 0; i < retvals.size(); ++i) {
+    XlaCompiler::OutputDescription& output = (*outputs)[i];
+    output.type = retvals[i].type;
+    output.shape = retvals[i].shape;
+    const XlaExpression& retval = retvals[i].expression;
+    if (retval.has_constant_value()) {
+      output.is_constant = true;
+      output.constant_value = retval.constant_value();
+    } else {
+      output.is_constant = false;
       elems.push_back(retval.handle());
     }
   }
@@ -490,8 +501,8 @@ Status XlaCompiler::BuildArguments(
   std::vector<xla::Shape> arg_shapes(input_mapping->size());
   for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
     // Computes the shapes of non-constant arguments.
-    TF_RETURN_IF_ERROR(
-        XLAShapeForArgument(args[(*input_mapping)[i]], &arg_shapes[i]));
+    TF_RETURN_IF_ERROR(XLAShapeForArgument(
+        args[(*input_mapping)[i]], is_entry_computation, &arg_shapes[i]));
   }
 
   if (use_tuple_arg) {
@@ -567,7 +578,8 @@ Status XlaCompiler::BuildArguments(
 
   builder->ClearOpMetadata();
 
-  // Fill in the handles in non-constant arguments.
+  // Fill in the handles in non-constant arguments, and reshape parameters
+  // back to their correct shapes.
   VLOG(2) << "XLA computation inputs:";
   for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
     const XlaCompiler::Argument& arg = args[input_mapping->at(i)];
@@ -586,7 +598,15 @@ Status XlaCompiler::BuildArguments(
         break;
       }
       case XlaCompiler::Argument::kParameter:
-        arg_expression.set_handle(arg_handles[i]);
+        // Reshape parameters back to their correct shapes.
+        // TODO(b/76097077): propagate device assignments onto arguments and
+        // return values of functions, and then reshape unconditionally.
+        if (is_entry_computation) {
+          arg_expression.set_handle(
+              builder->Reshape(arg_handles[i], arg.shape.dim_sizes()));
+        } else {
+          arg_expression.set_handle(arg_handles[i]);
+        }
         break;
       case XlaCompiler::Argument::kConstant:
       case XlaCompiler::Argument::kInvalid:
@@ -658,13 +678,14 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   // Converts Tensorflow's graph control-flow constructs into functional
   // control-flow that can be compiled into XLA code.
   TF_RETURN_IF_ERROR(
-      FunctionalizeControlFlow(graph.get(), local_flib_def_.get()));
+      FunctionalizeControlFlow(flib_runtime_->GetFunctionLibraryDefinition(),
+                               graph.get(), local_flib_def_.get()));
 
   xla::XlaBuilder builder(name);
-  XlaContext* context =
-      new XlaContext(this, &builder, options_.allow_cpu_custom_calls,
-                     options.resolve_compile_time_constants,
-                     &options_.variable_representation_shape_fn);
+  XlaContext* context = new XlaContext(
+      this, &builder, options_.allow_cpu_custom_calls,
+      options.resolve_compile_time_constants, options.is_entry_computation,
+      &options_.shape_representation_fn);
   core::ScopedUnref context_unref(context);
 
   std::vector<XlaExpression> arg_expressions;
@@ -681,35 +702,22 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   int num_nonconst_outputs;
   int num_computation_outputs;
   result->computation = std::make_shared<xla::XlaComputation>();
+  result->outputs.resize(context->retvals().size());
   TF_RETURN_IF_ERROR(BuildComputation(
       args, arg_cores, context->retvals(), context->resources(),
       options.return_updated_values_for_all_resources, &builder,
       result->computation.get(), &num_computation_outputs,
-      &num_nonconst_outputs, &result->resource_updates));
+      &num_nonconst_outputs, &result->outputs, &result->resource_updates));
 
   VLOG(2) << "Outputs: total: " << context->retvals().size()
           << " nonconstant: " << num_nonconst_outputs;
-  result->outputs.resize(context->retvals().size());
-  for (std::vector<XlaExpression>::size_type i = 0;
-       i < context->retvals().size(); ++i) {
-    const XlaExpression& retval = context->retvals()[i];
-    if (retval.has_constant_value()) {
-      OutputDescription& output = result->outputs[i];
-      output.shape = retval.constant_value().shape();
-      output.is_constant = true;
-      output.constant_value = retval.constant_value();
-    }
-  }
 
-  // Compute the output shapes, if there is a computation with non-constant
+  // Compute the XLA output shape, if there is a computation with non-constant
   // outputs.
-  auto computation_shape = client()->GetComputationShape(*result->computation);
-  if (!computation_shape.ok()) {
-    return computation_shape.status();
-  }
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::ProgramShape> computation_shape,
+                      client()->GetComputationShape(*result->computation));
 
-  result->xla_output_shape.Swap(
-      computation_shape.ValueOrDie()->mutable_result());
+  result->xla_output_shape.Swap(computation_shape->mutable_result());
   VLOG(2) << "XLA output shape: "
           << xla::ShapeUtil::HumanString(result->xla_output_shape);
 
@@ -724,23 +732,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   // Tensorflow expects a major-to-minor order of results.
   xla::LayoutUtil::SetToDefaultLayout(&result->xla_output_shape);
 
-  // Converts the output shapes to TensorShapes.
-  int computation_output = 0;
-  for (std::vector<XlaExpression>::size_type i = 0;
-       i < context->retvals().size(); ++i) {
-    const XlaExpression& retval = context->retvals()[i];
-    if (!retval.has_constant_value()) {
-      TF_RET_CHECK(computation_output < num_computation_outputs)
-          << "Computation has more outputs than expected";
-      OutputDescription& output = result->outputs[i];
-      output.is_constant = false;
-      TF_RETURN_IF_ERROR(XLAShapeToTensorShape(
-          xla::ShapeUtil::GetTupleElementShape(result->xla_output_shape,
-                                               computation_output),
-          &output.shape));
-      ++computation_output;
-    }
-  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index ca6cd822ef4effd48dbc3cc18d35d6642f303df1..bf496bd8bc81e67056eba380288bca88737cc00d 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -38,7 +38,7 @@ class XlaContext;
 // It does a symbolic execution of the graph starting from specific input
 // shapes, using a JIT device to convert operators into XLA computations.
 //
-// XlaCompiler is typically invoked from an `_XlaLaunch` operator once the
+// XlaCompiler is typically invoked from an `XlaLaunch` operator once the
 // shapes of all input parameters to the computation are known. This is
 // because the symbolic execution requires known shapes for all operations.
 //
@@ -67,6 +67,15 @@ class XlaContext;
 // _Retval values are ordered by _Retval index, whereas kResource values are
 // ordered by the original _Arg position of the variable.
 //
+// If a shape representation function is provided as part of
+// XlaCompiler::CompileOptions, kParameter arguments and return values to an
+// entry computation will be reshaped in accordance to the shape function.
+// Arguments and return values to a non-entry computation are not reshaped.
+// Variable resource arguments are passed and returned in reshaped form, even
+// for non-entry computations. This feature allows TensorFlow to keep on-device
+// tensors with a different shape to their representation inside the XLA
+// computation.
+//
 // In both inputs and outputs, kResource values are placed the end. When
 // emitting While loop bodies, we must ensure that the loop body has
 // identical input and output signatures. By moving variable values
@@ -171,7 +180,7 @@ class XlaCompiler {
   };
 
   struct OutputDescription {
-    // Type and shape of the output.
+    // Type and shape of the output. The shape is the unflattened shape.
     DataType type;
     TensorShape shape;
 
@@ -206,10 +215,12 @@ class XlaCompiler {
     // original arguments, and are not necessarily in the same order.)
     std::vector<int> input_mapping;
 
-    // Input shapes of the computation.
+    // Input shapes of the computation. If we are flattening inputs, these are
+    // the flattened shapes.
     std::vector<xla::Shape> xla_input_shapes;
 
-    // Output shape in XLA format. The output shape is always a tuple.
+    // Output shape in XLA format. The output shape is always a tuple. If we
+    // are flattening outputs, these are the flattened shapes.
     xla::Shape xla_output_shape;
 
     // TensorFlow shapes of outputs, together with the values of any
@@ -230,6 +241,8 @@ class XlaCompiler {
     std::shared_ptr<xla::XlaComputation> computation;
   };
 
+  typedef std::function<TensorShape(const TensorShape&, DataType)>
+      ShapeRepresentationFn;
   struct Options {
     // Name of the compilation device to use. Needs to be live only during
     // XlaCompiler's constructor.
@@ -250,8 +263,7 @@ class XlaCompiler {
     // If set, the XLA representation of variables represented to XLA as the
     // shape given by this shape function. Variables are reshaped to this shape
     // on write, and reshaped to their original shape on read.
-    std::function<TensorShape(const TensorShape&, DataType)>
-        variable_representation_shape_fn;
+    ShapeRepresentationFn shape_representation_fn;
 
     // If not nullptr, populate_resource_manager is called with the
     // compilation device's resource manager when the compilation
@@ -300,7 +312,8 @@ class XlaCompiler {
   // Returns the shape of the XLA parameter for an argument 'arg'.
   // See the class comment for more details about the argument passing
   // convention.
-  Status XLAShapeForArgument(const Argument& arg, xla::Shape* xla_shape);
+  Status XLAShapeForArgument(const Argument& arg, bool is_entry_computation,
+                             xla::Shape* xla_shape);
 
   // Retrieves the channel handle associated with `key`. Allocates
   // a new channel handle if none exists.
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 6b8918b26179735a4518a422fed024fa534122f5..55772ca324872f6d5fac008de7819b7fae64966a 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -25,12 +25,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -225,7 +227,7 @@ TEST_F(XlaCompilerTest, Simple) {
       xla::Literal::CreateR1<int32>({4, 143});
   std::unique_ptr<xla::Literal> expected_literal =
       xla::Literal::MakeTuple({expected0.get()});
-  xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal));
 }
 
 TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
@@ -320,7 +322,8 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
         xla::Literal::CreateR1<int32>({-7, -42});
     std::unique_ptr<xla::Literal> expected_literal =
         xla::Literal::MakeTuple({expected0.get()});
-    xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
+    EXPECT_TRUE(
+        xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal));
   }
 
   {
@@ -355,10 +358,80 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
         xla::Literal::CreateR1<int32>({-7, -42});
     std::unique_ptr<xla::Literal> expected =
         xla::Literal::MakeTuple({expected0.get(), expected1.get()});
-    xla::LiteralTestUtil::ExpectEqual(*expected, *actual_literal);
+    EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected, *actual_literal));
   }
 }
 
+TEST_F(XlaCompilerTest, ConstantOutputsOfFunctionalNode) {
+  // Define a function with one compile-time constant output and one
+  // data-dependent output.
+  // @function.Defun(noinline=True)
+  // foo(a) {b=7; return b, a; }
+  const Tensor seven = test::AsScalar<int>(7);
+  FunctionDef fdef = FunctionDefHelper::Create(
+      "foo", {"a_0:int32"}, {"const:int32", "a:int32"}, {},
+      {
+          {{"Const"}, "Const", {}, {{"dtype", DT_INT32}, {"value", seven}}},
+      },
+      {{"a", "a_0"}, {"const", "Const:output:0"}});
+  (*fdef.mutable_attr())["_noinline"].set_b(true);
+  FunctionDefLibrary fdef_lib;
+  *(fdef_lib.add_function()) = fdef;
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    TF_EXPECT_OK(scope.graph()->AddFunctionLibrary(fdef_lib));
+    auto arg = ops::_Arg(scope.WithOpName("input_arg"), DT_INT32, 0);
+    NodeDef foo;
+    foo.set_name("foo");
+    foo.set_op("foo");
+    *foo.add_input() = "input_arg";
+    Status status;
+    scope.graph()->AddNode(foo, &status);
+    TF_ASSERT_OK(status);
+    NodeDef retval_1;
+    retval_1.set_name("retval_0");
+    retval_1.set_op(FunctionLibraryDefinition::kRetOp);
+    *retval_1.add_input() = "foo";
+    (*retval_1.mutable_attr())["T"].set_type(DT_INT32);
+    (*retval_1.mutable_attr())["index"].set_i(0);
+    scope.graph()->AddNode(retval_1, &status);
+    TF_ASSERT_OK(status);
+    NodeDef retval_2;
+    retval_2.set_name("retval_1");
+    retval_2.set_op(FunctionLibraryDefinition::kRetOp);
+    *retval_2.add_input() = "foo:1";
+    (*retval_2.mutable_attr())["T"].set_type(DT_INT32);
+    (*retval_2.mutable_attr())["index"].set_i(1);
+    scope.graph()->AddNode(retval_2, &status);
+    TF_ASSERT_OK(status);
+    TF_ASSERT_OK(scope.ToGraph(graph.get()));
+  }
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({1});
+
+  XlaCompiler::Options options = DefaultOptions();
+  FunctionLibraryDefinition flib_def(OpRegistry::Global(), fdef_lib);
+  options.flib_def = &flib_def;
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompileOptions compile_options;
+  compile_options.resolve_compile_time_constants = true;
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(compile_options, "constants",
+                                     std::move(graph), args, &result));
+
+  ASSERT_EQ(2, result.outputs.size());
+  EXPECT_TRUE(result.outputs[0].is_constant);
+  test::ExpectTensorEqual<int32>(result.outputs[0].constant_value,
+                                 test::AsScalar(7));
+  EXPECT_FALSE(result.outputs[1].is_constant);
+}
+
 // Tests compilation and execution of a graph that adds two tensors.
 TEST_F(XlaCompilerTest, ResourceManager) {
   // Builds a graph that calls the dummy resource Op.
@@ -523,7 +596,7 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) {
       {output_base.get(), output_grad1.get(), output_grad2.get()});
   std::unique_ptr<xla::Literal> expected_literal =
       xla::Literal::MakeTuple({output_read.get(), output_resource.get()});
-  xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal));
 }
 
 // Tests compilation and execution of a graph that adds two tensors.
@@ -746,13 +819,10 @@ TEST_F(XlaCompilerTest, Variables) {
       xla::Literal::CreateR1<int32>({4, 143});
   std::unique_ptr<xla::Literal> expected_literal =
       xla::Literal::MakeTuple({expected0.get(), expected1.get()});
-  xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal));
 }
 
-// Tests a simple graph that reads and writes a variable, with a
-// variable_representation_shape_fn passed to the compiler that flattens all
-// variable tensors to vectors.
-TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) {
+xla::StatusOr<std::unique_ptr<Graph>> BuildTestGraph() {
   Scope scope = Scope::NewRootScope().ExitOnError();
   auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
   auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1);
@@ -763,7 +833,15 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) {
   auto read_plus_one = ops::Add(scope, read, ops::Const<int32>(scope, 1));
   auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0);
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+  TF_RETURN_IF_ERROR(scope.ToGraph(graph.get()));
+  return std::move(graph);
+}
+
+// Tests a simple graph that reads and writes a variable, with a
+// shape_representation_fn passed to the compiler that flattens all
+// variable tensors to vectors.
+TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Graph> graph, BuildTestGraph());
 
   // Builds a description of the arguments.
   std::vector<XlaCompiler::Argument> args(2);
@@ -778,15 +856,33 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) {
 
   // Compiles the graph.
   XlaCompiler::Options options = DefaultOptions();
-  options.variable_representation_shape_fn = [](const TensorShape& shape,
-                                                DataType type) {
+  options.shape_representation_fn = [](const TensorShape& shape,
+                                       DataType type) {
     return TensorShape({shape.num_elements()});
   };
   XlaCompiler compiler(options);
 
+  XlaCompiler::CompileOptions compile_options;
+  compile_options.is_entry_computation = false;  // Only reshape variables.
+
   XlaCompiler::CompilationResult result;
-  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
-                                     std::move(graph), args, &result));
+  TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph),
+                                     args, &result));
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::ProgramShape> program_shape,
+                          client_->GetComputationShape(*result.computation));
+
+  ASSERT_EQ(program_shape->parameters_size(), 2);
+  EXPECT_TRUE(
+      xla::ShapeUtil::Compatible(program_shape->parameters(0),
+                                 xla::ShapeUtil::MakeShape(xla::S32, {2, 2})));
+  EXPECT_TRUE(xla::ShapeUtil::Compatible(
+      program_shape->parameters(1), xla::ShapeUtil::MakeShape(xla::S32, {4})));
+  EXPECT_TRUE(xla::ShapeUtil::Compatible(
+      program_shape->result(),
+      xla::ShapeUtil::MakeTupleShape(
+          {xla::ShapeUtil::MakeShape(xla::S32, {2, 2}),
+           xla::ShapeUtil::MakeShape(xla::S32, {4})})));
 
   // Tests that the generated computation works.
   std::unique_ptr<xla::Literal> param0_literal =
@@ -811,7 +907,76 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) {
       xla::Literal::CreateR1<int32>({26, 66, 34, 401});
   std::unique_ptr<xla::Literal> expected_literal =
       xla::Literal::MakeTuple({expected0.get(), expected1.get()});
-  xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal));
+}
+
+TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Graph> graph, BuildTestGraph());
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(2);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 2});
+  args[1].kind = XlaCompiler::Argument::kResource;
+  args[1].resource_kind = XlaResource::kVariable;
+  args[1].initialized = true;
+  args[1].type = DT_INT32;
+  args[1].shape = TensorShape({2, 2});
+
+  // Compiles the graph.
+  XlaCompiler::Options options = DefaultOptions();
+  options.shape_representation_fn = [](const TensorShape& shape,
+                                       DataType type) {
+    return TensorShape({shape.num_elements()});
+  };
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompileOptions compile_options;
+  compile_options.is_entry_computation = true;  // Reshape args and retvals.
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph),
+                                     args, &result));
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::ProgramShape> program_shape,
+                          client_->GetComputationShape(*result.computation));
+
+  ASSERT_EQ(program_shape->parameters_size(), 2);
+  EXPECT_TRUE(xla::ShapeUtil::Compatible(
+      program_shape->parameters(0), xla::ShapeUtil::MakeShape(xla::S32, {4})));
+  EXPECT_TRUE(xla::ShapeUtil::Compatible(
+      program_shape->parameters(1), xla::ShapeUtil::MakeShape(xla::S32, {4})));
+  EXPECT_TRUE(xla::ShapeUtil::Compatible(
+      program_shape->result(),
+      xla::ShapeUtil::MakeTupleShape(
+          {xla::ShapeUtil::MakeShape(xla::S32, {4}),
+           xla::ShapeUtil::MakeShape(xla::S32, {4})})));
+
+  // Tests that the generated computation works.
+  std::unique_ptr<xla::Literal> param0_literal =
+      xla::Literal::CreateR1<int32>({4, 55, 1, -3});
+  std::unique_ptr<xla::Literal> param1_literal =
+      xla::Literal::CreateR1<int32>({22, 11, 33, 404});
+  std::unique_ptr<xla::GlobalData> param0_data =
+      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
+  std::unique_ptr<xla::GlobalData> param1_data =
+      client_->TransferToServer(*param1_literal).ConsumeValueOrDie();
+
+  std::unique_ptr<xla::GlobalData> actual =
+      client_
+          ->Execute(*result.computation, {param0_data.get(), param1_data.get()})
+          .ConsumeValueOrDie();
+  std::unique_ptr<xla::Literal> actual_literal =
+      client_->Transfer(*actual).ConsumeValueOrDie();
+
+  std::unique_ptr<xla::Literal> expected0 =
+      xla::Literal::CreateR1<int32>({27, 67, 35, 402});
+  std::unique_ptr<xla::Literal> expected1 =
+      xla::Literal::CreateR1<int32>({26, 66, 34, 401});
+  std::unique_ptr<xla::Literal> expected_literal =
+      xla::Literal::MakeTuple({expected0.get(), expected1.get()});
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 3dd2d183f3a538786856dd8d92c5886b1cc237d8..098072d33cd4eb7f7dec0ec4196b43eca0220d4a 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -65,26 +65,30 @@ void XlaContext::set_args(std::vector<XlaExpression> args) {
 XlaContext::XlaContext(
     XlaCompiler* compiler, xla::XlaBuilder* builder,
     bool allow_cpu_custom_calls, bool resolve_compile_time_constants,
+    bool is_entry_computation,
     const std::function<TensorShape(const TensorShape&, DataType)>*
-        variable_representation_shape_fn)
+        shape_representation_fn)
     : compiler_(compiler),
       builder_(builder),
       allow_cpu_custom_calls_(allow_cpu_custom_calls),
       resolve_compile_time_constants_(resolve_compile_time_constants),
-      variable_representation_shape_fn_(variable_representation_shape_fn) {}
+      is_entry_computation_(is_entry_computation),
+      shape_representation_fn_(shape_representation_fn) {}
 
 string XlaContext::DebugString() { return "TLA JIT context"; }
 
 // This is called by the Retval Op to associate a computed value
 // with a specific return value of the subgraph.
 void XlaContext::AddRetval(int retval_index, DataType type,
-                           const xla::XlaOp& handle) {
+                           const TensorShape& shape, const xla::XlaOp& handle) {
   VLOG(1) << "Added retval index " << retval_index << " to XLA computation";
   // Add the return value to the list being built up.
   if (retvals_.size() <= retval_index) {
     retvals_.resize(retval_index + 1);
   }
-  retvals_[retval_index].set_handle(handle);
+  XlaExpression e;
+  e.set_handle(handle);
+  retvals_[retval_index] = Retval{type, shape, e};
 }
 
 Status XlaContext::AddConstRetval(int retval_index, DataType dtype,
@@ -94,13 +98,11 @@ Status XlaContext::AddConstRetval(int retval_index, DataType dtype,
   if (retvals_.size() <= retval_index) {
     retvals_.resize(retval_index + 1);
   }
-  if (resolve_compile_time_constants_) {
-    Tensor value;
-    TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, dtype, &value));
-    retvals_[retval_index].set_constant_value(std::move(value));
-  } else {
-    retvals_[retval_index].set_handle(builder_->ConstantLiteral(literal));
-  }
+  Tensor value;
+  TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, dtype, &value));
+  XlaExpression e;
+  e.set_constant_value(value);
+  retvals_[retval_index] = Retval{dtype, value.shape(), e};
   return Status::OK();
 }
 
@@ -117,9 +119,9 @@ Status XlaContext::CreateResource(
   return Status::OK();
 }
 
-TensorShape XlaContext::VariableRepresentationShape(const TensorShape& shape,
-                                                    DataType type) const {
-  return (*variable_representation_shape_fn_)(shape, type);
+TensorShape XlaContext::RepresentationShape(const TensorShape& shape,
+                                            DataType type) const {
+  return (*shape_representation_fn_)(shape, type);
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 1136ffe5073a8e7fd3c27d6ec7050cb1f8307584..341bf6ff1f37fa7cd81f41c02a941214067b1bd1 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -42,11 +42,13 @@ class XlaContext : public ResourceBase {
   static XlaContext& Get(const OpKernelContext* ctx);
   static XlaContext& Get(const XlaOpKernelContext* ctx);
 
-  // Creates a new XlaContext.
+  // Creates a new XlaContext. See the documentation on the class data fields
+  // for descriptions of the arguments.
   XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder,
              bool allow_cpu_custom_calls, bool resolve_compile_time_constants,
+             bool is_entry_computation,
              const std::function<TensorShape(const TensorShape&, DataType)>*
-                 variable_representation_shape_fn);
+                 shape_representation_fn);
 
   // Virtual method defined by ResourceBase.
   string DebugString() override;
@@ -58,14 +60,26 @@ class XlaContext : public ResourceBase {
 
   bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; }
 
+  bool resolve_compile_time_constants() const {
+    return resolve_compile_time_constants_;
+  }
+  bool is_entry_computation() const { return is_entry_computation_; }
+
   const std::vector<XlaExpression>& args() const { return args_; }
   void set_args(std::vector<XlaExpression> args);
 
-  const std::vector<XlaExpression>& retvals() { return retvals_; }
+  struct Retval {
+    DataType type;
+    TensorShape shape;
+    // An XlaExpression representing the Retval's value.
+    XlaExpression expression;
+  };
+  const std::vector<Retval>& retvals() { return retvals_; }
 
   // This is called by the Retval Op to associate a computed value
   // with a specific return value of the subgraph.
-  void AddRetval(int retval_index, DataType type, const xla::XlaOp& handle);
+  void AddRetval(int retval_index, DataType type, const TensorShape& shape,
+                 const xla::XlaOp& handle);
 
   // As for Retval, but for return values that are compile-time constants.
   Status AddConstRetval(int retval_index, DataType dtype,
@@ -86,9 +100,9 @@ class XlaContext : public ResourceBase {
   }
 
   // Returns the XLA shape to be used to represent a variable of TF `shape`
-  // and `type`.
-  TensorShape VariableRepresentationShape(const TensorShape& shape,
-                                          DataType type) const;
+  // and `type`, or of an argument or return value of a top-level computation.
+  TensorShape RepresentationShape(const TensorShape& shape,
+                                  DataType type) const;
 
   // Get an XLA lambda to compute Max. This is cached in the
   // XlaContext since it may be used by multiple Ops. There is a
@@ -131,15 +145,23 @@ class XlaContext : public ResourceBase {
   std::vector<XlaExpression> args_;
 
   // Return values of the Tensorflow graph, indexed by _Retval index.
-  std::vector<XlaExpression> retvals_;
+  std::vector<Retval> retvals_;
 
   // Holds ownership of resources. The resources are not ordered.
   std::vector<std::unique_ptr<XlaResource>> resources_;
 
-  // A function that describes how variable shapes should be represented
-  // in XLA. Variable values will be reshaped to this shape. Must be non-null.
+  // Is this a top-level computation, or an inner computation (e.g., a while
+  // body)?
+  const bool is_entry_computation_;
+
+  // A function that describes how the shapes of
+  // a) argument and return value, for entry computations
+  // b) variables, for all computations,
+  // should be represented in XLA. Parameters/return values will be shaped
+  // according to this function, and reshaped back to/from their declared shapes
+  // for computations. Must be non-null.
   const std::function<TensorShape(const TensorShape&, DataType)>*
-      variable_representation_shape_fn_;
+      shape_representation_fn_;
 
   // Cache of prebuilt computations indexed by their type.
   using ComputationMap = std::map<DataType, xla::XlaComputation>;
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 2b65f4d5d5936e062e5351a0723544191ffe2dfa..76c68d81af4dd9ec40fe6b1c33b03a876a0c6dc6 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -314,8 +314,8 @@ Status XlaOpKernelContext::ReadVariableInput(int index, DataType type,
   }
 
   XlaContext& xla_context = XlaContext::Get(context_);
-  TensorShape representation_shape = xla_context.VariableRepresentationShape(
-      variable->shape(), variable->type());
+  TensorShape representation_shape =
+      xla_context.RepresentationShape(variable->shape(), variable->type());
   if (representation_shape == variable->shape()) {
     *value = variable->value();
   } else {
@@ -436,7 +436,7 @@ Status XlaOpKernelContext::AssignVariable(int input_index, DataType type,
 
   XlaContext& xla_context = XlaContext::Get(context_);
   TensorShape representation_shape =
-      xla_context.VariableRepresentationShape(shape, type);
+      xla_context.RepresentationShape(shape, type);
   if (shape != representation_shape) {
     handle = builder()->Reshape(handle, representation_shape.dim_sizes());
   }
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index e309cb1e34db7f8430c2494c03aed41652b7a167..4692038b61f6871a8a16299fd4d11e963eb46a57 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -39,10 +39,10 @@ const char* const DEVICE_XLA_GPU = "XLA_GPU";
 
 static Status LaunchOpHasKernelForDevice(const DeviceType& device_type) {
   const OpDef* op_def;
-  TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef("_XlaLaunch", &op_def));
+  TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef("XlaLaunch", &op_def));
   NodeDef node_def;
   node_def.set_name("_XlaLaunch-op");
-  node_def.set_op("_XlaLaunch");
+  node_def.set_op("XlaLaunch");
   string kernel_class_name;
   TF_RETURN_IF_ERROR(FindKernelDef(device_type, node_def, /*KernelDef*/ nullptr,
                                    &kernel_class_name));
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 1af9cb6d2ab15a33b56f1df0410f47d7e139a1ba..c6deb959a59f7b79500a0948b4035ea56cd9b4a1 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -99,8 +99,9 @@ cc_library(
     hdrs = ["service_interface.h"],
     visibility = [":friends"],
     deps = [
+        ":status",
+        ":xla_data_proto",
         ":xla_proto",
-        "//tensorflow/core:lib",
     ],
 )
 
@@ -244,6 +245,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":protobuf_util",
+        ":status",
         ":status_macros",
         ":statusor",
         ":types",
@@ -302,13 +304,13 @@ cc_library(
         ":array2d",
         ":array3d",
         ":array4d",
-        ":shape_tree",
         ":shape_util",
         ":sparse_index_array",
         ":status_macros",
         ":types",
         ":util",
         ":xla_data_proto",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
 )
@@ -323,12 +325,30 @@ tf_cc_test(
         ":shape_util",
         ":test",
         ":types",
+        "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
 
+cc_library(
+    name = "error_spec",
+    hdrs = ["error_spec.h"],
+)
+
+cc_library(
+    name = "literal_comparison",
+    srcs = ["literal_comparison.cc"],
+    hdrs = ["literal_comparison.h"],
+    deps = [
+        ":error_spec",
+        ":literal_util",
+        ":util",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "metric_table_report",
     srcs = ["metric_table_report.cc"],
@@ -563,6 +583,7 @@ tf_cc_test(
         ":shape_util",
         ":test",
         ":xla_data_proto",
+        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index aac3273d5fd144f3b737529b0833c9328b3d0e4d..aacb394ae5f92aa0d87ee3a23bcc3d4ec5cd99a3 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -63,7 +63,6 @@ cc_library(
     srcs = ["client.cc"],
     hdrs = ["client.h"],
     deps = [
-        ":computation",
         ":global_data",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal_util",
@@ -76,7 +75,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
-        "//tensorflow/compiler/xla/service:session_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/core:lib",
     ],
 )
@@ -99,7 +98,6 @@ cc_library(
     hdrs = ["local_client.h"],
     deps = [
         ":client",
-        ":computation",
         ":executable_build_options",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:status_macros",
@@ -126,7 +124,6 @@ cc_library(
     hdrs = ["compile_only_client.h"],
     deps = [
         ":client",
-        ":computation",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
@@ -162,47 +159,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "computation",
-    srcs = ["computation.cc"],
-    hdrs = ["computation.h"],
-    deps = [
-        "//tensorflow/compiler/xla:service_interface",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/service:session_proto",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "computation_builder",
-    srcs = ["computation_builder.cc"],
-    hdrs = ["computation_builder.h"],
-    deps = [
-        ":client",
-        ":computation",
-        ":global_data",
-        ":padding",
-        "//tensorflow/compiler/xla:array",
-        "//tensorflow/compiler/xla:array2d",
-        "//tensorflow/compiler/xla:array3d",
-        "//tensorflow/compiler/xla:array4d",
-        "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_library(
     name = "sharding_builder",
     srcs = ["sharding_builder.cc"],
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 328e1b8fa84e7baaca41c6c9a65e9a1598ac32ae..c9d275a77b5cd40225f4b5c45e02c242d27d9aa1 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -161,22 +161,6 @@ Status Client::ResetDevice() {
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<Literal>> Client::ExecuteAndTransfer(
-    const Computation& computation,
-    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
-    const ExecutionOptions* execution_options,
-    ExecutionProfile* execution_profile) {
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<GlobalData> data,
-      Execute(computation, arguments, execution_options, execution_profile));
-
-  const Shape* shape_with_output_layout = nullptr;
-  if (execution_options && execution_options->has_shape_with_output_layout()) {
-    shape_with_output_layout = &execution_options->shape_with_output_layout();
-  }
-  return Transfer(*data, shape_with_output_layout);
-}
-
 StatusOr<std::unique_ptr<Literal>> Client::ExecuteAndTransfer(
     const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
@@ -221,65 +205,11 @@ StatusOr<std::unique_ptr<Literal>> Client::ComputeConstant(
   return Literal::CreateFromProto(response.literal());
 }
 
-StatusOr<Computation> Client::LoadSnapshot(const SessionModule& module) {
-  LoadComputationSnapshotRequest request;
-  *request.mutable_module() = module;
-  LoadComputationSnapshotResponse response;
-
-  Status s = stub_->LoadComputationSnapshot(&request, &response);
-  if (!s.ok()) {
-    return s;
-  }
-
-  VLOG(1) << "load snapshot response: " << response.ShortDebugString();
-  return Computation(stub_, response.computation());
-}
-
 StatusOr<XlaComputation> Client::LoadSnapshot(const HloSnapshot& module) {
   TF_RET_CHECK(module.has_hlo() && module.hlo().has_hlo_module());
   return XlaComputation(module.hlo().hlo_module());
 }
 
-StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
-    const Computation& computation,
-    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
-    const ExecutionOptions* execution_options,
-    ExecutionProfile* execution_profile) {
-  ExecuteRequest request;
-  *request.mutable_computation() = computation.handle();
-
-  if (execution_options == nullptr) {
-    *request.mutable_execution_options() = CreateDefaultExecutionOptions();
-  } else {
-    *request.mutable_execution_options() = *execution_options;
-  }
-  for (GlobalData* argument : arguments) {
-    CHECK(argument != nullptr) << "Argument pointers must not be null.";
-    *request.add_arguments() = argument->handle();
-  }
-
-  ExecuteResponse response;
-  VLOG(1) << "making execute request: " << request.ShortDebugString();
-  Status s = stub_->Execute(&request, &response);
-  VLOG(1) << "done with request";
-
-  if (!s.ok()) {
-    return s;
-  }
-
-  if (execution_profile != nullptr) {
-    *execution_profile = response.profile();
-    if (VLOG_IS_ON(1)) {
-      TF_ASSIGN_OR_RETURN(
-          auto execution_stats,
-          ExecutionStatsAsString(computation, response.profile()));
-      VLOG(1) << execution_stats;
-    }
-  }
-
-  return MakeUnique<GlobalData>(stub_, response.output());
-}
-
 StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
     const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
@@ -320,41 +250,6 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
   return MakeUnique<GlobalData>(stub_, response.output());
 }
 
-StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
-    tensorflow::gtl::ArraySlice<ComputationInstance> computations) {
-  ExecuteParallelRequest request;
-
-  for (const ComputationInstance& computation : computations) {
-    ExecuteRequest single_request;
-    *single_request.mutable_computation() = computation.computation.handle();
-    for (GlobalData* argument : computation.arguments) {
-      *single_request.add_arguments() = argument->handle();
-    }
-    *single_request.mutable_execution_options() = computation.execution_options;
-    *request.add_requests() = single_request;
-  }
-
-  ExecuteParallelResponse response;
-  VLOG(1) << "making execute-parallel request: " << request.ShortDebugString();
-  tensorflow::Status s = stub_->ExecuteParallel(&request, &response);
-  VLOG(1) << "done with request";
-
-  if (!s.ok()) {
-    return s;
-  }
-
-  std::vector<std::unique_ptr<GlobalData>> outputs;
-  for (size_t i = 0; i < computations.size(); ++i) {
-    outputs.push_back(
-        MakeUnique<GlobalData>(stub_, response.responses(i).output()));
-    if (computations[i].execution_profile != nullptr) {
-      *computations[i].execution_profile = response.responses(i).profile();
-    }
-  }
-
-  return std::move(outputs);
-}
-
 StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
     tensorflow::gtl::ArraySlice<XlaComputationInstance> computations) {
   ExecuteGraphParallelRequest request;
@@ -372,7 +267,7 @@ StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
   ExecuteParallelResponse response;
   VLOG(1) << "making execute-graph-parallel request: "
           << request.ShortDebugString();
-  tensorflow::Status s = stub_->ExecuteGraphParallel(&request, &response);
+  Status s = stub_->ExecuteGraphParallel(&request, &response);
   VLOG(1) << "done with request";
 
   if (!s.ok()) {
@@ -401,7 +296,7 @@ StatusOr<std::vector<DeviceHandle>> Client::GetDeviceHandles(
 
   GetDeviceHandlesResponse response;
   VLOG(1) << "making get device request: " << request.ShortDebugString();
-  tensorflow::Status s = stub_->GetDeviceHandles(&request, &response);
+  Status s = stub_->GetDeviceHandles(&request, &response);
   VLOG(1) << "done with request";
 
   if (!s.ok()) {
@@ -449,24 +344,6 @@ StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::DeconstructTuple(
   return std::move(handles);
 }
 
-StatusOr<ComputationStats> Client::GetComputationStats(
-    const Computation& computation, const DebugOptions& debug_options) const {
-  ComputationStatsRequest request;
-  *request.mutable_computation() = computation.handle();
-  *request.mutable_debug_options() = debug_options;
-  ComputationStatsResponse response;
-
-  VLOG(1) << "making computation stats request";
-  Status s = stub_->GetComputationStats(&request, &response);
-  VLOG(1) << "done with request";
-
-  if (!s.ok()) {
-    return s;
-  }
-  CHECK(response.has_stats());
-  return response.stats();
-}
-
 StatusOr<ComputationStats> Client::GetComputationStats(
     const XlaComputation& computation,
     const DebugOptions& debug_options) const {
@@ -488,23 +365,6 @@ StatusOr<ComputationStats> Client::GetComputationStats(
   return response.stats();
 }
 
-StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
-    const Computation& computation) {
-  GetComputationShapeRequest request;
-  *request.mutable_computation() = computation.handle();
-  GetComputationShapeResponse response;
-
-  VLOG(1) << "making get-computation-shape request";
-  Status s = stub_->GetComputationShape(&request, &response);
-  VLOG(1) << "done with request";
-
-  if (!s.ok()) {
-    return s;
-  }
-
-  return WrapUnique(response.release_program_shape());
-}
-
 StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
     const XlaComputation& computation) {
   TF_ASSIGN_OR_RETURN(const auto& result, computation.GetProgramShape());
@@ -527,28 +387,6 @@ StatusOr<Shape> Client::GetShape(const GlobalData& data) {
   return response.shape();
 }
 
-StatusOr<string> Client::ExecutionStatsAsString(
-    const Computation& computation, const ExecutionProfile& profile) {
-  TF_ASSIGN_OR_RETURN(
-      auto computation_stats,
-      GetComputationStats(computation,
-                          legacy_flags::GetDebugOptionsFromFlags()));
-  int64 total_flops =
-      computation_stats.flop_count() + computation_stats.transcendental_count();
-  if (profile.compute_time_ns() > 0) {
-    int64 nanoseconds = profile.compute_time_ns();
-    int64 cycle_count = profile.compute_cycle_count();
-    double gflops = total_flops / nanoseconds;
-    return tensorflow::strings::StrCat(
-        "[Execution Statistics] flop count: ", computation_stats.flop_count(),
-        ", transcendental count: ", computation_stats.transcendental_count(),
-        ", compute execution time: ", nanoseconds, " nsec",
-        ", compute cycles: ", cycle_count, ", performance: ", gflops,
-        "gflop/s");
-  }
-  return string("[Execution Statistics] not available.");
-}
-
 StatusOr<string> Client::ExecutionStatsAsString(
     const XlaComputation& computation, const ExecutionProfile& profile) {
   TF_ASSIGN_OR_RETURN(
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index a63ff4c56d1dd78c7abfa2bf163b5fbd54d82b2b..d57e2536d0b44cda46d7c1c2513b82c9f8a31c1b 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -19,11 +19,10 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service_interface.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -52,21 +51,6 @@ class Client {
   //   device is chosen by the service.
   // * If execution_profile is not nullptr then the pointed-to ExecutionProfile
   //   will be filled with profile data from the execution.
-  StatusOr<std::unique_ptr<GlobalData>> Execute(
-      const Computation& computation,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
-      const ExecutionOptions* execution_options = nullptr,
-      ExecutionProfile* execution_profile = nullptr);
-
-  // Executes the computation with the given arguments and returns the global
-  // data that was produced from the execution.
-  // * If execution_options is not nullptr, these options are passed to the
-  //   service to affect how it compiles our computation.  (The pointer does not
-  //   need to live beyond this call.)
-  // * If execution_profile is not nullptr then the pointed-to ExecutionProfile
-  //   will be filled with profile data from the execution.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<std::unique_ptr<GlobalData>> Execute(
       const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
@@ -78,34 +62,6 @@ class Client {
   //   executed on the devices associated with the handles by partitioning the
   //   computation based on the attached sharding attributes. Otherwise, a
   //   device is chosen by the service.
-  struct ComputationInstance {
-    const Computation& computation;
-    std::vector<GlobalData*> arguments;
-    ExecutionOptions execution_options;
-    ExecutionProfile* execution_profile;
-
-    ComputationInstance(const Computation& computation,
-                        std::vector<GlobalData*> arguments,
-                        ExecutionOptions execution_options,
-                        ExecutionProfile* execution_profile)
-        : computation(computation),
-          arguments(std::move(arguments)),
-          execution_options(execution_options),
-          execution_profile(execution_profile) {}
-  };
-
-  // Executes a list ComputationInstances and returns global data produced from
-  // each computation.
-  StatusOr<std::vector<std::unique_ptr<GlobalData>>> ExecuteParallel(
-      tensorflow::gtl::ArraySlice<ComputationInstance> computations);
-
-  // A struct to represent a computation instance to be executed.
-  // * If execution_options.device_handles is not empty, the computation is
-  //   executed on the devices associated with the handles by partitioning the
-  //   computation based on the attached sharding attributes. Otherwise, a
-  //   device is chosen by the service.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   struct XlaComputationInstance {
     const XlaComputation& computation;
     std::vector<GlobalData*> arguments;
@@ -125,7 +81,6 @@ class Client {
   // Executes a list XlaComputationInstances and returns global data produced
   // from each computation.
   //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<std::vector<std::unique_ptr<GlobalData>>> ExecuteParallel(
       tensorflow::gtl::ArraySlice<XlaComputationInstance> computations);
 
@@ -177,17 +132,6 @@ class Client {
   // Executes the computation with the given arguments and transfers the result
   // to the client as a literal. Parameters are defined the same as for
   // Execute() and Transfer().
-  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
-      const Computation& computation,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
-      const ExecutionOptions* execution_options = nullptr,
-      ExecutionProfile* execution_profile = nullptr);
-
-  // Executes the computation with the given arguments and transfers the result
-  // to the client as a literal. Parameters are defined the same as for
-  // Execute() and Transfer().
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
       const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
@@ -223,12 +167,6 @@ class Client {
       const GlobalData& data);
 
   // Retrieves the statistics of the given computation.
-  StatusOr<ComputationStats> GetComputationStats(
-      const Computation& computation, const DebugOptions& debug_options) const;
-
-  // Retrieves the statistics of the given computation.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<ComputationStats> GetComputationStats(
       const XlaComputation& computation,
       const DebugOptions& debug_options) const;
@@ -239,13 +177,6 @@ class Client {
 
   // As above, but returns the shape of the provided computation (parameter
   // types/names and return type).
-  StatusOr<std::unique_ptr<ProgramShape>> GetComputationShape(
-      const Computation& computation);
-
-  // As above, but returns the shape of the provided computation (parameter
-  // types/names and return type).
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<std::unique_ptr<ProgramShape>> GetComputationShape(
       const XlaComputation& computation);
 
@@ -253,9 +184,6 @@ class Client {
   // two computations via a pair of Send and Recv instructions.
   StatusOr<ChannelHandle> CreateChannelHandle();
 
-  StatusOr<Computation> LoadSnapshot(const SessionModule& module);
-
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<XlaComputation> LoadSnapshot(const HloSnapshot& module);
 
   ServiceInterface* stub() { return stub_; }
@@ -263,8 +191,6 @@ class Client {
  private:
   // Returns the execution statistics (e.g., gflop/s) as a string from the
   // ExecutionProfile returned from an execution of the computation.
-  StatusOr<string> ExecutionStatsAsString(const Computation& computation,
-                                          const ExecutionProfile& profile);
   StatusOr<string> ExecutionStatsAsString(const XlaComputation& computation,
                                           const ExecutionProfile& profile);
 
diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc
index 96e38bca01087991943aff40ed1cb3e21f9e6cba..dc69d2097ebe14ca0e14a39849d4fcae99024fdc 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.cc
+++ b/tensorflow/compiler/xla/client/compile_only_client.cc
@@ -21,24 +21,6 @@ limitations under the License.
 
 namespace xla {
 
-StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-CompileOnlyClient::CompileAheadOfTime(
-    const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
-    const AotCompilationOptions& options) {
-  std::vector<CompileOnlyService::AotComputationInstance> service_instances;
-  service_instances.reserve(computations.size());
-  for (const AotComputationInstance& instance : computations) {
-    service_instances.push_back({});
-    CompileOnlyService::AotComputationInstance& service_instance =
-        service_instances.back();
-    TF_RET_CHECK(instance.computation != nullptr);
-    service_instance.computation = instance.computation->handle();
-    service_instance.argument_layouts = instance.argument_layouts;
-    service_instance.result_layout = instance.result_layout;
-  }
-  return compiler_service_->CompileAheadOfTime(service_instances, options);
-}
-
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 CompileOnlyClient::CompileAheadOfTime(
     const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h
index c8725b8517484acdaf093bc3b34adb00f69155b1..f9a7c31270c7a11175f47a537639a97d0c9211af 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.h
+++ b/tensorflow/compiler/xla/client/compile_only_client.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
 
 #include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/compile_only_service.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
@@ -38,26 +37,7 @@ class CompileOnlyClient : public Client {
   CompileOnlyClient(const CompileOnlyClient&) = delete;
   void operator=(const CompileOnlyClient&) = delete;
 
-  // A description of a computation to compile using CompileAheadOfTime.
-  struct AotComputationInstance {
-    const Computation* computation;
-    // Inform the compiler of the expected layout for arguments.
-    std::vector<const Shape*> argument_layouts;
-    // Specifies the expected result layout.
-    const Shape* result_layout;
-  };
-
-  // Compiles a list of computations for ahead-of-time execution.  This is
-  // intended for use in static compilation. The |options| parameter describes
-  // the target for which the compiler should emit code.
-  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
-      const AotCompilationOptions& options);
-
   // A description of an xla computation to compile using CompileAheadOfTime.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   struct AotXlaComputationInstance {
     const XlaComputation* computation;
     // Inform the compiler of the expected layout for arguments.
@@ -69,8 +49,6 @@ class CompileOnlyClient : public Client {
   // Compiles a list of xla computations for ahead-of-time execution.  This is
   // intended for use in static compilation. The |options| parameter describes
   // the target for which the compiler should emit code.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(
       const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
diff --git a/tensorflow/compiler/xla/client/computation.cc b/tensorflow/compiler/xla/client/computation.cc
deleted file mode 100644
index e6c57bda0f0c4cb969939883efebcf3a6d6be381..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/computation.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/client/computation.h"
-
-#include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/core/lib/core/errors.h"
-
-namespace xla {
-
-Computation::Computation() : parent_(nullptr) {}
-
-Computation::Computation(ServiceInterface* parent,
-                         const ComputationHandle& handle)
-    : handle_(handle), parent_(parent) {}
-
-Computation::Computation(Computation&& computation)
-    : handle_(std::move(computation.handle_)), parent_(computation.parent_) {
-  computation.ResetWithoutFreeing();
-}
-
-void Computation::Reset() {
-  // TODO(b/34469253) deallocate any owned computation.
-  ResetWithoutFreeing();
-}
-
-StatusOr<std::unique_ptr<SessionModule>> Computation::Snapshot() const {
-  SnapshotComputationRequest request;
-  *request.mutable_computation() = handle_;
-  SnapshotComputationResponse response;
-
-  TF_RETURN_IF_ERROR(parent_->SnapshotComputation(&request, &response));
-
-  return WrapUnique(response.release_module());
-}
-
-Computation::~Computation() { Reset(); }
-
-Computation& Computation::operator=(Computation&& computation) {
-  if (&computation != this) {
-    Reset();
-    handle_ = computation.handle_;
-    parent_ = computation.parent_;
-    computation.ResetWithoutFreeing();
-  }
-  return *this;
-}
-
-void Computation::ResetWithoutFreeing() {
-  handle_.Clear();
-  parent_ = nullptr;
-}
-
-StatusOr<ProgramShape> Computation::GetProgramShape() const {
-  GetComputationShapeRequest request;
-  *request.mutable_computation() = handle_;
-  GetComputationShapeResponse response;
-
-  TF_RETURN_IF_ERROR(parent_->GetComputationShape(&request, &response));
-
-  return std::move(*response.mutable_program_shape());
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/computation.h b/tensorflow/compiler/xla/client/computation.h
deleted file mode 100644
index 9a1bcde76387297cb7f374b25baad1d5ec284859..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/computation.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_H_
-
-#include <memory>
-
-#include "tensorflow/compiler/xla/service/session.pb.h"
-#include "tensorflow/compiler/xla/service_interface.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/xla.pb.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/macros.h"
-
-namespace xla {
-
-// Wraps a ComputationHandle protobuf with a lifetime. Computation is
-// movable and not copyable to capture the same kind of unique
-// ownership that std::unique_ptr represents.
-//
-// TODO(b/74197823): Deprecated. Use XlaComputation instead.
-class Computation {
- public:
-  // Creates a null Computation.
-  Computation();
-
-  // parent: stub for the service on which we will deallocate the computation
-  //   when it is no longer needed.
-  // handle: the computation handle protobuf from the service.
-  Computation(ServiceInterface* parent, const ComputationHandle& handle);
-
-  Computation(Computation&& computation);
-
-  // Deallocates the computation.
-  ~Computation();
-
-  Computation& operator=(Computation&& computation);
-
-  // Returns the underlying handle.
-  const ComputationHandle& handle() const { return handle_; }
-
-  // Sets handle to a null state and clears any owned computation.
-  void Reset();
-
-  // Requests that we snapshot the computation into a serializable protocol
-  // buffer form.
-  StatusOr<std::unique_ptr<SessionModule>> Snapshot() const;
-
-  // Returns true if this object is a null Computation.
-  bool IsNull() const { return parent_ == nullptr; }
-
-  // Returns the "program shape" (parameter and return shapes) for this
-  // computation.
-  StatusOr<ProgramShape> GetProgramShape() const;
-
- private:
-  void ResetWithoutFreeing();
-
-  ComputationHandle handle_;  // Handle that is wrapped by this class.
-
-  // Stub that the handle is deallocated on when this object's lifetime ends.
-  ServiceInterface* parent_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(Computation);
-};
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_H_
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
deleted file mode 100644
index 83c7cb174402133706fbde6a734a29afd8edfe80..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ /dev/null
@@ -1,1574 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/client/computation_builder.h"
-
-#include <stddef.h>
-#include <array>
-#include <numeric>
-#include <set>
-#include <vector>
-
-#include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla.pb.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/protobuf.h"
-
-namespace xla {
-
-ComputationBuilder::ComputationBuilder(Client* client,
-                                       const string& computation_name)
-    : name_(computation_name), client_(client) {}
-
-ComputationBuilder::~ComputationBuilder() {}
-
-void ComputationBuilder::NoteError(const Status& error) {
-  if (die_immediately_on_error_) {
-    LOG(FATAL) << "error building computation: " << error;
-  }
-
-  if (first_error_.ok()) {
-    first_error_ = error;
-    first_error_backtrace_.CreateCurrent(/*skip_count=*/1);
-  }
-}
-
-std::unique_ptr<ComputationBuilder> ComputationBuilder::CreateSubBuilder(
-    const string& computation_name) {
-  auto sub_builder = MakeUnique<ComputationBuilder>(client_, computation_name);
-  sub_builder->parent_builder_ = this;
-  sub_builder->die_immediately_on_error_ = die_immediately_on_error_;
-  return sub_builder;
-}
-
-Status ComputationBuilder::PrepareComputation() {
-  TF_RETURN_IF_ERROR(first_error_);
-
-  if (!computation_.IsNull()) {
-    return Status::OK();
-  }
-
-  ComputationRequest request;
-  request.set_name(name_);
-  ComputationResponse response;
-
-  VLOG(2) << "making computation request";
-  Status s = client_->stub()->Computation(&request, &response);
-  VLOG(2) << "done with computation request";
-
-  if (!s.ok()) {
-    NoteError(s);
-    return first_error_;
-  }
-
-  computation_ = Computation(client_->stub(), response.computation());
-  return Status::OK();
-}
-
-Status ComputationBuilder::RunOp(OpRequest* op_request,
-                                 OpResponse* op_response) {
-  TF_RETURN_IF_ERROR(first_error_);
-  TF_RETURN_IF_ERROR(PrepareComputation());
-
-  // Fill in fields that are set on every OpRequest.
-  *op_request->mutable_computation() = computation_.handle();
-  *op_request->mutable_metadata() = metadata_;
-  if (sharding_) {
-    *op_request->mutable_sharding() = *sharding_;
-  }
-
-  const string& op_name =
-      OpRequest::descriptor()->FindFieldByNumber(op_request->op_case())->name();
-  VLOG(2) << "running op request: " << op_name;
-  Status status = client_->stub()->Op(op_request, op_response);
-  VLOG(2) << "done with op request: " << op_name;
-  return status;
-}
-
-void ComputationBuilder::RunOpAndNoteError(OpRequest* op_request) {
-  OpResponse op_response;
-  Status status = RunOp(op_request, &op_response);
-  if (!status.ok()) {
-    NoteError(status);
-  }
-}
-
-ComputationDataHandle ComputationBuilder::RunOpAndParseResponse(
-    OpRequest* op_request) {
-  OpResponse op_response;
-  Status status = RunOp(op_request, &op_response);
-  if (!status.ok()) {
-    NoteError(status);
-    return ComputationDataHandle();
-  }
-  if (op_response.output().handle() == 0) {
-    NoteError(InternalError("No output handle"));
-    return ComputationDataHandle();
-  }
-  return op_response.output();
-}
-
-bool ComputationBuilder::MakeWindow(
-    tensorflow::gtl::ArraySlice<int64> window_dimensions,
-    tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-    tensorflow::gtl::ArraySlice<int64> lhs_dilation,
-    tensorflow::gtl::ArraySlice<int64> rhs_dilation, Window* window) {
-  const auto verify_size = [&](const size_t x, const char* x_name) {
-    if (x == 0 || x == window_dimensions.size()) {
-      return true;
-    } else {
-      NoteError(InvalidArgument(
-          "%s", tensorflow::strings::StrCat(
-                    "Window has different number of window dimensions than of ",
-                    x_name, "\nNumber of window dimensions: ",
-                    window_dimensions.size(), "\nNumber of ", x_name, ": ", x,
-                    "\n")
-                    .c_str()));  //
-      return false;
-    }
-  };
-  if (!verify_size(window_strides.size(), "window strides") ||
-      !verify_size(padding.size(), "padding entries") ||
-      !verify_size(lhs_dilation.size(), "lhs dilation factors") ||
-      !verify_size(rhs_dilation.size(), "rhs dilation factors")) {
-    return false;
-  }
-
-  window->Clear();
-  for (size_t i = 0; i < window_dimensions.size(); i++) {
-    auto dim = window->add_dimensions();
-    dim->set_size(window_dimensions[i]);
-    if (!window_strides.empty()) {
-      dim->set_stride(window_strides[i]);
-    } else {
-      dim->set_stride(1);
-    }
-    if (!padding.empty()) {
-      dim->set_padding_low(padding[i].first);
-      dim->set_padding_high(padding[i].second);
-    } else {
-      dim->set_padding_low(0);
-      dim->set_padding_high(0);
-    }
-    if (!lhs_dilation.empty()) {
-      dim->set_base_dilation(lhs_dilation[i]);
-    } else {
-      dim->set_base_dilation(1);
-    }
-    if (!rhs_dilation.empty()) {
-      dim->set_window_dilation(rhs_dilation[i]);
-    } else {
-      dim->set_window_dilation(1);
-    }
-    dim->set_window_reversal(false);
-  }
-  return true;
-}
-
-ComputationDataHandle ComputationBuilder::ConstantLiteral(
-    const Literal& literal) {
-  OpRequest op_request;
-  ConstantRequest* request = op_request.mutable_constant_request();
-  *request->mutable_literal() = literal.ToProto();
-  VLOG(3) << "created constant: " << request->literal().ShortDebugString();
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Parameter(int64 parameter_number,
-                                                    const Shape& shape,
-                                                    const string& name) {
-  OpRequest op_request;
-  ParameterRequest* request = op_request.mutable_parameter_request();
-  *request->mutable_shape() = shape;
-  request->set_parameter(parameter_number);
-  request->set_name(name);
-  return RunOpAndParseResponse(&op_request);
-}
-
-StatusOr<std::unique_ptr<Shape>> ComputationBuilder::GetShapeWithoutNoteError(
-    const ComputationDataHandle& operand) {
-  GetLocalShapeRequest request;
-  *request.mutable_computation() = computation_.handle();
-  *request.mutable_operand() = operand;
-  GetLocalShapeResponse response;
-
-  VLOG(2) << "making get-shape request";
-  TF_RETURN_IF_ERROR(client_->stub()->GetLocalShape(&request, &response));
-  VLOG(2) << "done with request";
-
-  TF_RET_CHECK(response.has_shape());
-  std::unique_ptr<Shape> shape = WrapUnique(response.release_shape());
-  TF_RET_CHECK(shape != nullptr);
-  return std::move(shape);
-}
-
-StatusOr<std::unique_ptr<Shape>> ComputationBuilder::GetShape(
-    const ComputationDataHandle& operand) {
-  TF_RETURN_IF_ERROR(first_error_);
-
-  auto status_or_shape = GetShapeWithoutNoteError(operand);
-  if (!status_or_shape.ok()) {
-    NoteError(status_or_shape.status());
-    return first_error_;
-  }
-  return status_or_shape;
-}
-
-StatusOr<ProgramShape> ComputationBuilder::GetProgramShape() {
-  TF_RETURN_IF_ERROR(first_error_);
-
-  GetComputationShapeRequest request;
-  *request.mutable_computation() = computation_.handle();
-  GetComputationShapeResponse response;
-
-  VLOG(2) << "making get-program-shape-request";
-  Status status = client_->stub()->GetComputationShape(&request, &response);
-  VLOG(2) << "done with get-program-shape-request";
-
-  if (!status.ok()) {
-    first_error_ = status;
-    return status;
-  }
-
-  TF_RET_CHECK(response.has_program_shape());
-  return std::move(*response.mutable_program_shape());
-}
-
-ComputationDataHandle ComputationBuilder::Slice(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> start_indices,
-    tensorflow::gtl::ArraySlice<int64> limit_indices,
-    tensorflow::gtl::ArraySlice<int64> strides) {
-  OpRequest op_request;
-  SliceRequest* request = op_request.mutable_slice_request();
-  *request->mutable_operand() = operand;
-  for (int64 index : start_indices) {
-    request->add_start_indices(index);
-  }
-  for (int64 index : limit_indices) {
-    request->add_limit_indices(index);
-  }
-  for (int64 index : strides) {
-    request->add_strides(index);
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::SliceInDim(
-    const ComputationDataHandle& operand, int64 start_index, int64 limit_index,
-    int64 stride, int64 dimno) {
-  StatusOr<std::unique_ptr<Shape>> shape_status = GetShape(operand);
-  if (!shape_status.ok()) {
-    NoteError(shape_status.status());
-    return ComputationDataHandle{};
-  }
-  const Shape& shape = *shape_status.ValueOrDie();
-  std::vector<int64> starts(ShapeUtil::Rank(shape), 0);
-  std::vector<int64> limits(shape.dimensions().begin(),
-                            shape.dimensions().end());
-  std::vector<int64> strides(ShapeUtil::Rank(shape), 1);
-  starts[dimno] = start_index;
-  limits[dimno] = limit_index;
-  strides[dimno] = stride;
-  return Slice(operand, starts, limits, strides);
-}
-
-ComputationDataHandle ComputationBuilder::DynamicSlice(
-    const ComputationDataHandle& operand,
-    const ComputationDataHandle& start_indices,
-    tensorflow::gtl::ArraySlice<int64> slice_sizes) {
-  OpRequest op_request;
-  DynamicSliceRequest* request = op_request.mutable_dynamic_slice_request();
-  *request->mutable_operand() = operand;
-  *request->mutable_start_indices() = start_indices;
-  for (int64 index : slice_sizes) {
-    request->add_slice_sizes(index);
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::DynamicUpdateSlice(
-    const ComputationDataHandle& operand, const ComputationDataHandle& update,
-    const ComputationDataHandle& start_indices) {
-  OpRequest op_request;
-  DynamicUpdateSliceRequest* request =
-      op_request.mutable_dynamic_update_slice_request();
-  *request->mutable_operand() = operand;
-  *request->mutable_update() = update;
-  *request->mutable_start_indices() = start_indices;
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::ConcatInDim(
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-    int64 dimension) {
-  OpRequest op_request;
-  ConcatenateRequest* request = op_request.mutable_concatenate_request();
-  for (const ComputationDataHandle& operand : operands) {
-    *request->add_operands() = operand;
-  }
-  request->set_dimension(dimension);
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Broadcast(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> broadcast_sizes) {
-  OpRequest op_request;
-  BroadcastRequest* request = op_request.mutable_broadcast_request();
-  *request->mutable_operand() = operand;
-  for (int64 size : broadcast_sizes) {
-    request->add_broadcast_sizes(size);
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Pad(
-    const ComputationDataHandle& operand,
-    const ComputationDataHandle& padding_value,
-    const PaddingConfig& padding_config) {
-  OpRequest op_request;
-  PadRequest* request = op_request.mutable_pad_request();
-  *request->mutable_operand() = operand;
-  *request->mutable_padding_value() = padding_value;
-  *request->mutable_padding_config() = padding_config;
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Reshape(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> dimensions,
-    tensorflow::gtl::ArraySlice<int64> new_sizes) {
-  OpRequest op_request;
-  ReshapeRequest* request = op_request.mutable_reshape_request();
-  *request->mutable_operand() = operand;
-  for (int64 dimension : dimensions) {
-    request->add_dimensions(dimension);
-  }
-  for (int64 new_size : new_sizes) {
-    request->add_new_sizes(new_size);
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Reshape(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> new_sizes) {
-  if (!first_error_.ok()) {
-    return ComputationDataHandle();
-  }
-
-  StatusOr<std::unique_ptr<Shape>> shape = GetShape(operand);
-  if (!shape.ok()) {
-    return ComputationDataHandle();
-  }
-  std::vector<int64> dimensions(shape.ValueOrDie()->dimensions().size());
-  std::iota(dimensions.begin(), dimensions.end(), 0);
-  return Reshape(operand, dimensions, new_sizes);
-}
-
-ComputationDataHandle ComputationBuilder::Collapse(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> dimensions) {
-  if (!first_error_.ok()) {
-    return ComputationDataHandle();
-  }
-
-  // Don't support out-of-order collapse here.
-  // Checks that the collapsed dimensions are in order and consecutive.
-  for (tensorflow::gtl::ArraySlice<int64>::size_type i = 1;
-       i < dimensions.size(); ++i) {
-    if (dimensions[i] - 1 != dimensions[i - 1]) {
-      NoteError(InvalidArgument(
-          "Collapsed dimensions are not in order and consecutive."));
-      return ComputationDataHandle();
-    }
-  }
-
-  // Create a new sizes vector from the old shape, replacing the collapsed
-  // dimensions by the product of their sizes.
-  StatusOr<std::unique_ptr<Shape>> shape_or_status = GetShape(operand);
-  if (!shape_or_status.ok()) {
-    return ComputationDataHandle();
-  }
-  std::unique_ptr<Shape> original_shape = shape_or_status.ConsumeValueOrDie();
-
-  VLOG(3) << "original shape: " << ShapeUtil::HumanString(*original_shape);
-  VLOG(3) << "dims to collapse: "
-          << tensorflow::str_util::Join(dimensions, ",");
-
-  if (dimensions.size() <= 1) {
-    // Not collapsing anything, trivially we can return the operand versus
-    // enqueueing a trivial reshape.
-    return operand;
-  }
-
-  std::vector<int64> new_sizes;
-  for (int i = 0; i < ShapeUtil::Rank(*original_shape); ++i) {
-    if (i <= dimensions.front() || i > dimensions.back()) {
-      new_sizes.push_back(original_shape->dimensions(i));
-    } else {
-      new_sizes.back() *= original_shape->dimensions(i);
-    }
-  }
-
-  VLOG(3) << "new sizes: [" << tensorflow::str_util::Join(new_sizes, ",")
-          << "]";
-
-  return Reshape(operand, new_sizes);
-}
-
-void ComputationBuilder::Trace(const string& tag,
-                               const ComputationDataHandle& operand) {
-  OpRequest op_request;
-  TraceRequest* request = op_request.mutable_trace_request();
-  request->set_tag(tag);
-  *request->mutable_operand() = operand;
-  RunOpAndNoteError(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Select(
-    const ComputationDataHandle& pred, const ComputationDataHandle& on_true,
-    const ComputationDataHandle& on_false) {
-  return TernaryOp(TRIOP_SELECT, pred, on_true, on_false);
-}
-
-ComputationDataHandle ComputationBuilder::Tuple(
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> elements) {
-  OpRequest op_request;
-  VariadicOpRequest* request = op_request.mutable_variadic_op_request();
-  request->set_varop(VAROP_TUPLE);
-  for (const ComputationDataHandle& operand : elements) {
-    *request->add_operands() = operand;
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::GetTupleElement(
-    const ComputationDataHandle& tuple_data, int64 index) {
-  OpRequest op_request;
-  GetTupleElementRequest* request =
-      op_request.mutable_get_tuple_element_request();
-  *request->mutable_operand() = tuple_data;
-  request->set_index(index);
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Eq(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_EQ, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Ne(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_NE, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Ge(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_GE, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Gt(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_GT, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Le(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_LE, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Lt(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_LT, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Dot(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) {
-  StatusOr<std::unique_ptr<Shape>> lhs_shape_or_status = GetShape(lhs);
-  if (!lhs_shape_or_status.ok()) {
-    return ComputationDataHandle();
-  }
-  std::unique_ptr<Shape> lhs_shape = lhs_shape_or_status.ConsumeValueOrDie();
-
-  DotDimensionNumbers dimension_numbers;
-  dimension_numbers.add_lhs_contracting_dimensions(
-      lhs_shape->dimensions_size() == 1 ? 0 : 1);
-  dimension_numbers.add_rhs_contracting_dimensions(0);
-  return DotGeneral(lhs, rhs, dimension_numbers);
-}
-
-ComputationDataHandle ComputationBuilder::DotGeneral(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    const DotDimensionNumbers& dimension_numbers) {
-  OpRequest op_request;
-  DotRequest* request = op_request.mutable_dot_request();
-  *request->mutable_lhs() = lhs;
-  *request->mutable_rhs() = rhs;
-  *request->mutable_dimension_numbers() = dimension_numbers;
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Conv(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding) {
-  return ConvWithGeneralDimensions(
-      lhs, rhs, window_strides, padding,
-      CreateDefaultConvDimensionNumbers(window_strides.size()));
-}
-
-ComputationDataHandle ComputationBuilder::ConvWithGeneralPadding(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
-  return ConvGeneral(lhs, rhs, window_strides, padding,
-                     CreateDefaultConvDimensionNumbers(window_strides.size()));
-}
-
-bool ComputationBuilder::VerifyConvolution(
-    const Shape& lhs_shape, const Shape& rhs_shape,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
-  if (ShapeUtil::Rank(lhs_shape) != ShapeUtil::Rank(rhs_shape)) {
-    NoteError(
-        InvalidArgument("Convolution arguments must have same number of "
-                        "dimensions. Got: %s and %s",
-                        ShapeUtil::HumanString(lhs_shape).c_str(),
-                        ShapeUtil::HumanString(rhs_shape).c_str()));
-    return false;
-  }
-  int num_dims = ShapeUtil::Rank(lhs_shape);
-  if (num_dims < 2) {
-    NoteError(InvalidArgument(
-        "Convolution expects argument arrays with >= 3 dimensions. "
-        "Got: %s and %s",
-        ShapeUtil::HumanString(lhs_shape).c_str(),
-        ShapeUtil::HumanString(rhs_shape).c_str()));
-    return false;
-  }
-  int num_spatial_dims = num_dims - 2;
-
-  const auto check_spatial_dimensions =
-      [&](const char* const field_name,
-          const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>&
-              numbers) {
-        if (numbers.size() != num_spatial_dims) {
-          NoteError(InvalidArgument("Expected %d elements for %s, but got %d.",
-                                    num_spatial_dims, field_name,
-                                    numbers.size()));
-          return false;
-        }
-        for (int i = 0; i < numbers.size(); ++i) {
-          if (numbers.Get(i) < 0 || numbers.Get(i) >= num_dims) {
-            NoteError(
-                InvalidArgument("Convolution %s[%d] is out of bounds: %lld",
-                                field_name, i, numbers.Get(i)));
-            return false;
-          }
-        }
-        return true;
-      };
-  return check_spatial_dimensions(
-             "input_spatial_dimensions",
-             dimension_numbers.input_spatial_dimensions()) &&
-         check_spatial_dimensions(
-             "kernel_spatial_dimensions",
-             dimension_numbers.kernel_spatial_dimensions()) &&
-         check_spatial_dimensions(
-             "output_spatial_dimensions",
-             dimension_numbers.output_spatial_dimensions());
-}
-
-ComputationDataHandle ComputationBuilder::ConvWithGeneralDimensions(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  StatusOr<std::unique_ptr<Shape>> lhs_shape_or_status = GetShape(lhs);
-  if (!lhs_shape_or_status.ok()) {
-    return ComputationDataHandle();
-  }
-
-  StatusOr<std::unique_ptr<Shape>> rhs_shape_or_status = GetShape(rhs);
-  if (!rhs_shape_or_status.ok()) {
-    return ComputationDataHandle();
-  }
-
-  std::unique_ptr<Shape> lhs_shape = lhs_shape_or_status.ConsumeValueOrDie();
-  std::unique_ptr<Shape> rhs_shape = rhs_shape_or_status.ConsumeValueOrDie();
-
-  if (!VerifyConvolution(*lhs_shape, *rhs_shape, dimension_numbers)) {
-    NoteError(InternalError("failed to verify convolution"));
-    return ComputationDataHandle();
-  }
-
-  std::vector<int64> base_area_dimensions(
-      dimension_numbers.input_spatial_dimensions_size());
-  for (std::vector<int64>::size_type i = 0; i < base_area_dimensions.size();
-       ++i) {
-    base_area_dimensions[i] =
-        lhs_shape->dimensions(dimension_numbers.input_spatial_dimensions(i));
-  }
-
-  std::vector<int64> window_dimensions(
-      dimension_numbers.kernel_spatial_dimensions_size());
-  for (std::vector<int64>::size_type i = 0; i < window_dimensions.size(); ++i) {
-    window_dimensions[i] =
-        rhs_shape->dimensions(dimension_numbers.kernel_spatial_dimensions(i));
-  }
-
-  return ConvGeneral(lhs, rhs, window_strides,
-                     MakePadding(base_area_dimensions, window_dimensions,
-                                 window_strides, padding),
-                     dimension_numbers);
-}
-
-ComputationDataHandle ComputationBuilder::ConvGeneral(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
-  return ConvGeneralDilated(lhs, rhs, window_strides, padding, {}, {},
-                            dimension_numbers);
-}
-
-ComputationDataHandle ComputationBuilder::ConvGeneralDilated(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-    tensorflow::gtl::ArraySlice<int64> lhs_dilation,
-    tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  StatusOr<std::unique_ptr<Shape>> lhs_shape_or_status = GetShape(lhs);
-  if (!lhs_shape_or_status.ok()) {
-    return ComputationDataHandle();
-  }
-
-  StatusOr<std::unique_ptr<Shape>> rhs_shape_or_status = GetShape(rhs);
-  if (!rhs_shape_or_status.ok()) {
-    return ComputationDataHandle();
-  }
-
-  std::unique_ptr<Shape> lhs_shape = lhs_shape_or_status.ConsumeValueOrDie();
-  std::unique_ptr<Shape> rhs_shape = rhs_shape_or_status.ConsumeValueOrDie();
-  if (!VerifyConvolution(*lhs_shape, *rhs_shape, dimension_numbers)) {
-    // Error is recorded in VerifyConvolution.
-    return ComputationDataHandle();
-  }
-
-  std::vector<int64> window_dimensions(
-      dimension_numbers.kernel_spatial_dimensions_size());
-  for (std::vector<int64>::size_type i = 0; i < window_dimensions.size(); ++i) {
-    window_dimensions[i] =
-        rhs_shape->dimensions(dimension_numbers.kernel_spatial_dimensions(i));
-  }
-
-  OpRequest op_request;
-  ConvolveRequest* request = op_request.mutable_convolve_request();
-  *request->mutable_lhs() = lhs;
-  *request->mutable_rhs() = rhs;
-  *request->mutable_dimension_numbers() = dimension_numbers;
-
-  if (!MakeWindow(window_dimensions, window_strides, padding, lhs_dilation,
-                  rhs_dilation, request->mutable_window())) {
-    // Error is recorded in MakeWindow.
-    return ComputationDataHandle();
-  }
-
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Fft(
-    const ComputationDataHandle& operand, const FftType fft_type,
-    const tensorflow::gtl::ArraySlice<int64> fft_length) {
-  OpRequest op_request;
-  FftRequest* request = op_request.mutable_fft_request();
-  *request->mutable_operand() = operand;
-  request->set_fft_type(fft_type);
-  for (int64 dim_len : fft_length) {
-    request->add_fft_length(dim_len);
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Infeed(const Shape& shape,
-                                                 const string& config) {
-  OpRequest op_request;
-  InfeedRequest* request = op_request.mutable_infeed_request();
-  *request->mutable_shape() = shape;
-  *request->mutable_config() = config;
-  return RunOpAndParseResponse(&op_request);
-}
-
-void ComputationBuilder::Outfeed(const ComputationDataHandle& operand,
-                                 const Shape& shape_with_layout,
-                                 const string& outfeed_config) {
-  OpRequest op_request;
-  OutfeedRequest* request = op_request.mutable_outfeed_request();
-  request->set_outfeed_config(outfeed_config);
-  *request->mutable_operand() = operand;
-  *request->mutable_shape() = shape_with_layout;
-  RunOpAndNoteError(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Call(
-    const Computation& computation,
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands) {
-  OpRequest op_request;
-  CallRequest* request = op_request.mutable_call_request();
-  *request->mutable_to_apply() = computation.handle();
-  for (const ComputationDataHandle& operand : operands) {
-    *request->add_operands() = operand;
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::CustomCall(
-    const string& call_target_name,
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-    const Shape& shape) {
-  OpRequest op_request;
-  CustomCallRequest* request = op_request.mutable_custom_call_request();
-  request->set_call_target_name(call_target_name);
-  for (const ComputationDataHandle& operand : operands) {
-    *request->add_operands() = operand;
-  }
-  *request->mutable_shape() = shape;
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::HostCompute(
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-    const string& channel_name, int64 cost_estimate_ns, const Shape& shape) {
-  OpRequest op_request;
-  HostComputeRequest* request = op_request.mutable_host_compute_request();
-  for (const ComputationDataHandle& operand : operands) {
-    *request->add_operands() = operand;
-  }
-  *request->mutable_shape() = shape;
-  request->set_channel_name(channel_name);
-  request->set_cost_estimate_ns(cost_estimate_ns);
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Complex(
-    const ComputationDataHandle& real, const ComputationDataHandle& imag,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_COMPLEX, real, imag, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Conj(
-    const ComputationDataHandle& operand) {
-  return Complex(Real(operand), Neg(Imag(operand)));
-}
-
-ComputationDataHandle ComputationBuilder::Add(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_ADD, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Sub(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_SUB, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Mul(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_MUL, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Div(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_DIV, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Rem(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_REM, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Max(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_MAX, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Min(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_MIN, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::And(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_AND, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Or(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_OR, lhs, rhs, broadcast_dimensions);
-}
-
-// TODO(b/65209188): Create a dedicated lowering for Xor
-ComputationDataHandle ComputationBuilder::Xor(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return Or(And(Not(lhs), rhs, broadcast_dimensions),
-            And(lhs, Not(rhs), broadcast_dimensions));
-}
-
-ComputationDataHandle ComputationBuilder::Not(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_NOT, operand);
-}
-
-ComputationDataHandle ComputationBuilder::ShiftLeft(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_SHIFT_LEFT, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::ShiftRightArithmetic(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_SHIFT_RIGHT_ARITHMETIC, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::ShiftRightLogical(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_SHIFT_RIGHT_LOGICAL, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Abs(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_ABS, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Atan2(
-    const ComputationDataHandle& y, const ComputationDataHandle& x,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_ATAN2, y, x, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::Exp(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_EXP, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Floor(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_FLOOR, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Ceil(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_CEIL, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Round(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_ROUND_NEAREST_AFZ, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Log(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_LOG, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Sign(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_SIGN, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Cos(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_COS, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Sin(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_SIN, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Tanh(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_TANH, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Real(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_REAL, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Imag(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_IMAG, operand);
-}
-
-ComputationDataHandle ComputationBuilder::IsFinite(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_IS_FINITE, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Transpose(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> permutation) {
-  OpRequest op_request;
-  TransposeRequest* request = op_request.mutable_transpose_request();
-  *request->mutable_operand() = operand;
-  for (int64 dimension : permutation) {
-    request->add_dimensions(dimension);
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Rev(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> dimensions) {
-  OpRequest op_request;
-  ReverseRequest* request = op_request.mutable_reverse_request();
-  *request->mutable_operand() = operand;
-  for (int64 dimension : dimensions) {
-    request->add_dimensions(dimension);
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Sort(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_SORT, operand);
-}
-
-ComputationDataHandle ComputationBuilder::SqrtF32(
-    const ComputationDataHandle& operand) {
-  return BinaryOp(BINOP_POW, operand, ConstantR0<float>(0.5),
-                  /*broadcast_dimensions=*/{});
-}
-
-ComputationDataHandle ComputationBuilder::Pow(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return BinaryOp(BINOP_POW, lhs, rhs, broadcast_dimensions);
-}
-
-ComputationDataHandle ComputationBuilder::ConvertElementType(
-    const ComputationDataHandle& operand, PrimitiveType new_element_type) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  StatusOr<std::unique_ptr<Shape>> shape_status = GetShape(operand);
-  if (!shape_status.ok()) {
-    return ComputationDataHandle();
-  }
-  std::unique_ptr<Shape> original = shape_status.ConsumeValueOrDie();
-
-  OpRequest op_request;
-  ConvertRequest* request = op_request.mutable_convert_request();
-  *request->mutable_operand() = operand;
-  request->set_new_element_type(new_element_type);
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::BitcastConvertType(
-    const ComputationDataHandle& operand, PrimitiveType new_element_type) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  StatusOr<std::unique_ptr<Shape>> shape_status = GetShape(operand);
-  if (!shape_status.ok()) {
-    return ComputationDataHandle();
-  }
-  std::unique_ptr<Shape> original = shape_status.ConsumeValueOrDie();
-
-  OpRequest op_request;
-  ConvertRequest* request = op_request.mutable_bitcast_convert_request();
-  *request->mutable_operand() = operand;
-  request->set_new_element_type(new_element_type);
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::SquareF32(
-    const ComputationDataHandle& operand) {
-  return BinaryOp(BINOP_POW, operand, ConstantR0<float>(2.0),
-                  /*broadcast_dimensions=*/{});
-}
-
-ComputationDataHandle ComputationBuilder::ReciprocalF32(
-    const ComputationDataHandle& operand) {
-  return BinaryOp(BINOP_POW, operand, ConstantR0<float>(-1.0),
-                  /*broadcast_dimensions=*/{});
-}
-
-ComputationDataHandle ComputationBuilder::Neg(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_NEGATE, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Clz(
-    const ComputationDataHandle& operand) {
-  return UnaryOp(UNOP_CLZ, operand);
-}
-
-ComputationDataHandle ComputationBuilder::Clamp(
-    const ComputationDataHandle& min, const ComputationDataHandle& operand,
-    const ComputationDataHandle& max) {
-  return TernaryOp(TRIOP_CLAMP, min, operand, max);
-}
-
-ComputationDataHandle ComputationBuilder::UnaryOp(
-    UnaryOperation unop, const ComputationDataHandle& operand) {
-  OpRequest op_request;
-  UnaryOpRequest* request = op_request.mutable_unary_op_request();
-  request->set_unop(unop);
-  *request->mutable_operand() = operand;
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::BinaryOp(
-    BinaryOperation binop, const ComputationDataHandle& lhs,
-    const ComputationDataHandle& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  OpRequest op_request;
-  BinaryOpRequest* request = op_request.mutable_binary_op_request();
-  request->set_binop(binop);
-  *request->mutable_lhs() = lhs;
-  *request->mutable_rhs() = rhs;
-  for (int64 dimension : broadcast_dimensions) {
-    request->add_broadcast_dimensions(dimension);
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::RngOp(
-    RandomDistribution distribution,
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> parameters,
-    const Shape& shape) {
-  OpRequest op_request;
-  RngRequest* request = op_request.mutable_rng_request();
-  request->set_distribution(distribution);
-  for (const ComputationDataHandle& param : parameters) {
-    *request->add_parameter() = param;
-  }
-  *request->mutable_shape() = shape;
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::TernaryOp(
-    TernaryOperation triop, const ComputationDataHandle& lhs,
-    const ComputationDataHandle& rhs, const ComputationDataHandle& ehs) {
-  OpRequest op_request;
-  TernaryOpRequest* request = op_request.mutable_ternary_op_request();
-  request->set_triop(triop);
-  *request->mutable_lhs() = lhs;
-  *request->mutable_rhs() = rhs;
-  *request->mutable_ehs() = ehs;
-  return RunOpAndParseResponse(&op_request);
-}
-
-Status ComputationBuilder::SetReturnValue(
-    const ComputationDataHandle& operand) {
-  TF_RETURN_IF_ERROR(first_error_);
-
-  SetReturnValueRequest request;
-  *request.mutable_computation() = computation_.handle();
-  *request.mutable_operand() = operand;
-
-  SetReturnValueResponse response;
-
-  VLOG(2) << "making set-handle-to-execute request";
-  Status s = client_->stub()->SetReturnValue(&request, &response);
-  VLOG(2) << "done with request";
-
-  if (!s.ok()) {
-    NoteError(s);
-    return first_error_;
-  }
-
-  return Status::OK();
-}
-
-StatusOr<bool> ComputationBuilder::IsConstant(
-    const ComputationDataHandle& operand, int64 num_parameters) {
-  TF_RETURN_IF_ERROR(first_error_);
-
-  IsConstantRequest request;
-  *request.mutable_computation() = computation_.handle();
-  *request.mutable_operand() = operand;
-  request.set_num_parameters(num_parameters);
-  IsConstantResponse response;
-
-  VLOG(2) << "making IsConstant request";
-  Status s = client_->stub()->IsConstant(&request, &response);
-  VLOG(2) << "done with request";
-
-  if (!s.ok()) {
-    return s;
-  }
-  return response.is_constant();
-}
-
-StatusOr<std::unique_ptr<Literal>> ComputationBuilder::ComputeConstant(
-    const ComputationDataHandle& operand, const Layout* output_layout,
-    tensorflow::gtl::ArraySlice<Literal> parameters) {
-  TF_RETURN_IF_ERROR(first_error_);
-
-  ComputeConstantRequest request;
-  *request.mutable_computation() = computation_.handle();
-  *request.mutable_operand() = operand;
-  if (output_layout != nullptr) {
-    *request.mutable_output_layout() = *output_layout;
-  }
-  for (const auto& param : parameters) {
-    *request.add_parameters() = param.ToProto();
-  }
-
-  ComputeConstantResponse response;
-
-  VLOG(2) << "making compute-constant request";
-  Status s = client_->stub()->ComputeConstant(&request, &response);
-  VLOG(2) << "done with request";
-
-  if (!s.ok()) {
-    return s;
-  }
-
-  VLOG(3) << "ComputeConstant: {" << response.DebugString() << "}";
-
-  if (!response.has_literal()) {
-    return InternalError(
-        "no computed literal in the provided response in ComputeConstant "
-        "request");
-  }
-  return Literal::CreateFromProto(response.literal());
-}
-
-ComputationDataHandle ComputationBuilder::Map(
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-    const Computation& computation,
-    tensorflow::gtl::ArraySlice<int64> dimensions,
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> static_operands) {
-  OpRequest op_request;
-  MapRequest* request = op_request.mutable_map_request();
-  for (const ComputationDataHandle& operand : operands) {
-    *request->add_operands() = operand;
-  }
-  *request->mutable_to_apply() = computation.handle();
-  for (int64 dimension : dimensions) {
-    request->add_dimensions(dimension);
-  }
-  for (const ComputationDataHandle& sop : static_operands) {
-    *request->add_static_operands() = sop;
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::RngNormal(
-    const ComputationDataHandle& mu, const ComputationDataHandle& sigma,
-    const Shape& shape) {
-  return RngOp(RandomDistribution::RNG_NORMAL, {mu, sigma}, shape);
-}
-
-ComputationDataHandle ComputationBuilder::RngUniform(
-    const ComputationDataHandle& a, const ComputationDataHandle& b,
-    const Shape& shape) {
-  return RngOp(RandomDistribution::RNG_UNIFORM, {a, b}, shape);
-}
-
-ComputationDataHandle ComputationBuilder::While(
-    const Computation& condition, const Computation& body,
-    const ComputationDataHandle& init) {
-  OpRequest op_request;
-  WhileRequest* request = op_request.mutable_while_request();
-  *request->mutable_condition() = condition.handle();
-  *request->mutable_body() = body.handle();
-  *request->mutable_init() = init;
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Gather(
-    const ComputationDataHandle& input,
-    const ComputationDataHandle& gather_indices,
-    const GatherDimensionNumbers& dimension_numbers,
-    tensorflow::gtl::ArraySlice<int64> window_bounds) {
-  OpRequest op_request;
-  GatherRequest* gather_request = op_request.mutable_gather_request();
-  *gather_request->mutable_input() = input;
-  *gather_request->mutable_gather_indices() = gather_indices;
-  *gather_request->mutable_dimension_numbers() = dimension_numbers;
-  for (int64 window_bound : window_bounds) {
-    gather_request->add_window_bounds(window_bound);
-  }
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Conditional(
-    const ComputationDataHandle& predicate,
-    const ComputationDataHandle& true_operand,
-    const Computation& true_computation,
-    const ComputationDataHandle& false_operand,
-    const Computation& false_computation) {
-  OpRequest op_request;
-  ConditionalRequest* request = op_request.mutable_conditional_request();
-  *request->mutable_predicate() = predicate;
-  *request->mutable_true_operand() = true_operand;
-  *request->mutable_true_computation() = true_computation.handle();
-  *request->mutable_false_operand() = false_operand;
-  *request->mutable_false_computation() = false_computation.handle();
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Reduce(
-    const ComputationDataHandle& operand,
-    const ComputationDataHandle& init_value, const Computation& computation,
-    tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce) {
-  OpRequest op_request;
-  ReduceRequest* request = op_request.mutable_reduce_request();
-  *request->mutable_operand() = operand;
-  *request->mutable_init_value() = init_value;
-  for (int64 dimension : dimensions_to_reduce) {
-    request->add_dimensions(dimension);
-  }
-  *request->mutable_to_apply() = computation.handle();
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::ReduceAll(
-    const ComputationDataHandle& operand,
-    const ComputationDataHandle& init_value, const Computation& computation) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  StatusOr<std::unique_ptr<Shape>> shape = GetShape(operand);
-  if (!shape.ok()) {
-    return ComputationDataHandle();
-  }
-
-  std::vector<int64> all_dimnos(ShapeUtil::Rank(*shape.ValueOrDie()));
-  std::iota(all_dimnos.begin(), all_dimnos.end(), 0);
-  return Reduce(operand, init_value, computation, all_dimnos);
-}
-
-ComputationDataHandle ComputationBuilder::ReduceWindow(
-    const ComputationDataHandle& operand,
-    const ComputationDataHandle& init_value, const Computation& computation,
-    tensorflow::gtl::ArraySlice<int64> window_dimensions,
-    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding) {
-  if (!first_error_.ok()) {
-    return ComputationDataHandle();
-  }
-
-  StatusOr<std::unique_ptr<Shape>> shape = GetShape(operand);
-  if (!shape.ok()) {
-    return ComputationDataHandle();
-  }
-
-  Status padding_valid =
-      ValidatePaddingValues(AsInt64Slice(shape.ValueOrDie()->dimensions()),
-                            window_dimensions, window_strides);
-  if (!padding_valid.ok()) {
-    first_error_ = padding_valid;
-    return ComputationDataHandle();
-  }
-
-  std::vector<std::pair<int64, int64>> padding_values =
-      MakePadding(AsInt64Slice(shape.ValueOrDie()->dimensions()),
-                  window_dimensions, window_strides, padding);
-  return ReduceWindowWithGeneralPadding(operand, init_value, computation,
-                                        window_dimensions, window_strides,
-                                        padding_values);
-}
-
-ComputationDataHandle ComputationBuilder::ReduceWindowWithGeneralPadding(
-    const ComputationDataHandle& operand,
-    const ComputationDataHandle& init_value, const Computation& computation,
-    tensorflow::gtl::ArraySlice<int64> window_dimensions,
-    tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
-  OpRequest op_request;
-  ReduceWindowRequest* request = op_request.mutable_reduce_window_request();
-  *request->mutable_operand() = operand;
-  *request->mutable_to_apply() = computation.handle();
-  *request->mutable_init_value() = init_value;
-
-  if (!MakeWindow(window_dimensions, window_strides, padding, {}, {},
-                  request->mutable_window())) {
-    NoteError(InternalError("failed to make window"));
-    return ComputationDataHandle();
-  }
-
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::BatchNormTraining(
-    const ComputationDataHandle& operand, const ComputationDataHandle& scale,
-    const ComputationDataHandle& offset, float epsilon, int64 feature_index) {
-  OpRequest op_request;
-  BatchNormTrainingRequest* request =
-      op_request.mutable_batch_norm_training_request();
-  *request->mutable_operand() = operand;
-  *request->mutable_scale() = scale;
-  *request->mutable_offset() = offset;
-  request->set_epsilon(epsilon);
-  request->set_feature_index(feature_index);
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::BatchNormInference(
-    const ComputationDataHandle& operand, const ComputationDataHandle& scale,
-    const ComputationDataHandle& offset, const ComputationDataHandle& mean,
-    const ComputationDataHandle& variance, float epsilon, int64 feature_index) {
-  OpRequest op_request;
-  BatchNormInferenceRequest* request =
-      op_request.mutable_batch_norm_inference_request();
-  *request->mutable_operand() = operand;
-  *request->mutable_scale() = scale;
-  *request->mutable_offset() = offset;
-  *request->mutable_mean() = mean;
-  *request->mutable_variance() = variance;
-  request->set_epsilon(epsilon);
-  request->set_feature_index(feature_index);
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::BatchNormGrad(
-    const ComputationDataHandle& operand, const ComputationDataHandle& scale,
-    const ComputationDataHandle& batch_mean,
-    const ComputationDataHandle& batch_var,
-    const ComputationDataHandle& grad_output, float epsilon,
-    int64 feature_index) {
-  OpRequest op_request;
-  BatchNormGradRequest* request = op_request.mutable_batch_norm_grad_request();
-  *request->mutable_operand() = operand;
-  *request->mutable_scale() = scale;
-  *request->mutable_mean() = batch_mean;
-  *request->mutable_variance() = batch_var;
-  *request->mutable_grad_output() = grad_output;
-  request->set_epsilon(epsilon);
-  request->set_feature_index(feature_index);
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::CrossReplicaSum(
-    const ComputationDataHandle& operand) {
-  OpRequest op_request;
-  CrossReplicaSumRequest* request =
-      op_request.mutable_cross_replica_sum_request();
-  *request->mutable_operand() = operand;
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::SelectAndScatter(
-    const ComputationDataHandle& operand, const Computation& select,
-    tensorflow::gtl::ArraySlice<int64> window_dimensions,
-    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-    const ComputationDataHandle& source,
-    const ComputationDataHandle& init_value, const Computation& scatter) {
-  if (!first_error_.ok()) {
-    return ComputationDataHandle();
-  }
-
-  StatusOr<std::unique_ptr<Shape>> shape = GetShape(operand);
-  if (!shape.ok()) {
-    return ComputationDataHandle();
-  }
-  return SelectAndScatterWithGeneralPadding(
-      operand, select, window_dimensions, window_strides,
-      MakePadding(AsInt64Slice(shape.ValueOrDie()->dimensions()),
-                  window_dimensions, window_strides, padding),
-      source, init_value, scatter);
-}
-
-ComputationDataHandle ComputationBuilder::SelectAndScatterWithGeneralPadding(
-    const ComputationDataHandle& operand, const Computation& select,
-    tensorflow::gtl::ArraySlice<int64> window_dimensions,
-    tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-    const ComputationDataHandle& source,
-    const ComputationDataHandle& init_value, const Computation& scatter) {
-  OpRequest op_request;
-  SelectAndScatterRequest* request =
-      op_request.mutable_select_and_scatter_request();
-  *request->mutable_operand() = operand;
-  *request->mutable_select() = select.handle();
-  *request->mutable_source() = source;
-  *request->mutable_init_value() = init_value;
-  *request->mutable_scatter() = scatter.handle();
-
-  if (!MakeWindow(window_dimensions, window_strides, padding, {}, {},
-                  request->mutable_window())) {
-    NoteError(InternalError("failed to make window"));
-    return ComputationDataHandle();
-  }
-
-  return RunOpAndParseResponse(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::ReducePrecision(
-    const ComputationDataHandle& operand, const int exponent_bits,
-    const int mantissa_bits) {
-  OpRequest op_request;
-  ReducePrecisionRequest* request =
-      op_request.mutable_reduce_precision_request();
-  *request->mutable_operand() = operand;
-  request->set_exponent_bits(exponent_bits);
-  request->set_mantissa_bits(mantissa_bits);
-  return RunOpAndParseResponse(&op_request);
-}
-
-void ComputationBuilder::Send(const ComputationDataHandle& operand,
-                              const ChannelHandle& handle) {
-  OpRequest op_request;
-  SendRequest* request = op_request.mutable_send_request();
-  *request->mutable_operand() = operand;
-  *request->mutable_channel_handle() = handle;
-  *op_request.mutable_computation() = computation_.handle();
-  RunOpAndNoteError(&op_request);
-}
-
-ComputationDataHandle ComputationBuilder::Recv(const Shape& shape,
-                                               const ChannelHandle& handle) {
-  OpRequest op_request;
-  RecvRequest* request = op_request.mutable_recv_request();
-  *request->mutable_shape() = shape;
-  *request->mutable_channel_handle() = handle;
-  return RunOpAndParseResponse(&op_request);
-}
-
-Computation ComputationBuilder::BuildAndNoteError() {
-  DCHECK(parent_builder_ != nullptr);
-  auto build_status = Build();
-  if (!build_status.ok()) {
-    parent_builder_->NoteError(
-        AddStatus(build_status.status(),
-                  tensorflow::strings::StrCat("error from: ", name_)));
-    return Computation();
-  }
-  return build_status.ConsumeValueOrDie();
-}
-
-StatusOr<Computation> ComputationBuilder::Build() {
-  if (!first_error_.ok()) {
-    string backtrace;
-    first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
-    return AppendStatus(first_error_, backtrace);
-  }
-
-  if (computation_.IsNull()) {
-    return FailedPrecondition("no computation was built");
-  }
-
-  return {std::move(computation_)};
-}
-
-/* static */ ConvolutionDimensionNumbers
-ComputationBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) {
-  ConvolutionDimensionNumbers dimension_numbers;
-  dimension_numbers.set_input_batch_dimension(kConvBatchDimension);
-  dimension_numbers.set_input_feature_dimension(kConvFeatureDimension);
-  dimension_numbers.set_output_batch_dimension(kConvBatchDimension);
-  dimension_numbers.set_output_feature_dimension(kConvFeatureDimension);
-  dimension_numbers.set_kernel_output_feature_dimension(
-      kConvKernelOutputDimension);
-  dimension_numbers.set_kernel_input_feature_dimension(
-      kConvKernelInputDimension);
-  for (int i = 0; i < num_spatial_dims; ++i) {
-    dimension_numbers.add_input_spatial_dimensions(i + 2);
-    dimension_numbers.add_kernel_spatial_dimensions(i + 2);
-    dimension_numbers.add_output_spatial_dimensions(i + 2);
-  }
-  return dimension_numbers;
-}
-
-/* static */ StatusOr<ConvolutionDimensionNumbers>
-ComputationBuilder::CreateConvDimensionNumbers(
-    int64 input_batch, int64 input_feature, int64 input_first_spatial,
-    int64 input_second_spatial, int64 output_batch, int64 output_feature,
-    int64 output_first_spatial, int64 output_second_spatial,
-    int64 kernel_output_feature, int64 kernel_input_feature,
-    int64 kernel_first_spatial, int64 kernel_second_spatial) {
-  if (std::set<int64>({input_batch, input_feature, input_first_spatial,
-                       input_second_spatial})
-          .size() != 4) {
-    return FailedPrecondition(
-        "dimension numbers for the input are not unique: (%lld, %lld, %lld, "
-        "%lld)",
-        input_batch, input_feature, input_first_spatial, input_second_spatial);
-  }
-  if (std::set<int64>({kernel_output_feature, kernel_input_feature,
-                       kernel_first_spatial, kernel_second_spatial})
-          .size() != 4) {
-    return FailedPrecondition(
-        "dimension numbers for the weight are not unique: (%lld, %lld, %lld, "
-        "%lld)",
-        kernel_output_feature, kernel_input_feature, kernel_first_spatial,
-        kernel_second_spatial);
-  }
-  if (std::set<int64>({output_batch, output_feature, output_first_spatial,
-                       output_second_spatial})
-          .size() != 4) {
-    return FailedPrecondition(
-        "dimension numbers for the output are not unique: (%lld, %lld, %lld, "
-        "%lld)",
-        output_batch, output_feature, output_first_spatial,
-        output_second_spatial);
-  }
-  ConvolutionDimensionNumbers dimension_numbers;
-  dimension_numbers.set_input_batch_dimension(input_batch);
-  dimension_numbers.set_input_feature_dimension(input_feature);
-  dimension_numbers.add_input_spatial_dimensions(input_first_spatial);
-  dimension_numbers.add_input_spatial_dimensions(input_second_spatial);
-  dimension_numbers.set_kernel_output_feature_dimension(kernel_output_feature);
-  dimension_numbers.set_kernel_input_feature_dimension(kernel_input_feature);
-  dimension_numbers.add_kernel_spatial_dimensions(kernel_first_spatial);
-  dimension_numbers.add_kernel_spatial_dimensions(kernel_second_spatial);
-  dimension_numbers.set_output_batch_dimension(output_batch);
-  dimension_numbers.set_output_feature_dimension(output_feature);
-  dimension_numbers.add_output_spatial_dimensions(output_first_spatial);
-  dimension_numbers.add_output_spatial_dimensions(output_second_spatial);
-  return dimension_numbers;
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
deleted file mode 100644
index ac1eb915cc52df94df71631a7e80de9095f7fafb..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ /dev/null
@@ -1,1067 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_BUILDER_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_BUILDER_H_
-
-#include <functional>
-#include <initializer_list>
-#include <memory>
-#include <string>
-#include <utility>
-
-#include "tensorflow/compiler/xla/array.h"
-#include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/array3d.h"
-#include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/bitmap.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/stacktrace.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-
-// Wraps an XLA client with a convenient interface for building up
-// computations. Any errors encountered in building up the computation are
-// deferred from being handled until Build() is called.
-//
-// Thread-compatible.
-//
-// TODO(b/74197823): Deprecated. Use XlaBuilder instead.
-class ComputationBuilder {
- public:
-  // client: client in which to build the computation.
-  // computation_name: name to use for the built computation.
-  ComputationBuilder(Client* client, const string& computation_name);
-
-  ~ComputationBuilder();
-
-  // Returns the client the builder was initialized with.
-  Client* client() const { return client_; }
-
-  // Returns the computation name.
-  const string& name() const { return name_; }
-
-  // Sets OpMetadata that will be added to all instructions until cleared.
-  //
-  // OpMetadata is often applied to a series of XLA HLO instructions. As a
-  // result, OpMetadata is set on the Computation Builder. All subsequent
-  // instructions generated via this Computation Builder will have the same
-  // OpMetadata attached until a call to ClearOpMetadata.
-  void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
-
-  // Clears the HloMetadata state.
-  void ClearOpMetadata() { metadata_.Clear(); }
-
-  // Sets an OpSharding that will be attached to all instructions until cleared.
-  void SetSharding(const OpSharding& sharding) { sharding_ = sharding; }
-
-  // Clears the sharding. Ops will be sharded according to the default placement
-  // policy.
-  void ClearSharding() { sharding_ = tensorflow::gtl::nullopt; }
-
-  // Returns the OpSharding that will be attached to all instructions.
-  const tensorflow::gtl::optional<OpSharding>& sharding() const {
-    return sharding_;
-  }
-
-  // Sets the builder to a mode where it will die immediately when an error is
-  // encountered, rather than producing it in a deferred fashion when Build() is
-  // called (which is the default).
-  void set_die_immediately_on_error(bool enabled) {
-    die_immediately_on_error_ = enabled;
-  }
-
-  // Enqueues a "retrieve parameter value" instruction for a parameter that was
-  // passed to the computation.
-  ComputationDataHandle Parameter(int64 parameter_number, const Shape& shape,
-                                  const string& name);
-
-  // Retrieves the (inferred) shape of the operand in the computation.
-  StatusOr<std::unique_ptr<Shape>> GetShape(
-      const ComputationDataHandle& operand);
-
-  // Retrieves the (inferred) result for the current computation's shape.
-  StatusOr<ProgramShape> GetProgramShape();
-
-  // Enqueues a constant with the value of the given literal onto the
-  // computation.
-  ComputationDataHandle ConstantLiteral(const Literal& literal);
-
-  // Enqueues a constant onto the computation. Methods are templated on the
-  // native host type (NativeT) which corresponds to a specific XLA
-  // PrimitiveType as given in the following table:
-  //
-  //  Native Type   PrimitiveType
-  // -----------------------------
-  //   bool           PRED
-  //   int32          S32
-  //   int64          S64
-  //   uint32         U32
-  //   uint64         U64
-  //   float          F32
-  //   double         F64
-  //
-  // Note: not all primitive types defined in xla_data.proto have a
-  // corresponding native type yet.
-  template <typename NativeT>
-  ComputationDataHandle ConstantR0(NativeT value);
-  template <typename NativeT>
-  ComputationDataHandle ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values);
-  ComputationDataHandle ConstantR1(const tensorflow::core::Bitmap& values);
-  template <typename NativeT>
-  ComputationDataHandle ConstantR2(
-      std::initializer_list<std::initializer_list<NativeT>> values);
-  template <typename NativeT>
-  ComputationDataHandle ConstantFromArrayWithLayout(
-      const Array<NativeT>& values, const Layout& layout);
-  template <typename NativeT>
-  ComputationDataHandle ConstantFromArray(const Array<NativeT>& values);
-  template <typename NativeT>
-  ComputationDataHandle ConstantR2FromArray2DWithLayout(
-      const Array2D<NativeT>& values, const Layout& layout);
-  template <typename NativeT>
-  ComputationDataHandle ConstantR2FromArray2D(const Array2D<NativeT>& values);
-  template <typename NativeT>
-  ComputationDataHandle ConstantR3FromArray3DWithLayout(
-      const Array3D<NativeT>& values, const Layout& layout);
-  template <typename NativeT>
-  ComputationDataHandle ConstantR3FromArray3D(const Array3D<NativeT>& values);
-  template <typename NativeT>
-  ComputationDataHandle ConstantR4FromArray4DWithLayout(
-      const Array4D<NativeT>& values, const Layout& layout);
-  template <typename NativeT>
-  ComputationDataHandle ConstantR4FromArray4D(const Array4D<NativeT>& values);
-
-  // Enqueues a rank one constant (vector) onto the computation. The vector has
-  // size 'length' and every element has the value 'value'.
-  template <typename NativeT>
-  ComputationDataHandle ConstantR1(int64 length, NativeT value);
-
-  // Adds dimensions to an array by duplicating the data in the array.
-  //
-  // The new dimensions are inserted on the left, i.e. if
-  // broadcast_sizes has values {a0, ..., aN} and the operand shape
-  // has dimensions {b0, ..., bM} then the shape of the output has
-  // dimensions {a0, ..., aN, b0, ..., bM}.
-  //
-  // The new dimensions index into copies of the operand, i.e.
-  //
-  //   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
-  ComputationDataHandle Broadcast(
-      const ComputationDataHandle& operand,
-      tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
-
-  // Enqueues a pad operation onto the computation that pads the given value on
-  // the edges as well as between the elements of the input. padding_config
-  // specifies the padding amount for each dimension.
-  ComputationDataHandle Pad(const ComputationDataHandle& operand,
-                            const ComputationDataHandle& padding_value,
-                            const PaddingConfig& padding_config);
-
-  // Enqueues an operation onto the computation that flattens the operand based
-  // on the dimension order (major/slowest-varying to minor/fastest-varying)
-  // given, followed by reshaping it into the shape with the given dimension
-  // sizes (also major to minor). Conceptually, this is a limited form of
-  // "shape casting".
-  ComputationDataHandle Reshape(const ComputationDataHandle& operand,
-                                tensorflow::gtl::ArraySlice<int64> dimensions,
-                                tensorflow::gtl::ArraySlice<int64> new_sizes);
-
-  // Enqueues an operation onto the computation that collapses the operand, from
-  // first to last dimension (C order), then reshapes it to the given dimension
-  // sizes. Conceptually, this is a limited form of "shape casting".
-  ComputationDataHandle Reshape(const ComputationDataHandle& operand,
-                                tensorflow::gtl::ArraySlice<int64> new_sizes);
-
-  // Wrapper for Reshape.
-  // Enqueues an operation to collapse the provided dimensions; e.g. an
-  // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
-  // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
-  // be a consecutive, in-order subsequence of the operand dimensions.
-  //
-  // Note that collapsing a single dimension does nothing:
-  //
-  //    {256} collapsing {0} => {256}
-  //    {1} collapsing {0} => {1}
-  //
-  // Collapsing multiple dimensions produces a single result dimension:
-  //
-  //    {256, 2} collapsing {0,1} => {512}
-  //    {256, 2, 3} collapsing {0,1} => {512, 3}
-  //
-  // This could potentially cause data to be moved -- it provides a more
-  // structured form of reshaping than an arbitrary Reshape operation.
-  ComputationDataHandle Collapse(const ComputationDataHandle& operand,
-                                 tensorflow::gtl::ArraySlice<int64> dimensions);
-
-  // Enqueues a slice operation onto the computation that slices the operand
-  // from the start indices to the limit indices; e.g.
-  //
-  //        x
-  //   [ 0 1 2 3 ]
-  // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
-  //   [ 8 9 a b ]
-  //
-  // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
-  // range notation.
-  // The strides parameter determines the stride over the slice
-  ComputationDataHandle Slice(const ComputationDataHandle& operand,
-                              tensorflow::gtl::ArraySlice<int64> start_indices,
-                              tensorflow::gtl::ArraySlice<int64> limit_indices,
-                              tensorflow::gtl::ArraySlice<int64> strides);
-
-  // Enqueues a slice operation in a given dimension, taking all other
-  // dimensions as they are; e.g. if dimno is 1 from start_index 2 to
-  // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
-  // for:
-  //
-  //  array[:, 2:4:1, :]
-  ComputationDataHandle SliceInDim(const ComputationDataHandle& operand,
-                                   int64 start_index, int64 limit_index,
-                                   int64 stride, int64 dimno);
-
-  // Enqueues a slice operation onto the computation that slices the 'operand'
-  // from dynamic start indices which are passed in 'start_indices'.
-  // The size of the slice in each dimension is passed in 'slice_sizes',
-  // which specify the end point of exclusive slice intervals in each
-  // dimension [start, start + size).
-  // The shape of 'start_indices' must be rank == 1, with dimension size
-  // equal to the rank of the 'operand'.
-  // Slice index calculations are computed modulo input dimension sizes to
-  // prevent dynamic start indices from generating out-of-bound array accesses.
-  ComputationDataHandle DynamicSlice(
-      const ComputationDataHandle& operand,
-      const ComputationDataHandle& start_indices,
-      tensorflow::gtl::ArraySlice<int64> slice_sizes);
-
-  // Enqueues a dynamic update slice operation onto the computation, which
-  // updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
-  // The shape of 'update' determines the shape of the slice of 'operand'
-  // which is updated.
-  // The indices specified in 'start_indices' specify the offset of the slice
-  // of 'operand' which is updated.
-  //
-  //               update = {10, 11} // calculated at runtime.
-  //   [1 2 3]     start  = {1, 1}   // calculated at runtime.  [1 2  3 ]
-  //   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
-  //   [7 8 9]                                                  [7 8  9 ]
-  //
-  // The shape of 'start_indices' must be rank == 1, with dimension size
-  // equal to the rank of the 'operand'.
-  // Slice index calculations are computed modulo update dimension sizes to
-  // prevent dynamic start indices from generating out-of-bound array accesses.
-  ComputationDataHandle DynamicUpdateSlice(
-      const ComputationDataHandle& operand, const ComputationDataHandle& update,
-      const ComputationDataHandle& start_indices);
-
-  // Enqueues a concatenate instruction onto the computation. 'operands' must
-  // have >= 1 entry.
-  ComputationDataHandle ConcatInDim(
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-      int64 dimension);
-
-  // Enqueue a tracing operation onto the computation; the computation will emit
-  // a logging message with the operand.
-  void Trace(const string& tag, const ComputationDataHandle& operand);
-
-  // Enqueues a conditional-move-like select operation onto the computation;
-  // predicated on pred, selects between on_true and on_false.
-  ComputationDataHandle Select(const ComputationDataHandle& pred,
-                               const ComputationDataHandle& on_true,
-                               const ComputationDataHandle& on_false);
-
-  // Enqueues a tuple-creation instruction onto the computation.
-  ComputationDataHandle Tuple(
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> elements);
-
-  // Enqueues a tuple-element-get instruction onto the computation.
-  ComputationDataHandle GetTupleElement(const ComputationDataHandle& tuple_data,
-                                        int64 index);
-
-  // Enqueues an equal-to comparison instruction onto the computation.
-  ComputationDataHandle Eq(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a not-equal comparison instruction onto the computation.
-  ComputationDataHandle Ne(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a greater-or-equal comparison instruction onto the computation.
-  ComputationDataHandle Ge(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a greater-than comparison instruction onto the computation.
-  ComputationDataHandle Gt(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a less-than comparison instruction onto the computation.
-  ComputationDataHandle Lt(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a less-or-equal comparison instruction onto the computation.
-  ComputationDataHandle Le(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a dot instruction onto the computation.
-  ComputationDataHandle Dot(const ComputationDataHandle& lhs,
-                            const ComputationDataHandle& rhs);
-
-  // Enqueues a general dot instruction onto the computation.
-  ComputationDataHandle DotGeneral(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      const DotDimensionNumbers& dimension_numbers);
-
-  // Default dimension numbers used for a 2D convolution.
-  static constexpr int64 kConvBatchDimension = 0;
-  static constexpr int64 kConvFeatureDimension = 1;
-  static constexpr int64 kConvFirstSpatialDimension = 2;
-  static constexpr int64 kConvSecondSpatialDimension = 3;
-  static constexpr int64 kConvKernelOutputDimension = 0;
-  static constexpr int64 kConvKernelInputDimension = 1;
-  static constexpr int64 kConvKernelFirstSpatialDimension = 2;
-  static constexpr int64 kConvKernelSecondSpatialDimension = 3;
-
-  // Creates a default ConvolutionDimensionNumbers. For a 2D convolution, for
-  // the input operand {batch, feature, height, width} = {0, 1, 2, 3} and for
-  // the kernel operand
-  // {output_feature, input_feature, height, width} = {0, 1, 2, 3}.
-  static ConvolutionDimensionNumbers CreateDefaultConvDimensionNumbers(
-      int num_spatial_dims = 2);
-
-  // Creates a ConvolutionDimensionNumbers with the given arguments. Returns an
-  // error if either the input or the weight dimension numbers have conflicts.
-  static StatusOr<ConvolutionDimensionNumbers> CreateConvDimensionNumbers(
-      int64 input_batch, int64 input_feature, int64 input_first_spatial,
-      int64 input_second_spatial, int64 output_batch, int64 output_feature,
-      int64 output_first_spatial, int64 output_second_spatial,
-      int64 kernel_output_feature, int64 kernel_input_feature,
-      int64 kernel_first_spatial, int64 kernel_second_spatial);
-
-  // Enqueues a convolution instruction onto the computation, which uses the
-  // default convolution dimension numbers.
-  ComputationDataHandle Conv(const ComputationDataHandle& lhs,
-                             const ComputationDataHandle& rhs,
-                             tensorflow::gtl::ArraySlice<int64> window_strides,
-                             Padding padding);
-
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration in the format returned by MakePadding().
-  ComputationDataHandle ConvWithGeneralPadding(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided dimension numbers configuration.
-  ComputationDataHandle ConvWithGeneralDimensions(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
-
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration as well as the dimension numbers.
-  ComputationDataHandle ConvGeneral(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
-
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration, dilation factors and dimension numbers.
-  ComputationDataHandle ConvGeneralDilated(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
-      tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-      const ConvolutionDimensionNumbers& dimension_numbers);
-
-  // Enqueues an FFT instruction onto the computation, of the given type and
-  // with the given FFT length.
-  ComputationDataHandle Fft(const ComputationDataHandle& operand,
-                            FftType fft_type,
-                            tensorflow::gtl::ArraySlice<int64> fft_length);
-
-  // Enqueues an infeed instruction onto the computation, which writes data of
-  // the given shape to the infeed buffer of the device.
-  ComputationDataHandle Infeed(const Shape& shape, const string& config = "");
-
-  // Enqueues an outfeed instruction onto the computation. This instruction
-  // generates outgoing data transfers for the given data.
-  //
-  // shape_with_layout communicates the laid out shape that we want to outfeed
-  // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
-  // will occur.
-  void Outfeed(const ComputationDataHandle& operand,
-               const Shape& shape_with_layout, const string& outfeed_config);
-
-  // Enqueues a call instruction onto the computation.
-  ComputationDataHandle Call(
-      const Computation& computation,
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands);
-
-  // Enqueues a custom call instruction onto the computation.
-  // During code generation, a call instruction is emitted which targets a
-  // symbol with the name |call_target_name|.  The |operands| are passed to the
-  // call instruction.  |shape| is the resultant shape.
-  ComputationDataHandle CustomCall(
-      const string& call_target_name,
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-      const Shape& shape);
-
-  // Enqueues a pseudo-op to represent host-side computation data-dependencies.
-  // During code generation, host send and receive operations will be generated
-  // to transfer |operands| to the host and a single result of |shape| back to
-  // the device.  Host send/recv operations are emitted using |channel_name|.
-  // Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
-  // instruction scheduling.
-  ComputationDataHandle HostCompute(
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-      const string& channel_name, int64 cost_estimate_ns, const Shape& shape);
-
-  // The following methods enqueue element-wise binary arithmetic operations
-  // onto the computation. The shapes of the operands have to match unless one
-  // of the operands is a scalar, or an explicit broadcast dimension is given
-  // (see g3doc for more details).
-
-  // Enqueues a complex compose instruction onto the computation.
-  ComputationDataHandle Complex(
-      const ComputationDataHandle& real, const ComputationDataHandle& imag,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a complex conjugate instruction onto the computation.
-  ComputationDataHandle Conj(const ComputationDataHandle& operand);
-
-  // Enqueues an add instruction onto the computation.
-  ComputationDataHandle Add(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a subtract instruction onto the computation.
-  ComputationDataHandle Sub(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a multiply instruction onto the computation.
-  ComputationDataHandle Mul(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a divide instruction onto the computation.
-  ComputationDataHandle Div(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a remainder instruction onto the computation.
-  ComputationDataHandle Rem(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a max instruction onto the computation.
-  ComputationDataHandle Max(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a min instruction onto the computation.
-  ComputationDataHandle Min(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Element-wise logical operators
-  ComputationDataHandle And(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  ComputationDataHandle Or(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  ComputationDataHandle Xor(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  ComputationDataHandle Not(const ComputationDataHandle& operand);
-
-  ComputationDataHandle ShiftLeft(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-  ComputationDataHandle ShiftRightArithmetic(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-  ComputationDataHandle ShiftRightLogical(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Reduces an array among the provided dimensions, given "computation" as a
-  // reduction operator.
-  ComputationDataHandle Reduce(
-      const ComputationDataHandle& operand,
-      const ComputationDataHandle& init_value, const Computation& computation,
-      tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
-
-  // Convenience wrapper around the above that reduces all the dimensions in the
-  // operand shape.
-  ComputationDataHandle ReduceAll(const ComputationDataHandle& operand,
-                                  const ComputationDataHandle& init_value,
-                                  const Computation& computation);
-
-  // Enqueues a windowed reduce instruction onto the computation.
-  ComputationDataHandle ReduceWindow(
-      const ComputationDataHandle& operand,
-      const ComputationDataHandle& init_value, const Computation& computation,
-      tensorflow::gtl::ArraySlice<int64> window_dimensions,
-      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding);
-
-  // As ReduceWindow(), but the padding is given in the format
-  // returned by MakePadding().
-  ComputationDataHandle ReduceWindowWithGeneralPadding(
-      const ComputationDataHandle& operand,
-      const ComputationDataHandle& init_value, const Computation& computation,
-      tensorflow::gtl::ArraySlice<int64> window_dimensions,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-
-  // Returns the sum of the operand value across all replicas. All replicas
-  // supply one input to the sum and all replicas receive the resulting sum.
-  ComputationDataHandle CrossReplicaSum(const ComputationDataHandle& operand);
-
-  // Enqueues an operation that scatters the `source` array to the selected
-  // indices of each window.
-  ComputationDataHandle SelectAndScatter(
-      const ComputationDataHandle& operand, const Computation& select,
-      tensorflow::gtl::ArraySlice<int64> window_dimensions,
-      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-      const ComputationDataHandle& source,
-      const ComputationDataHandle& init_value, const Computation& scatter);
-
-  // As SelectAndScatter(), but the padding is given in the format
-  // returned by MakePadding().
-  ComputationDataHandle SelectAndScatterWithGeneralPadding(
-      const ComputationDataHandle& operand, const Computation& select,
-      tensorflow::gtl::ArraySlice<int64> window_dimensions,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      const ComputationDataHandle& source,
-      const ComputationDataHandle& init_value, const Computation& scatter);
-
-  // Enqueues an abs instruction onto the computation.
-  ComputationDataHandle Abs(const ComputationDataHandle& operand);
-
-  // Enqueues a atan2 instruction onto the computation.
-  ComputationDataHandle Atan2(
-      const ComputationDataHandle& y, const ComputationDataHandle& x,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues an exp instruction onto the computation.
-  ComputationDataHandle Exp(const ComputationDataHandle& operand);
-
-  // Enqueues a floor instruction onto the computation.
-  ComputationDataHandle Floor(const ComputationDataHandle& operand);
-
-  // Enqueues a ceil instruction onto the computation.
-  ComputationDataHandle Ceil(const ComputationDataHandle& operand);
-
-  // Enqueues a round instruction onto the computation, rounding to nearest even
-  // with half-way cases rounding away from zero.
-  ComputationDataHandle Round(const ComputationDataHandle& operand);
-
-  // Enqueues an log instruction (natural logarithm) onto the computation.
-  ComputationDataHandle Log(const ComputationDataHandle& operand);
-
-  // Enqueues a sign instruction onto the computation.
-  ComputationDataHandle Sign(const ComputationDataHandle& operand);
-
-  // Enqueues a cosine instruction onto the computation.
-  ComputationDataHandle Cos(const ComputationDataHandle& operand);
-
-  // Enqueues a sine instruction onto the computation.
-  ComputationDataHandle Sin(const ComputationDataHandle& operand);
-
-  // Enqueues a tanh instruction onto the computation.
-  ComputationDataHandle Tanh(const ComputationDataHandle& operand);
-
-  // Enqueues a real-part instruction onto the computation.
-  ComputationDataHandle Real(const ComputationDataHandle& operand);
-
-  // Enqueues an imaginary-part instruction onto the computation.
-  ComputationDataHandle Imag(const ComputationDataHandle& operand);
-
-  // Enqueues a float32 sqrt instruction onto the computation.
-  // (float32 is specified as there is an implicit float32 0.5f constant
-  // exponent).
-  ComputationDataHandle SqrtF32(const ComputationDataHandle& operand);
-
-  // Enqueues a float32 square instruction onto the computation.
-  // (float32 is specified as there is an implicit float32 2.0f constant
-  // exponent).
-  ComputationDataHandle SquareF32(const ComputationDataHandle& operand);
-
-  // Enqueues a lhs^rhs computation onto the computation.
-  ComputationDataHandle Pow(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues an operator that tests if the operand's values are finite, i.e.,
-  // not Inf or NaN. Defined only for floating-point types. Returns an array of
-  // booleans with the same shape where entries are true iff the corresponding
-  // entry was NaN.
-  ComputationDataHandle IsFinite(const ComputationDataHandle& operand);
-
-  // Enqueues a convert instruction onto the computation that changes the
-  // element type of the operand array to primitive_type.
-  ComputationDataHandle ConvertElementType(const ComputationDataHandle& operand,
-                                           PrimitiveType new_element_type);
-
-  // Enqueues a no-op instruction onto the computation that changes
-  // the element type of the operand array to primitive_type. The
-  // bit-widths of the source and destination element types must be
-  // identical.
-  ComputationDataHandle BitcastConvertType(const ComputationDataHandle& operand,
-                                           PrimitiveType new_element_type);
-
-  // Enqueues a float32 reciprocal instruction onto the computation.
-  // (float32 is specified as there is an implicit float32 -1.0f constant
-  // exponent).
-  //
-  // TODO(b/34468990) axe F32 suffix, can be determined by reflecting on the
-  // shape of the operand.
-  ComputationDataHandle ReciprocalF32(const ComputationDataHandle& operand);
-
-  // Enqueues a negate instruction onto the computation.
-  ComputationDataHandle Neg(const ComputationDataHandle& operand);
-
-  // Enqueues a count-leading-zeros instruction onto the computation.
-  ComputationDataHandle Clz(const ComputationDataHandle& operand);
-
-  // Enqueues a transpose instruction onto the computation.
-  ComputationDataHandle Transpose(
-      const ComputationDataHandle& operand,
-      tensorflow::gtl::ArraySlice<int64> permutation);
-
-  // Enqueues a reverse instruction onto the computation. The order of the
-  // elements in the given dimensions is reversed (i.e., the element at index i
-  // is moved to index dimension_size - 1 - i).
-  ComputationDataHandle Rev(const ComputationDataHandle& operand,
-                            tensorflow::gtl::ArraySlice<int64> dimensions);
-
-  // Enqueues a sort (as increasing order) instruction onto the computation.
-  ComputationDataHandle Sort(const ComputationDataHandle& operand);
-
-  // Enqueues a clamp instruction onto the computation.
-  ComputationDataHandle Clamp(const ComputationDataHandle& min,
-                              const ComputationDataHandle& operand,
-                              const ComputationDataHandle& max);
-
-  // Enqueues a map instruction onto the computation.
-  ComputationDataHandle Map(
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-      const Computation& computation,
-      tensorflow::gtl::ArraySlice<int64> dimensions,
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> static_operands = {});
-
-  // Enqueues a N(mu, sigma) random number generation instruction onto the
-  // computation.
-  ComputationDataHandle RngNormal(const ComputationDataHandle& mu,
-                                  const ComputationDataHandle& sigma,
-                                  const Shape& shape);
-
-  // Enqueues a U(a, b) random number generation instruction onto the
-  // computation. Returns values in the semi-open interval [a, b).
-  ComputationDataHandle RngUniform(const ComputationDataHandle& a,
-                                   const ComputationDataHandle& b,
-                                   const Shape& shape);
-
-  // Enqueues a while node onto the computation.
-  ComputationDataHandle While(const Computation& condition,
-                              const Computation& body,
-                              const ComputationDataHandle& init);
-
-  // Enqueues a conditional node onto the computation.
-  ComputationDataHandle Conditional(const ComputationDataHandle& predicate,
-                                    const ComputationDataHandle& true_operand,
-                                    const Computation& true_computation,
-                                    const ComputationDataHandle& false_operand,
-                                    const Computation& false_computation);
-
-  // Enqueues a ReducePrecision node onto the computation.
-  ComputationDataHandle ReducePrecision(const ComputationDataHandle& operand,
-                                        const int exponent_bits,
-                                        const int mantissa_bits);
-
-  // Enqueues a Gather node onto the computation.
-  ComputationDataHandle Gather(
-      const ComputationDataHandle& input,
-      const ComputationDataHandle& gather_indices,
-      const GatherDimensionNumbers& dimension_numbers,
-      tensorflow::gtl::ArraySlice<int64> window_bounds);
-
-  // Enqueues a Send node onto the computation, to send the given operand to
-  // a Recv instruction that shares the same channel handle.
-  void Send(const ComputationDataHandle& operand, const ChannelHandle& handle);
-
-  // Enqueues a Recv node onto the computation. The data comes from a Send
-  // instruction that shares the same channel handle and its shape must
-  // be the same as the given shape.
-  ComputationDataHandle Recv(const Shape& shape, const ChannelHandle& handle);
-
-  // Returns true if 'operand' is a compile-time constant. A compile-time
-  // constant does not depend on parameters with index greater than or equal to
-  // `num_parameters`, or on stateful operators such as `RngNormal` or `Infeed`.
-  // Unlike `ComputeConstant`, `IsConstant` tests whether a computation is a
-  // compile-time constant without evaluating the computation.
-  StatusOr<bool> IsConstant(const ComputationDataHandle& operand,
-                            int64 num_parameters = 0);
-
-  // Normalizes operand across spatial and batch dimensions for each feature.
-  //
-  // Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
-  // is the normalized result and batch_mean and batch_var are the mean and
-  // variance, respectively, across batch for the operand.
-  ComputationDataHandle BatchNormTraining(const ComputationDataHandle& operand,
-                                          const ComputationDataHandle& scale,
-                                          const ComputationDataHandle& offset,
-                                          float epsilon, int64 feature_index);
-
-  // Normalizes operand across spatial and batch dimensions for each feature.
-  //
-  // `BatchNormInference` is equivalent to calling `BatchNormTraining` without
-  // computing `mean` and `variance` for each batch inside the operation. It
-  // uses the input `mean` and `variance` instead as estimated values. The
-  // purpose of this op is to reduce latency in inference, hence the name
-  // `BatchNormInference`.
-  //
-  // The output has the same shape as `operand`, and contains the normalized
-  // values for each batch.
-  ComputationDataHandle BatchNormInference(
-      const ComputationDataHandle& operand, const ComputationDataHandle& scale,
-      const ComputationDataHandle& offset, const ComputationDataHandle& mean,
-      const ComputationDataHandle& variance, float epsilon,
-      int64 feature_index);
-
-  // Calculates the gradients of a batch norm op.
-  //
-  // The inputs `batch_mean` and `batch_var` represent the mean and variance
-  // across the batch.
-  //
-  // Returns a tuple of three elements:
-  //   - grad_operand: Gradient with respect to input `operand`
-  //   - grad_offset: Gradient with respect to input `offset`
-  //   - grad_scale: Gradient with respect to input `scale`
-  ComputationDataHandle BatchNormGrad(const ComputationDataHandle& operand,
-                                      const ComputationDataHandle& scale,
-                                      const ComputationDataHandle& batch_mean,
-                                      const ComputationDataHandle& batch_var,
-                                      const ComputationDataHandle& grad_output,
-                                      float epsilon, int64 feature_index);
-
-  // Computes the value of a constant indicated by a
-  // ComputationDataHandle using a non-optimized interpreter on the host.
-  //
-  // The operand must be from the computation currently being built -
-  // i.e., returned from this builder with no intervening call to
-  // Build(). This happens to currently work regardless of that, but
-  // that may stop working at any time.
-  //
-  // The operand must represent a constant value, which in this case
-  // means that it must not statically depend on any parameter of the
-  // computation that is being built other then the ones specified on the
-  // parameter list. The parameters in the list will be indexed by their
-  // parameter id property so the number of parameters specified should be at
-  // least as many as the largest used parameter index.
-  //
-  // `IsConstant` can be used to test whether a computation is a compile-time
-  // constant without evaluation it. `ComputeConstant` only succeeds for
-  // computations where `IsConstant` returns true.
-  //
-  // This functionality can be useful when translating a computation
-  // into XLA where something that looked dynamic is required by
-  // XLA to be specified as a constant. E.g. the source
-  // computation (outside of XLA) may include a dynamic
-  // computation of the shape of something and ComputeConstant lets
-  // you determine what the value of that computation is in the case
-  // where the value can be determined at compile time.
-  //
-  // If output_layout is non-null, then the output of the computation
-  // will be stored using that layout.
-  StatusOr<std::unique_ptr<Literal>> ComputeConstant(
-      const ComputationDataHandle& operand,
-      const Layout* output_layout = nullptr,
-      tensorflow::gtl::ArraySlice<Literal> parameters = {});
-
-  // Returns a new ComputationBuilder whose resultant Computation is used only
-  // by this ComputationBuilder. The sub-ComputationBuilder has the same
-  // die_immediately_on_error behavior as the parent.
-  std::unique_ptr<ComputationBuilder> CreateSubBuilder(
-      const string& computation_name);
-
-  // Modifies the computation being built so that executions of it
-  // will return the value associated with operand, rather than the
-  // last expression enqueued on the ComputationBuilder. Any subsequent
-  // operations added to the ComputationBuilder will not have any effect unless
-  // SetReturnValue is called again.
-  Status SetReturnValue(const ComputationDataHandle& operand);
-
-  // Builds the computation with the requested operations, or returns a non-ok
-  // status.
-  StatusOr<Computation> Build();
-
-  // Builds the computation with the requested operations, or notes an error in
-  // the parent ComputationBuilder and returns an empty computation if building
-  // failed. This function is intended to be used where the returned
-  // Computation is only used by the parent ComputationBuilder and hence further
-  // operation on the returned Computation will simply be error'ed out if an
-  // error occurred while building this computation. If the built computation is
-  // to be used by a ComputationBuilder other than the parent ComputationBuilder
-  // then Build() should be used instead.
-  Computation BuildAndNoteError();
-
-  // Returns the first error that was encountered while building the
-  // computation. When an error is encountered, by default we return a vacuous
-  // ComputationDataHandle and inform the user of the error that occurred while
-  // building the computation when they make a final call to Build().
-  //
-  // See also set_die_immediately_on_error().
-  Status first_error() const { return first_error_; }
-
- private:
-  // Limited checking of convolution parameters. Returns false on
-  // error.
-  bool VerifyConvolution(const Shape& lhs_shape, const Shape& rhs_shape,
-                         const ConvolutionDimensionNumbers& dimension_numbers);
-
-  // The parent ComputationBuilder of a sub-ComputationBuilder. The
-  // parent_builder_ will be the nullptr if not a sub-ComputationBuilder.
-  ComputationBuilder* parent_builder_{nullptr};
-
-  // Helper function for creating a Window proto from user-supplied
-  // data. Returns true if the user-supplied data was valid.
-  bool MakeWindow(tensorflow::gtl::ArraySlice<int64> window_dimensions,
-                  tensorflow::gtl::ArraySlice<int64> window_strides,
-                  tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-                  tensorflow::gtl::ArraySlice<int64> lhs_dilation,
-                  tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-                  Window* window);
-
-  // Internal helper method that does the building for an arbitrary unary op.
-  ComputationDataHandle UnaryOp(UnaryOperation unop,
-                                const ComputationDataHandle& operand);
-
-  // Internal helper method that does the building for an arbitrary binary op.
-  // broadcast_dimensions specifies which dimensions to use for broadcasting
-  // when the operation is between tensors of different ranks.
-  ComputationDataHandle BinaryOp(
-      BinaryOperation binop, const ComputationDataHandle& lhs,
-      const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-
-  // Internal helper method that does the building for an arbitrary ternary op.
-  ComputationDataHandle TernaryOp(TernaryOperation triop,
-                                  const ComputationDataHandle& lhs,
-                                  const ComputationDataHandle& rhs,
-                                  const ComputationDataHandle& ehs);
-
-  // Internal helper method that does the building for a random number generator
-  // of a given distribution with an explicitly specified shape.
-  ComputationDataHandle RngOp(
-      RandomDistribution distribution,
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> parameters,
-      const Shape& shape);
-
-  // Populates computation_ with a valid object or returns a failing status.
-  // This is used before any given operation is enqueued.
-  Status PrepareComputation();
-
-  // Notes that the error occurred by:
-  // * storing it internally and capturing a backtrace if it's the first error
-  //   (this deferred value will be produced on the call to Build())
-  // * dying if die_immediately_on_error_ is true
-  void NoteError(const Status& error);
-
-  // Helper function that runs the given op_request, filling in op_response.
-  // Before the op is run, PrepareComputation is called, and common fields in
-  // the op_request are filled in.
-  Status RunOp(OpRequest* op_request, OpResponse* op_response);
-
-  // Helper function that calls RunOp and calls NoteError on failures.
-  void RunOpAndNoteError(OpRequest* op_request);
-
-  // Helper function that calls RunOp and either returns the output computation
-  // data handle (on success) or a vacuous computation data handle (on failure).
-  ComputationDataHandle RunOpAndParseResponse(OpRequest* op_request);
-
-  // Helper function that implements GetShape without noting errors. This makes
-  // it easier to ensure the real GetShape will note errors on every error path.
-  StatusOr<std::unique_ptr<Shape>> GetShapeWithoutNoteError(
-      const ComputationDataHandle& operand);
-
-  string name_;  // Name to use for the built computation.
-
-  // The first error encountered while building the computation.
-  // This is OK until the first error is encountered.
-  Status first_error_;
-
-  // The saved stack trace from the point at which the first error occurred.
-  tensorflow::SavedStackTrace first_error_backtrace_;
-
-  // The computation that operations are enqueued onto.
-  Computation computation_;
-
-  // The client that the computation is created in. Not owned.
-  Client* client_;
-
-  // Mode bit that indicates whether to die when a first error is encountered.
-  bool die_immediately_on_error_ = false;
-
-  // The metadata to attach to each op. This is structured as a "modal"-like
-  // operation, in order to simplify client code (and not sprinkle this metadata
-  // throughout the TensorFlow op kernel implementations).
-  OpMetadata metadata_;
-
-  // Sharding for this operator. This is structured as a "model"-like operation,
-  // in order to simplify client code, similar to metadata_.
-  tensorflow::gtl::optional<OpSharding> sharding_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ComputationBuilder);
-};
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantR0(NativeT value) {
-  return ConstantLiteral(*Literal::CreateR0<NativeT>(value));
-}
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantR1(
-    tensorflow::gtl::ArraySlice<NativeT> values) {
-  return ConstantLiteral(*Literal::CreateR1<NativeT>(values));
-}
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantR1(int64 length,
-                                                     NativeT value) {
-  Literal literal(ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
-  literal.PopulateWithValue(value);
-  return ConstantLiteral(literal);
-}
-
-inline ComputationDataHandle ComputationBuilder::ConstantR1(
-    const tensorflow::core::Bitmap& values) {
-  return ConstantLiteral(*Literal::CreateR1(values));
-}
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantR2(
-    std::initializer_list<std::initializer_list<NativeT>> values) {
-  return ConstantLiteral(*Literal::CreateR2<NativeT>(values));
-}
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantFromArrayWithLayout(
-    const Array<NativeT>& values, const Layout& layout) {
-  return ConstantLiteral(
-      *Literal::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantFromArray(
-    const Array<NativeT>& values) {
-  return ConstantLiteral(*Literal::CreateFromArray<NativeT>(values));
-}
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantR2FromArray2DWithLayout(
-    const Array2D<NativeT>& values, const Layout& layout) {
-  return ConstantLiteral(
-      *Literal::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantR2FromArray2D(
-    const Array2D<NativeT>& values) {
-  return ConstantLiteral(*Literal::CreateR2FromArray2D<NativeT>(values));
-}
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantR3FromArray3DWithLayout(
-    const Array3D<NativeT>& values, const Layout& layout) {
-  return ConstantLiteral(
-      *Literal::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantR3FromArray3D(
-    const Array3D<NativeT>& values) {
-  return ConstantFromArray(values);
-}
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantR4FromArray4DWithLayout(
-    const Array4D<NativeT>& values, const Layout& layout) {
-  return ConstantFromArrayWithLayout(values, layout);
-}
-
-template <typename NativeT>
-ComputationDataHandle ComputationBuilder::ConstantR4FromArray4D(
-    const Array4D<NativeT>& values) {
-  return ConstantFromArray(values);
-}
-
-// RAII-style object: sets the current sharding assignment in builder on
-// construction, and sets back to the previous assignment on destruction.
-class ScopedShardingAssignment {
- public:
-  ScopedShardingAssignment(xla::ComputationBuilder* builder,
-                           tensorflow::gtl::optional<OpSharding> sharding)
-      : builder_(builder), prev_sharding_(builder->sharding()) {
-    SetSharding(sharding);
-  }
-
-  ~ScopedShardingAssignment() { SetSharding(prev_sharding_); }
-
- private:
-  void SetSharding(const tensorflow::gtl::optional<OpSharding>& sharding) {
-    if (sharding.has_value()) {
-      builder_->SetSharding(sharding.value());
-    } else {
-      builder_->ClearSharding();
-    }
-  }
-
-  xla::ComputationBuilder* const builder_;
-  tensorflow::gtl::optional<OpSharding> prev_sharding_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ScopedShardingAssignment);
-};
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_BUILDER_H_
diff --git a/tensorflow/compiler/xla/client/global_data.cc b/tensorflow/compiler/xla/client/global_data.cc
index 40f59eaa68ebeb47edbd2afbeabad0cd2623ebc6..2986d4060013703873b2cffb6aacbb012606d16f 100644
--- a/tensorflow/compiler/xla/client/global_data.cc
+++ b/tensorflow/compiler/xla/client/global_data.cc
@@ -31,7 +31,7 @@ GlobalData::~GlobalData() {
   *request.mutable_data() = handle_;
   UnregisterResponse response;
   VLOG(1) << "requesting to unregister " << handle_.ShortDebugString();
-  tensorflow::Status s = parent_->Unregister(&request, &response);
+  Status s = parent_->Unregister(&request, &response);
   VLOG(1) << "done with request";
 
   if (!s.ok()) {
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index 9cd87f74735ff50df8a3382723c7d045ff6c9e52..3380af9f303b1dc2cec09aa37410ec40cdeaa526 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -92,21 +92,6 @@ std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
   return MakeFakeDataViaDeviceOrDie(shape, client);
 }
 
-std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
-    const Computation& computation, Client* client) {
-  auto program_shape =
-      client->GetComputationShape(computation).ConsumeValueOrDie();
-
-  // For every (unbound) parameter that the computation wants, we manufacture
-  // some arbitrary data so that we can invoke the computation.
-  std::vector<std::unique_ptr<GlobalData>> fake_arguments;
-  for (const Shape& parameter : program_shape->parameters()) {
-    fake_arguments.push_back(MakeFakeDataOrDie(parameter, client));
-  }
-
-  return fake_arguments;
-}
-
 std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
     const XlaComputation& computation, Client* client) {
   CHECK(computation.proto().has_program_shape())
diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h
index 9e06141b1f13d24cd033b72e31ee3a0442fe6a37..dc613099e2b42a60d0c11a654ab5cd41f8bd4f6f 100644
--- a/tensorflow/compiler/xla/client/lib/testing.h
+++ b/tensorflow/compiler/xla/client/lib/testing.h
@@ -32,12 +32,6 @@ namespace xla {
 std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
                                               Client* client);
 
-// Returns vector of GlobalData handles of fake data (created using
-// MakeFakeDataOrDie) that are correctly shaped arguments for the given
-// computation.
-std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
-    const Computation& computation, Client* client);
-
 // Returns vector of GlobalData handles of fake data (created using
 // MakeFakeDataOrDie) that are correctly shaped arguments for the given
 // xla computation.
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 1acc6f86860e526b5ff737c45041a863f21da145..a7c55c6b2b7fe2b5541ce71bf3eaa24114522fc5 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -48,7 +48,7 @@ LocalExecutable::LocalExecutable(std::unique_ptr<Executable> executable,
       << "Must have a valid device ordinal that the executable was built for.";
 }
 
-tensorflow::Status LocalExecutable::ValidateExecutionOptions(
+Status LocalExecutable::ValidateExecutionOptions(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableRunOptions& run_options, const Backend& backend) {
   const ComputationLayout& host_computation_layout =
@@ -207,7 +207,7 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::ExecuteAndDump(
   return std::move(result);
 }
 
-tensorflow::Status LocalExecutable::RecordArguments(
+Status LocalExecutable::RecordArguments(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     SessionModule* session_module) {
   session_module->clear_arguments();
@@ -219,8 +219,8 @@ tensorflow::Status LocalExecutable::RecordArguments(
   return Status::OK();
 }
 
-tensorflow::Status LocalExecutable::RecordResult(
-    const ShapedBuffer* result, SessionModule* session_module) {
+Status LocalExecutable::RecordResult(const ShapedBuffer* result,
+                                     SessionModule* session_module) {
   session_module->clear_result();
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
                       LiteralFromShapedBuffer(*result));
@@ -261,25 +261,6 @@ Backend* LocalClient::mutable_backend() {
   return local_service_->mutable_backend();
 }
 
-StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
-    const Computation& computation,
-    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-    const ExecutableBuildOptions& options) {
-  ExecutableBuildOptions updated_options = options;
-  if (options.device_ordinal() == -1) {
-    updated_options.set_device_ordinal(default_device_ordinal());
-    VLOG(3) << "Set device ordinal to default value of: "
-            << updated_options.device_ordinal();
-  }
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Executable> executable,
-      local_service_->CompileExecutable(computation.handle(), argument_layouts,
-                                        updated_options));
-  return WrapUnique(new LocalExecutable(std::move(executable),
-                                        local_service_->mutable_backend(),
-                                        updated_options));
-}
-
 StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
     const XlaComputation& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index d8fd7a5623d1fecdcff6851aa3e3538822fb50da..d63d4ec7f3744d507cc854213e430e25e861e559 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
@@ -59,7 +58,7 @@ class LocalExecutable {
 
   // Validates that the given arguments and options satisfy various constraints
   // of the computation.
-  tensorflow::Status ValidateExecutionOptions(
+  Status ValidateExecutionOptions(
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutableRunOptions& run_options, const Backend& backend);
 
@@ -71,13 +70,13 @@ class LocalExecutable {
 
   // Records the arguments used to invoke the computation in a SessionModule
   // proto.
-  tensorflow::Status RecordArguments(
+  Status RecordArguments(
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       SessionModule* session_module);
 
   // Records the result of the computation in a SessionModule proto.
-  tensorflow::Status RecordResult(const ShapedBuffer* result,
-                                  SessionModule* session_module);
+  Status RecordResult(const ShapedBuffer* result,
+                      SessionModule* session_module);
 
   // Returns a literal containing the contents of the given ShapedBuffer.
   StatusOr<std::unique_ptr<Literal>> LiteralFromShapedBuffer(
@@ -108,17 +107,8 @@ class LocalClient : public Client {
   LocalClient(const LocalClient&) = delete;
   void operator=(const LocalClient&) = delete;
 
-  // Build and return a LocalExecutable object. The executable is compiled using
-  // the given argument layouts and options.
-  StatusOr<std::unique_ptr<LocalExecutable>> Compile(
-      const Computation& computation,
-      const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-      const ExecutableBuildOptions& options);
-
   // Build and return a LocalExecutable object. The executable is compiled using
   // the given XlaComputation, argument layouts and options.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<std::unique_ptr<LocalExecutable>> Compile(
       const XlaComputation& computation,
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 1899983e442116d3ebf8a3e79b0515653cd624cb..ae506317c2e4862d77cb4f0628e919871ad1aeb2 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -57,16 +57,6 @@ bool CanBeRoot(HloOpcode opcode) {
   }
 }
 
-StatusOr<std::vector<Shape>> GetOperandShapes(
-    tensorflow::gtl::ArraySlice<XlaOp> operands) {
-  std::vector<Shape> operand_shapes;
-  for (const XlaOp& operand : operands) {
-    TF_ASSIGN_OR_RETURN(const Shape& shape, operand.GetShape());
-    operand_shapes.push_back(shape);
-  }
-  return operand_shapes;
-}
-
 }  // namespace
 
 StatusOr<Shape> XlaBuilder::GetShape(const XlaOp& op) const {
@@ -76,12 +66,14 @@ StatusOr<Shape> XlaBuilder::GetShape(const XlaOp& op) const {
   return instr->shape();
 }
 
-StatusOr<Shape> XlaOp::GetShape() const {
-  if (builder_ == nullptr) {
-    return InvalidArgument(
-        "cannot GetShape for an invalid XlaOp with handle %lld", handle());
+StatusOr<std::vector<Shape>> XlaBuilder::GetOperandShapes(
+    tensorflow::gtl::ArraySlice<XlaOp> operands) const {
+  std::vector<Shape> operand_shapes;
+  for (const XlaOp& operand : operands) {
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
+    operand_shapes.push_back(shape);
   }
-  return builder_->GetShape(*this);
+  return operand_shapes;
 }
 
 XlaBuilder::XlaBuilder(const string& computation_name)
@@ -286,7 +278,7 @@ StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
                                                  const XlaOp& operand) {
   TF_RETURN_IF_ERROR(first_error_);
 
-  TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+  TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
 
   CHECK(ShapeUtil::IsScalar(operand_shape) ||
         ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape));
@@ -325,7 +317,7 @@ StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
 XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
                         ShapeInference::InferUnaryOpShape(unop, operand_shape));
     return AddInstruction(std::move(instr), unop, {operand});
@@ -337,8 +329,8 @@ XlaOp XlaBuilder::BinaryOp(
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, lhs.GetShape());
-    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, rhs.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
     TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
                         ShapeInference::InferBinaryOpShape(
                             binop, lhs_shape, rhs_shape, broadcast_dimensions));
@@ -374,12 +366,12 @@ XlaOp XlaBuilder::BinaryOp(
       updated_rhs = !should_broadcast_lhs ? broadcasted_operand : rhs;
     }
 
-    TF_ASSIGN_OR_RETURN(Shape updated_lhs_shape, updated_lhs.GetShape());
+    TF_ASSIGN_OR_RETURN(Shape updated_lhs_shape, GetShape(updated_lhs));
     if (!ShapeUtil::SameDimensions(instr.shape(), updated_lhs_shape)) {
       TF_ASSIGN_OR_RETURN(updated_lhs,
                           AddBroadcastSequence(instr.shape(), updated_lhs));
     }
-    TF_ASSIGN_OR_RETURN(Shape updated_rhs_shape, updated_rhs.GetShape());
+    TF_ASSIGN_OR_RETURN(Shape updated_rhs_shape, GetShape(updated_rhs));
     if (!ShapeUtil::SameDimensions(instr.shape(), updated_rhs_shape)) {
       TF_ASSIGN_OR_RETURN(updated_rhs,
                           AddBroadcastSequence(instr.shape(), updated_rhs));
@@ -393,9 +385,9 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
                             const XlaOp& ehs) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, lhs.GetShape());
-    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, rhs.GetShape());
-    TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, ehs.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, GetShape(ehs));
     TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
                         ShapeInference::InferTernaryOpShape(
                             triop, lhs_shape, rhs_shape, ehs_shape));
@@ -437,7 +429,7 @@ XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs,
   return BinaryOp(HloOpcode::kMultiply, lhs, rhs, broadcast_dimensions);
 }
 
-XlaOp XlaBuilder::ConstantLiteral(const Literal& literal) {
+XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = literal.shape();
@@ -485,7 +477,7 @@ XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape,
 XlaOp XlaBuilder::Broadcast(
     const XlaOp& operand, tensorflow::gtl::ArraySlice<int64> broadcast_sizes) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     TF_ASSIGN_OR_RETURN(
         const Shape& shape,
         ShapeInference::InferBroadcastShape(operand_shape, broadcast_sizes));
@@ -633,7 +625,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
                           tensorflow::gtl::ArraySlice<int64> dimensions,
                           tensorflow::gtl::ArraySlice<int64> new_sizes) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     TF_ASSIGN_OR_RETURN(const Shape& shape,
                         ShapeInference::InferReshapeShape(
                             operand_shape, dimensions, new_sizes));
@@ -647,7 +639,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
 XlaOp XlaBuilder::Reshape(const XlaOp& operand,
                           tensorflow::gtl::ArraySlice<int64> new_sizes) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(auto shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(auto shape, GetShape(operand));
     std::vector<int64> dimensions(shape.dimensions_size());
     std::iota(dimensions.begin(), dimensions.end(), 0);
     return Reshape(operand, dimensions, new_sizes);
@@ -1002,7 +994,7 @@ XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type,
                       const tensorflow::gtl::ArraySlice<int64> fft_length) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     TF_ASSIGN_OR_RETURN(
         *instr.mutable_shape(),
         ShapeInference::InferFftShape(operand_shape, fft_type, fft_length));
@@ -1173,6 +1165,10 @@ XlaOp XlaBuilder::Exp(const XlaOp& operand) {
   return UnaryOp(HloOpcode::kExp, operand);
 }
 
+XlaOp XlaBuilder::Expm1(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kExpm1, operand);
+}
+
 XlaOp XlaBuilder::Floor(const XlaOp& operand) {
   return UnaryOp(HloOpcode::kFloor, operand);
 }
@@ -1189,6 +1185,10 @@ XlaOp XlaBuilder::Log(const XlaOp& operand) {
   return UnaryOp(HloOpcode::kLog, operand);
 }
 
+XlaOp XlaBuilder::Log1p(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kLog1p, operand);
+}
+
 XlaOp XlaBuilder::Sign(const XlaOp& operand) {
   return UnaryOp(HloOpcode::kSign, operand);
 }
@@ -1225,7 +1225,7 @@ XlaOp XlaBuilder::Transpose(const XlaOp& operand,
                             tensorflow::gtl::ArraySlice<int64> permutation) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     TF_ASSIGN_OR_RETURN(
         *instr.mutable_shape(),
         ShapeInference::InferTransposeShape(operand_shape, permutation));
@@ -1948,11 +1948,18 @@ StatusOr<const HloInstructionProto*> XlaBuilder::LookUpInstruction(
     const XlaOp& op) const {
   TF_RETURN_IF_ERROR(first_error_);
 
+  if (op.builder_ == nullptr) {
+    return InvalidArgument(
+        "invalid XlaOp with handle %lld; the builder of this op is freed",
+        op.handle());
+  }
   if (op.builder_ != this) {
-    return InvalidArgument("invalid XlaOp with handle %lld", op.handle());
+    return InvalidArgument(
+        "XlaOp with handle %lld is built by builder '%s', but is trying to use "
+        "it in builder '%s'",
+        op.handle(), op.builder_->name().c_str(), this->name().c_str());
   }
 
-  TF_RET_CHECK(op.builder_ == this);
   if (op.handle() >= instructions_.size() || op.handle() < 0) {
     return InvalidArgument("no XlaOp value %lld", op.handle());
   }
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 4955f1515d66af00ddf72e4c7621292a590e662c..2b3013a91c488782098bd81994e899eae5a1f506 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -13,10 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// TODO(b/74197823): Replace computation_builder.h with this file.
-//
-// This is NOT YET ready to use.
-
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
 
@@ -48,15 +44,11 @@ class XlaBuilder;
 // This represents an instruction that has been enqueued using the XlaBuilder.
 // This is used to pass to subsequent computations that depends upon the
 // instruction as an operand.
-//
-// TODO(b/74197823): Replace xla::ComputationDataHandle with this one.
 class XlaOp {
  public:
   XlaOp() : handle_(0), builder_(nullptr) {}
   ~XlaOp() {}
 
-  StatusOr<Shape> GetShape() const;
-
   const XlaBuilder* builder() const { return builder_; }
 
   bool operator==(const XlaOp& rhs) const {
@@ -87,8 +79,6 @@ class XlaOp {
 // A convenient interface for building up computations.
 //
 // Thread-compatible.
-//
-// TODO(b/74197823): Replace xla::ComputationBuilder with this one.
 class XlaBuilder {
  public:
   // computation_name: name to use for the built computation.
@@ -139,7 +129,7 @@ class XlaBuilder {
 
   // Enqueues a constant with the value of the given literal onto the
   // computation.
-  XlaOp ConstantLiteral(const Literal& literal);
+  XlaOp ConstantLiteral(const LiteralSlice& literal);
 
   // Enqueues a constant onto the computation. Methods are templated on the
   // native host type (NativeT) which corresponds to a specific XLA
@@ -571,6 +561,9 @@ class XlaBuilder {
   // Enqueues an exp instruction onto the computation.
   XlaOp Exp(const XlaOp& operand);
 
+  // Enqueues an expm1 instruction onto the computation.
+  XlaOp Expm1(const XlaOp& operand);
+
   // Enqueues a floor instruction onto the computation.
   XlaOp Floor(const XlaOp& operand);
 
@@ -584,6 +577,9 @@ class XlaBuilder {
   // Enqueues an log instruction (natural logarithm) onto the computation.
   XlaOp Log(const XlaOp& operand);
 
+  // Enqueues an log1p instruction (log(x+1)) onto the computation.
+  XlaOp Log1p(const XlaOp& operand);
+
   // Enqueues a sign instruction onto the computation.
   XlaOp Sign(const XlaOp& operand);
 
@@ -847,6 +843,10 @@ class XlaBuilder {
   // computation and fills the root_id in the pointer.
   StatusOr<ProgramShape> GetProgramShape(int64* root_id) const;
 
+  // Returns shapes for the operands.
+  StatusOr<std::vector<Shape>> GetOperandShapes(
+      tensorflow::gtl::ArraySlice<XlaOp> operands) const;
+
   // A visitor which checks whether an operation is a compile-time constant,
   // meaning that it doesn't depend on any parameters, or on any stateful
   // operation such as `RngNormal` or `Infeed`. The visitor walks the
@@ -981,8 +981,6 @@ XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
 
 // RAII-style object: sets the current sharding assignment in builder on
 // construction, and sets back to the previous assignment on destruction.
-//
-// TODO(b/74197823): This is a part of a NOT YET ready refactor.
 class XlaScopedShardingAssignment {
  public:
   XlaScopedShardingAssignment(xla::XlaBuilder* builder,
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
index ce984564d016ce65fa6c932f3cda290cc0d75a4a..2df3ea3af0d4fcfb9bc803feebd96f09042ab1f3 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
@@ -76,7 +76,7 @@ TEST_F(XlaBuilderTest, ParamPlusParamHasBroadcast) {
   auto y = b.Parameter(1, y_shape, "y");
   auto add = b.Add(x, y, /*broadcast_dimensions=*/{0, 1});
 
-  TF_ASSERT_OK_AND_ASSIGN(auto add_shape, add.GetShape());
+  TF_ASSERT_OK_AND_ASSIGN(auto add_shape, b.GetShape(add));
   EXPECT_TRUE(ShapeUtil::Equal(add_shape, x_shape));
 
   TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
@@ -188,8 +188,10 @@ TEST_F(XlaBuilderTest, OperandFromWrongBuilder) {
   builder.Add(p0, p0);
   auto statusor = builder.Build();
   ASSERT_FALSE(statusor.ok());
-  EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Do not add XlaOp from builder b1 to builder main"));
+  EXPECT_THAT(
+      statusor.status().error_message(),
+      HasSubstr(
+          "built by builder 'b1', but is trying to use it in builder 'main'"));
 }
 
 TEST_F(XlaBuilderTest, ReshapeDefaultOrder) {
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
index b70b57e9ffec40188f246f5e884146012c02f4a2..0ffba208b1f8683fe1d26107cbfd096b856267f1 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
@@ -25,8 +25,6 @@ limitations under the License.
 namespace xla {
 
 // The computation graph that the user builds up with the XlaBuilder.
-//
-// TODO(b/74197823): Replace xla::Computation with this one.
 class XlaComputation {
  public:
   XlaComputation() : unique_id_(-1) {}
diff --git a/tensorflow/compiler/xla/error_spec.h b/tensorflow/compiler/xla/error_spec.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1463aa15941b9c265db94e2eb3cc176fab6695b
--- /dev/null
+++ b/tensorflow/compiler/xla/error_spec.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_ERROR_SPEC_H_
+#define TENSORFLOW_COMPILER_XLA_ERROR_SPEC_H_
+
+namespace xla {
+
+// Structure describing permissible absolute and relative error bounds.
+struct ErrorSpec {
+  explicit ErrorSpec(float aabs, float arel = 0, bool relaxed_nans = false)
+      : abs(aabs), rel(arel), relaxed_nans(relaxed_nans) {}
+
+  float abs;  // Absolute error bound.
+  float rel;  // Relative error bound.
+
+  // If relaxed_nans is true then any result is valid if we are expecting NaNs.
+  // In effect, this allows the tested operation to produce incorrect results
+  // for inputs outside its mathematical domain.
+  bool relaxed_nans;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_ERROR_SPEC_H_
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index c6f8f6766e9d0156d0c68306af214443f584a9fe..a76fdcda250168cbed2acd01bdd9ddc3b4c93b92 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -140,8 +140,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   LayoutUtil::SetToDefaultLayout(program_shape->mutable_result());
 }
 
-/* static */ tensorflow::Status LayoutUtil::ValidateLayoutInShape(
-    const Shape& shape) {
+/* static */ Status LayoutUtil::ValidateLayoutInShape(const Shape& shape) {
   if (ShapeUtil::IsTuple(shape)) {
     // Tuple shape.
     if (shape.has_layout()) {
@@ -150,12 +149,12 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
     for (auto& element_shape : shape.tuple_shapes()) {
       TF_RETURN_IF_ERROR(ValidateLayoutInShape(element_shape));
     }
-    return tensorflow::Status::OK();
+    return Status::OK();
   } else if (ShapeUtil::IsOpaque(shape)) {
     if (shape.has_layout()) {
       return InvalidArgument("opaque should not have a layout field");
     }
-    return tensorflow::Status::OK();
+    return Status::OK();
   } else {
     // Array shape.
     if (!shape.has_layout()) {
@@ -166,14 +165,14 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   }
 }
 
-/* static */ tensorflow::Status LayoutUtil::ValidateLayoutForShape(
-    const Layout& layout, const Shape& shape) {
+/* static */ Status LayoutUtil::ValidateLayoutForShape(const Layout& layout,
+                                                       const Shape& shape) {
   if (ShapeUtil::IsTuple(shape)) {
     return InvalidArgument("a single Layout is not valid for tuple shapes");
   }
 
   if (ShapeUtil::IsOpaque(shape)) {
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
 
   if (layout.format() == INVALID_FORMAT) {
@@ -225,7 +224,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
     }
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 /* static */ void LayoutUtil::ClearLayout(Shape* shape) {
@@ -384,7 +383,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 namespace {
 
 // Internal helper for recursively copying layouts.
-tensorflow::Status CopyLayoutInternal(const Shape& src, Shape* dst) {
+Status CopyLayoutInternal(const Shape& src, Shape* dst) {
   if (ShapeUtil::IsTuple(src) != ShapeUtil::IsTuple(*dst)) {
     return InvalidArgument(
         "cannot copy layout from shape: shape structure differs");
@@ -411,14 +410,13 @@ tensorflow::Status CopyLayoutInternal(const Shape& src, Shape* dst) {
       dst->clear_layout();
     }
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace
 
 /* static */
-tensorflow::Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src,
-                                                       Shape* dst) {
+Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
   return CopyLayoutInternal(src, dst);
 }
 
diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h
index 6cec7501015e2dff6b5e56e20b793a5458618501..d3d6a2cc94012f7113fd1cb1b17e9c9d5323d9bf 100644
--- a/tensorflow/compiler/xla/layout_util.h
+++ b/tensorflow/compiler/xla/layout_util.h
@@ -20,9 +20,9 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -61,12 +61,12 @@ class LayoutUtil {
   static void SetToDefaultLayout(ProgramShape* program_shape);
 
   // Validates that the layout within the given shape is correct.
-  static tensorflow::Status ValidateLayoutInShape(const Shape& shape);
+  static Status ValidateLayoutInShape(const Shape& shape);
 
   // Validates that the provided layout satisfies invariants for the given
   // shape.
-  static tensorflow::Status ValidateLayoutForShape(const Layout& layout,
-                                                   const Shape& shape);
+  static Status ValidateLayoutForShape(const Layout& layout,
+                                       const Shape& shape);
 
   // Clears the layout in the given Shape. After this function is called,
   // HasLayout will return false for the shape.
@@ -179,8 +179,7 @@ class LayoutUtil {
   // tuples.  'src' and 'dst' need not be compatible but the two shapes must
   // have the same tuple structure (if any) and arrays must have the same
   // rank. within the shapes must have the same number of dimensions.
-  static tensorflow::Status CopyLayoutBetweenShapes(const Shape& src,
-                                                    Shape* dst);
+  static Status CopyLayoutBetweenShapes(const Shape& src, Shape* dst);
 
   // Returns true if the layouts of lhs and rhs are equal, false
   // otherwise. Recursively compares layouts of tuples.
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index bc8405703b02dc1b0c4c87005ea3c15372552157..f42fb92359f40ec763866af094972046f6407ae1 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -47,6 +47,12 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
   flags->set_xla_gpu_use_cudnn_batchnorm(false);
+
+  // Run all GPU work on one stream by default.  Using multiple streams
+  // increases memory usage and we lack strong motivating benchmarks for tuning
+  // the heuristics needed to decide when to run on multiple streams.  See
+  // b/77879207.
+  flags->set_xla_gpu_disable_multi_streaming(true);
 }
 
 // Allocates flag_values and flag_objects; this function must not be called more
diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3696fdbe12e311af3b286ef0dfe91377983b72dd
--- /dev/null
+++ b/tensorflow/compiler/xla/literal_comparison.cc
@@ -0,0 +1,739 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/literal_comparison.h"
+
+#include <unistd.h>
+#include <cmath>
+#include <vector>
+
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/casts.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+
+using tensorflow::strings::Appendf;
+using tensorflow::strings::Printf;
+using tensorflow::strings::StrAppend;
+using tensorflow::strings::StrCat;
+
+namespace xla {
+namespace literal_comparison {
+namespace {
+
+// Helper function for comparing a floating point type, FloatT, bitwise equal
+// between the left-hand-side and right-hand-side, by bit-casting to UnsignedT
+// -- on miscompare, a nice error message is given in the AssertionFailure.
+template <typename FloatT, typename UnsignedT>
+Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
+  auto ulhs = tensorflow::bit_cast<UnsignedT>(lhs);
+  auto urhs = tensorflow::bit_cast<UnsignedT>(rhs);
+  auto lhs_double = static_cast<double>(lhs);
+  auto rhs_double = static_cast<double>(rhs);
+  if (ulhs != urhs) {
+    return InvalidArgument(
+        "floating values are not bitwise-equal; and equality testing "
+        "was requested: %s=%g=%a vs %s=%g=%a",
+        StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double, lhs_double,
+        StrCat(tensorflow::strings::Hex(urhs)).c_str(), rhs_double, rhs_double);
+  }
+  return Status::OK();
+}
+
+// Templated comparator that specializes for float equality comparison with the
+// bitwise helper above (this is the un-specialized fallback, to just use the
+// default gunit implementation).
+template <typename NativeT>
+Status CompareEqual(NativeT lhs, NativeT rhs) {
+  if (lhs == rhs) {
+    return Status::OK();
+  }
+  return InvalidArgument("Expected equality of these values:\n  %s\n  %s",
+                         StrCat(lhs).c_str(), StrCat(rhs).c_str());
+}
+
+// Specializations for floating types that do bitwise comparisons when equality
+// comparison is requested.
+template <>
+Status CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs) {
+  return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs);
+}
+template <>
+Status CompareEqual<Eigen::half>(Eigen::half lhs, Eigen::half rhs) {
+  return CompareFloatsBitwiseEqual<Eigen::half, uint16>(lhs, rhs);
+}
+template <>
+Status CompareEqual<float>(float lhs, float rhs) {
+  return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs);
+}
+template <>
+Status CompareEqual<double>(double lhs, double rhs) {
+  return CompareFloatsBitwiseEqual<double, uint64>(lhs, rhs);
+}
+template <>
+Status CompareEqual<complex64>(complex64 lhs, complex64 rhs) {
+  auto res = CompareEqual<float>(lhs.real(), rhs.real());
+  if (!res.ok()) {
+    return res;
+  }
+  return CompareEqual<float>(lhs.imag(), rhs.imag());
+}
+
+// A recursive function which iterates through every index of expected and
+// actual literal and compares their values elementwise. Returns true if all
+// elements are equal.
+template <typename NativeT>
+Status Equal(LiteralSlice expected, LiteralSlice actual,
+             tensorflow::gtl::MutableArraySlice<int64> multi_index,
+             int64 dimension) {
+  if (dimension == expected.shape().dimensions_size()) {
+    NativeT expected_value = expected.Get<NativeT>(multi_index);
+    NativeT actual_value = actual.Get<NativeT>(multi_index);
+    return CompareEqual<NativeT>(expected_value, actual_value);
+  }
+
+  Status result;
+  for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) {
+    multi_index[dimension] = i;
+    result.Update(Equal<NativeT>(expected, actual, multi_index, dimension + 1));
+  }
+  return result;
+}
+
+// Gets the total element count.  For tuples, this is not the count of tuple
+// elements, but the sum of elements of each tuple element.
+int64 RecursiveElementCount(const Shape& shape) {
+  if (ShapeUtil::IsTuple(shape)) {
+    const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
+    int64 total = 0;
+    for (int64 i = 0; i < tuple_elements; ++i) {
+      total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
+    }
+    return total;
+  } else {
+    return ShapeUtil::ElementsIn(shape);
+  }
+}
+
+// Returns whether the actual and expected values are mismatched with respect to
+// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec.
+template <typename NativeT>
+bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
+  if (relaxed_nans) {
+    return !std::isnan(expected) && std::isnan(actual);
+  } else {
+    return std::isnan(expected) != std::isnan(actual);
+  }
+}
+
+template <>
+bool NanMismatch<complex64>(complex64 expected, complex64 actual,
+                            bool relaxed_nans) {
+  return NanMismatch<float>(expected.real(), actual.real(), relaxed_nans) ||
+         NanMismatch<float>(expected.imag(), actual.imag(), relaxed_nans);
+}
+
+template <>
+bool NanMismatch<half>(half expected, half actual, bool relaxed_nans) {
+  return NanMismatch<float>(static_cast<float>(expected),
+                            static_cast<float>(actual), relaxed_nans);
+}
+
+// Converts the given floating-point value to a string.
+template <typename NativeT>
+string FpValueToString(NativeT value) {
+  return Printf("%8.4g", static_cast<double>(value));
+}
+
+template <>
+string FpValueToString<complex64>(complex64 value) {
+  return Printf("%8.4g + %8.4fi", value.real(), value.imag());
+}
+
+// Returns the absolute value of the given floating point value. This function
+// is used instead of std::abs directly in order to allow type-dependent
+// implementations for NearComparator.
+template <typename NativeT>
+float FpAbsoluteValue(NativeT value) {
+  return std::abs(value);
+}
+
+template <>
+float FpAbsoluteValue(bfloat16 value) {
+  return FpAbsoluteValue<float>(static_cast<float>(value));
+}
+
+template <>
+float FpAbsoluteValue(half value) {
+  return FpAbsoluteValue<float>(static_cast<float>(value));
+}
+
+// Helper class for comparing floating-point literals within an error bound.
+template <typename NativeT>
+class NearComparator {
+ public:
+  // Compares the two array literals elementwise and returns a comparison
+  // result. The comparison is ok() if all actual and expected elements are
+  // within the given error bound. In case of error, the status contains a
+  // detailed message about the discrepancy.
+  static Status Compare(const LiteralSlice& expected,
+                        const LiteralSlice& actual, ErrorSpec error,
+                        bool detailed_message,
+                        const MiscompareCallback& miscompare_callback) {
+    NearComparator<NativeT> comparator(expected, actual, error,
+                                       detailed_message, miscompare_callback);
+    return comparator.Run();
+  }
+
+ private:
+  // Data structure encapsulating metadata about a single element mismatch.
+  struct Mismatch {
+    NativeT actual;
+    NativeT expected;
+    float rel_error;
+    float abs_error;
+
+    // The linear index of the failure within the shape. This linear index is
+    // from the 'actual' literal.
+    int64 linear_index;
+
+    bool operator<(const Mismatch& other) const {
+      return rel_error < other.rel_error;
+    }
+
+    string ToString(const Shape& shape) const {
+      return Printf(
+          "actual %s, expected %s, index %s, rel error %8.3g, abs error %8.3g",
+          FpValueToString(actual).c_str(), FpValueToString(expected).c_str(),
+          Literal::MultiIndexAsString(
+              IndexUtil::LinearIndexToMultidimensionalIndex(shape,
+                                                            linear_index))
+              .c_str(),
+          rel_error, abs_error);
+    }
+  };
+
+  NearComparator(const LiteralSlice& expected, const LiteralSlice& actual,
+                 ErrorSpec error, bool detailed_message,
+                 const MiscompareCallback& miscompare_callback)
+      : expected_(expected),
+        actual_(actual),
+        error_(error),
+        detailed_message_(detailed_message),
+        miscompare_callback_(miscompare_callback),
+        abs_value_buckets_(kAbsValueBucketBounds.size() - 1, {0, 0}),
+        abs_error_buckets_(kErrorBucketBounds.size(), 0),
+        rel_error_buckets_(kErrorBucketBounds.size(), 0) {}
+
+  // Runs the comparison between expected and actual literals.
+  Status Run() {
+    VLOG(1) << "expected:";
+    XLA_VLOG_LINES(1, ToStringTruncated(expected_));
+    VLOG(1) << "actual:";
+    XLA_VLOG_LINES(1, ToStringTruncated(actual_));
+
+    // If the shapes mismatch, we simply fail the expectation instead of
+    // printing out data, as it's a type error rather than a value error.
+    TF_RETURN_IF_ERROR(EqualShapes(expected_.shape(), actual_.shape()));
+    if (!ShapeUtil::IsArray(expected_.shape())) {
+      return InvalidArgument("Expected array shape; got %s.",
+                             ShapeUtil::HumanString(expected_.shape()).c_str());
+    }
+
+    mismatches_ = Literal(ShapeUtil::ChangeElementType(actual_.shape(), PRED));
+    mismatches_.PopulateWithValue(false);
+
+    CompareLiterals();
+
+    if (num_mismatches_ == 0) {
+      return Status::OK();
+    } else if (!VLOG_IS_ON(1) && miscompare_callback_ != nullptr) {
+      miscompare_callback_(expected_, actual_, mismatches_);
+    }
+    return InvalidArgument("%s", ErrorMessage().c_str());
+  }
+
+  // Insert the given absolute value into the absolute value bucket vector. The
+  // bounds of the buckets are given by kAbsValueBucketBounds.
+  void UpdateAbsValueBucket(NativeT value, bool is_mismatch) {
+    // Adjust the bucket containing the absolute values of the 'actual'
+    // elements.
+    const float abs_value = FpAbsoluteValue(value);
+    for (int i = 0; i < abs_value_buckets_.size(); ++i) {
+      if (i == abs_value_buckets_.size() - 1 ||
+          (abs_value >= kAbsValueBucketBounds[i] &&
+           abs_value < kAbsValueBucketBounds[i + 1])) {
+        // The first value of the pair is the count of elements in the bucket,
+        // the second is the count of mismatches in the bucket.
+        abs_value_buckets_[i].first++;
+        if (is_mismatch) {
+          abs_value_buckets_[i].second++;
+        }
+        return;
+      }
+    }
+  }
+
+  // Insert the given error into the given error bucket vector.
+  void UpdateErrorBucket(
+      float error, tensorflow::gtl::MutableArraySlice<int64> error_buckets) {
+    CHECK_EQ(error_buckets.size(), kErrorBucketBounds.size());
+    for (int i = 0; i < error_buckets.size(); ++i) {
+      if (error >= kErrorBucketBounds[i]) {
+        error_buckets[i]++;
+      }
+    }
+  }
+
+  // Compares the two given elements from the expected and actual literals at
+  // the given literal_index and keeps track of various mismatch statistics.
+  void CompareValues(NativeT expected, NativeT actual, int64 linear_index) {
+    const bool is_nan_mismatch =
+        NanMismatch(expected, actual, error_.relaxed_nans);
+    float abs_error;
+    float rel_error;
+    if (actual == expected) {
+      abs_error = 0;
+      rel_error = 0;
+    } else if (is_nan_mismatch) {
+      num_nan_mismatches_++;
+      // A nan mismatch is considered to have infinite error. rel_error is used
+      // for sorting a std::set of the top mismatchs, and a nan value here will
+      // result in undefined behavior because nan's do not satisfy the strict
+      // weak ordering requirement of std containers.
+      abs_error = std::numeric_limits<float>::infinity();
+      rel_error = std::numeric_limits<float>::infinity();
+    } else {
+      abs_error = FpAbsoluteValue(actual - expected);
+      rel_error = abs_error / FpAbsoluteValue(expected);
+    }
+    const bool is_abs_mismatch = abs_error > error_.abs;
+    const bool is_rel_mismatch = rel_error > error_.rel;
+    const bool is_mismatch =
+        is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch);
+
+    // Update the error of the relative bucket only if the *absolute* error
+    // bound is exceeded and vice versa.
+    if (is_abs_mismatch) {
+      num_abs_mismatches_++;
+      UpdateErrorBucket(rel_error, &rel_error_buckets_);
+    }
+    if (is_rel_mismatch) {
+      num_rel_mismatches_++;
+      UpdateErrorBucket(abs_error, &abs_error_buckets_);
+    }
+
+    UpdateAbsValueBucket(actual, is_mismatch);
+
+    if (!is_mismatch) {
+      return;
+    }
+
+    num_mismatches_++;
+
+    // Keep track of the kTopRelativeErrorCount relative error mismatches.
+    if (top_rel_mismatches_.size() < kTopRelativeErrorCount ||
+        rel_error > top_rel_mismatches_.begin()->rel_error) {
+      Mismatch mismatch = {actual, expected, rel_error, abs_error,
+                           linear_index};
+      top_rel_mismatches_.insert(mismatch);
+      if (top_rel_mismatches_.size() > kTopRelativeErrorCount) {
+        top_rel_mismatches_.erase(top_rel_mismatches_.begin());
+      }
+    }
+
+    mismatches_.data<bool>()[linear_index] = true;
+  }
+
+  // Compares the two literals elementwise.
+  void CompareLiterals() {
+    // Fast path optimization for the case were layouts match.
+    if (LayoutUtil::Equal(actual_.shape().layout(),
+                          expected_.shape().layout())) {
+      tensorflow::gtl::ArraySlice<const NativeT> expected_data =
+          expected_.data<NativeT>();
+      tensorflow::gtl::ArraySlice<const NativeT> actual_data =
+          actual_.data<NativeT>();
+      const int64 len = expected_data.size();
+      for (int64 i = 0; i < len; ++i) {
+        CompareValues(expected_data[i], actual_data[i], i);
+      }
+      return;
+    }
+    std::vector<int64> multi_index(ShapeUtil::Rank(actual_.shape()), 0);
+    CompareLiteralsSlow(0, &multi_index);
+  }
+
+  // Slow path for CompareLiterals when 'actual' and 'expected' literals have
+  // different layouts. In this case, multidimensional indices are constructed
+  // and indexed for each element.
+  void CompareLiteralsSlow(int64 dimension, std::vector<int64>* multi_index) {
+    if (dimension == multi_index->size()) {
+      CompareValues(expected_.Get<NativeT>(*multi_index),
+                    actual_.Get<NativeT>(*multi_index),
+                    IndexUtil::MultidimensionalIndexToLinearIndex(
+                        actual_.shape(), *multi_index));
+    } else {
+      for (int64 i = 0; i < expected_.shape().dimensions(dimension); ++i) {
+        (*multi_index)[dimension] = i;
+        CompareLiteralsSlow(dimension + 1, multi_index);
+      }
+    }
+  }
+
+  // Returns an error message string with a detailed breakdown of the
+  // mismatches. Called after calling Run().
+  string ErrorMessage() {
+    string out;
+    int64 element_count = ShapeUtil::ElementsIn(actual_.shape());
+
+    auto percent_string = [](float a, float b) {
+      float pct = b == 0.0 ? 0.0 : 100.0 * a / b;
+      return Printf("%0.4f%%", pct);
+    };
+
+    Appendf(&out,
+            "\nMismatch count %lld (%s) in shape %s (%lld elements), abs bound "
+            "%g, rel bound %g\n",
+            num_mismatches_,
+            percent_string(num_mismatches_, element_count).c_str(),
+            ShapeUtil::HumanString(actual_.shape()).c_str(),
+            ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel);
+    if (num_nan_mismatches_ > 0) {
+      StrAppend(&out, "nan mismatches ", num_nan_mismatches_, "\n");
+    }
+    Appendf(&out, "Top relative error mismatches:\n");
+    for (auto it = top_rel_mismatches_.rbegin();
+         it != top_rel_mismatches_.rend(); ++it) {
+      StrAppend(&out, "  ", it->ToString(actual_.shape()).c_str(), "\n");
+    }
+
+    if (!detailed_message_) {
+      return out;
+    }
+
+    StrAppend(&out, "Absolute magnitude breakdown of actual values:\n");
+    CHECK_EQ(abs_value_buckets_.size() + 1, kAbsValueBucketBounds.size());
+    for (int i = 0; i < abs_value_buckets_.size(); ++i) {
+      const int64 bucket_size = abs_value_buckets_[i].first;
+      const int64 bucket_mismatches = abs_value_buckets_[i].second;
+      string mismatch_str = bucket_mismatches > 0
+                                ? Printf(", mismatches %lld", bucket_mismatches)
+                                : "";
+      Appendf(&out, "  %-6g <= x < %-6g : %7lld (%9s)%s\n",
+              kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1],
+              bucket_size, percent_string(bucket_size, element_count).c_str(),
+              mismatch_str.c_str());
+    }
+
+    auto print_accum_buckets = [&](const string& header, int64 total,
+                                   tensorflow::gtl::ArraySlice<int64> buckets) {
+      StrAppend(&out, header, ":\n");
+      Appendf(&out, "  <  %-6g : %7lld (%s)\n", kErrorBucketBounds[0],
+              total - buckets[0],
+              percent_string(total - buckets[0], total).c_str());
+      CHECK_EQ(buckets.size(), kErrorBucketBounds.size());
+      for (int i = 0; i < kErrorBucketBounds.size(); ++i) {
+        Appendf(&out, "  >= %-6g : %7lld (%s)\n", kErrorBucketBounds[i],
+                buckets[i], percent_string(buckets[i], total).c_str());
+      }
+    };
+    Appendf(&out, "Elements exceeding abs error bound %g: %lld (%s)\n",
+            error_.abs, num_abs_mismatches_,
+            percent_string(num_abs_mismatches_, element_count).c_str());
+    print_accum_buckets(
+        "Relative error breakdown of elements exceeding abs error bound",
+        num_abs_mismatches_, rel_error_buckets_);
+    Appendf(&out, "Elements exceeding rel error bound %g: %lld (%s)\n",
+            error_.rel, num_rel_mismatches_,
+            percent_string(num_rel_mismatches_, element_count).c_str());
+    print_accum_buckets(
+        "Absolute error breakdown of elements exceeding rel error bound",
+        num_rel_mismatches_, abs_error_buckets_);
+    return out;
+  }
+
+  // 'actual' and 'expected' literals being compared.
+  LiteralSlice expected_;
+  LiteralSlice actual_;
+
+  // The error bounds of the comparison.
+  ErrorSpec error_;
+
+  // Whether to include detailed breakdown of mismatches in the error message.
+  bool detailed_message_;
+
+  // Callback to invoke on miscompare.
+  MiscompareCallback miscompare_callback_;
+
+  // Number of element element mismatches encountered so far.
+  int64 num_mismatches_ = 0;
+
+  // Number of elements with a nan mismatch.
+  int64 num_nan_mismatches_ = 0;
+
+  // Number of elements which exceed the absolute/relative error bound.
+  int64 num_abs_mismatches_ = 0;
+  int64 num_rel_mismatches_ = 0;
+
+  // A Literal containing which elements did not match in the expected and
+  // actual literals. mismatches_ contains PREDs and is of the same sizes as
+  // the comparison literals.
+  Literal mismatches_;
+
+  // The number of mismatches to report in the output, sorted by relative error
+  // magnitude.
+  static constexpr int64 kTopRelativeErrorCount = 5;
+
+  // The set of mismatches with the largest relative error. The size of this set
+  // is bounded by kTopRelativeErrorCount.
+  std::multiset<Mismatch> top_rel_mismatches_;
+
+  // Actual values are bucketed by absolute value. kAbsValueBucketBounds is the
+  // bounds of these buckets. abs_value_buckets_ contains a pair for each
+  // bucket: the element count and failure count.
+  static constexpr std::array<float, 7> kAbsValueBucketBounds = {
+      0.0, 0.0001, 0.001, 0.01, 0.1, 1, std::numeric_limits<float>::infinity()};
+  std::vector<std::pair<int64, int64>> abs_value_buckets_;
+
+  // Buckets for relative and absolute errors. The relative error buckets only
+  // contains those elements which exceed the *absolute* error bound, and vice
+  // versa. This makes it easy to see the effect of adjusting the relative (or
+  // absolute) error bound on the success of the comparison. kErrorBucketBounds
+  // are the lower bounds of the buckets in both vectors. The error buckets are
+  // a cumulative distribution so an error value may appear in more than one
+  // bucket. For example an error value of 0.003 may appear in the buckets
+  // bounded by 0.01, 0.1, and 1.0.
+  static constexpr std::array<float, 5> kErrorBucketBounds = {0.0001, 0.001,
+                                                              0.01, 0.1, 1};
+  std::vector<int64> abs_error_buckets_;
+  std::vector<int64> rel_error_buckets_;
+};
+
+template <typename NativeT>
+constexpr std::array<float, 7> NearComparator<NativeT>::kAbsValueBucketBounds;
+template <typename NativeT>
+constexpr std::array<float, 5> NearComparator<NativeT>::kErrorBucketBounds;
+
+// Helper function for comparing two literals for nearness. Handles tuple-shapes
+// via recursion. shape_index is the ShapeIndex of expected (or actual)
+// currently being compared.
+Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual,
+                  const ErrorSpec& error, bool detailed_message,
+                  const MiscompareCallback& miscompare_callback,
+                  const ShapeIndex& shape_index) {
+  TF_RETURN_IF_ERROR(EqualShapes(expected.shape(), actual.shape()));
+
+  if (ShapeUtil::IsTuple(expected.shape())) {
+    Status return_status;
+    for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
+      const auto expected_element = LiteralSlice(expected, {i});
+      const auto actual_element = LiteralSlice(actual, {i});
+      ShapeIndex element_index = shape_index;
+      element_index.push_back(i);
+      Status res =
+          NearHelper(expected_element, actual_element, error, detailed_message,
+                     miscompare_callback, element_index);
+      if (!res.ok()) {
+        string err_message = Printf("\nArray at shape index %s%s",
+                                    element_index.ToString().c_str(),
+                                    res.error_message().c_str());
+        if (return_status.ok()) {
+          return_status = res;
+        } else {
+          return_status = AppendStatus(return_status, res.error_message());
+        }
+      }
+    }
+    if (!return_status.ok() && shape_index.empty()) {
+      // Emit a top-level error message containing the top-level shape in case
+      // of mismatch.
+      int64 total_elements = RecursiveElementCount(actual.shape());
+      return_status = InvalidArgument(
+          "\nMismatches in shape %s (%lld elements):\n%s",
+          ShapeUtil::HumanString(actual.shape()).c_str(), total_elements,
+          return_status.error_message().c_str());
+    }
+    return return_status;
+  }
+
+  if (ShapeUtil::ElementIsFloating(expected.shape()) ||
+      ShapeUtil::ElementIsComplex(expected.shape())) {
+    switch (expected.shape().element_type()) {
+      case BF16:
+        return NearComparator<bfloat16>::Compare(
+            expected, actual, error, detailed_message, miscompare_callback);
+        break;
+      case F16:
+        return NearComparator<half>::Compare(
+            expected, actual, error, detailed_message, miscompare_callback);
+        break;
+      case F32:
+        return NearComparator<float>::Compare(
+            expected, actual, error, detailed_message, miscompare_callback);
+        break;
+      case F64:
+        return NearComparator<double>::Compare(
+            expected, actual, error, detailed_message, miscompare_callback);
+        break;
+      case C64:
+        return NearComparator<complex64>::Compare(
+            expected, actual, error, detailed_message, miscompare_callback);
+        break;
+      default:
+        LOG(FATAL) << "Unsupported primitive type in near comparator: "
+                   << PrimitiveType_Name(expected.shape().element_type())
+                   << ". Must be floating-point type.";
+    }
+  }
+
+  // Non-floating point literal.
+  return literal_comparison::Equal(expected, actual);
+}
+
+}  // namespace
+
+Status EqualShapes(const Shape& expected, const Shape& actual) {
+  if (ShapeUtil::IsTuple(expected) != ShapeUtil::IsTuple(actual)) {
+    return InvalidArgument("tupleness-mismatch! want: %s got %s",
+                           ShapeUtil::HumanString(expected).c_str(),
+                           ShapeUtil::HumanString(actual).c_str());
+  }
+  if (ShapeUtil::IsTuple(expected)) {
+    if (ShapeUtil::TupleElementCount(expected) !=
+        ShapeUtil::TupleElementCount(actual)) {
+      return InvalidArgument(
+          "want tuple element count: %lld got tuple element count: %lld",
+          ShapeUtil::TupleElementCount(expected),
+          ShapeUtil::TupleElementCount(actual));
+    }
+    for (int i = 0; i < expected.tuple_shapes_size(); ++i) {
+      Status result =
+          EqualShapes(expected.tuple_shapes(i), actual.tuple_shapes(i));
+      if (!result.ok()) {
+        return AppendStatus(result, StrCat("mismatch in tuple index", i));
+      }
+    }
+  } else {
+    if (ShapeUtil::Rank(expected) != ShapeUtil::Rank(actual)) {
+      return InvalidArgument("want rank of %s got rank of %s",
+                             ShapeUtil::HumanString(expected).c_str(),
+                             ShapeUtil::HumanString(actual).c_str());
+    }
+    if (expected.element_type() != actual.element_type()) {
+      return InvalidArgument(
+          "mismatch in primitive type %s vs %s",
+          PrimitiveType_Name(expected.element_type()).c_str(),
+          PrimitiveType_Name(actual.element_type()).c_str());
+    }
+    if (expected.dimensions_size() != actual.dimensions_size()) {
+      return InvalidArgument("want dimensions_size %d got dimensions_size %d",
+                             expected.dimensions_size(),
+                             actual.dimensions_size());
+    }
+    for (int i = 0; i < expected.dimensions_size(); ++i) {
+      if (expected.dimensions(i) != actual.dimensions(i)) {
+        return InvalidArgument(
+            "mismatch in dimension #%d expected: %s actual: %s", i,
+            ShapeUtil::HumanString(expected).c_str(),
+            ShapeUtil::HumanString(actual).c_str());
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status Equal(const LiteralSlice& expected, const LiteralSlice& actual) {
+  VLOG(1) << "expected:";
+  XLA_VLOG_LINES(1, expected.ToString());
+  VLOG(1) << "actual:";
+  XLA_VLOG_LINES(1, actual.ToString());
+
+  TF_RETURN_IF_ERROR(EqualShapes(expected.shape(), actual.shape()));
+  std::vector<int64> multi_index(expected.shape().dimensions_size(), 0);
+  Status result;
+  switch (expected.shape().element_type()) {
+    case PRED:
+      result = Equal<bool>(expected, actual, &multi_index, 0);
+      break;
+    case U8:
+      result = Equal<uint8>(expected, actual, &multi_index, 0);
+      break;
+    case S32:
+      result = Equal<int32>(expected, actual, &multi_index, 0);
+      break;
+    case S64:
+      result = Equal<int64>(expected, actual, &multi_index, 0);
+      break;
+    case U32:
+      result = Equal<uint32>(expected, actual, &multi_index, 0);
+      break;
+    case U64:
+      result = Equal<uint64>(expected, actual, &multi_index, 0);
+      break;
+    case BF16:
+      result = Equal<bfloat16>(expected, actual, &multi_index, 0);
+      break;
+    case F16:
+      result = Equal<half>(expected, actual, &multi_index, 0);
+      break;
+    case F32:
+      result = Equal<float>(expected, actual, &multi_index, 0);
+      break;
+    case F64:
+      result = Equal<double>(expected, actual, &multi_index, 0);
+      break;
+    case C64:
+      result = Equal<complex64>(expected, actual, &multi_index, 0);
+      break;
+    case TUPLE: {
+      for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
+        result.Update(
+            Equal(LiteralSlice(expected, {i}), LiteralSlice(actual, {i})));
+      }
+      break;
+    }
+    default:
+      LOG(FATAL)
+          << "Unsupported primitive type in LiteralTestUtil::ExpectEqual: "
+          << PrimitiveType_Name(expected.shape().element_type());
+  }
+
+  if (result.ok()) {
+    return Status::OK();
+  }
+
+  return AppendStatus(result,
+                      tensorflow::strings::Printf("expected: %s\nactual:   %s",
+                                                  expected.ToString().c_str(),
+                                                  actual.ToString().c_str()));
+}
+
+Status Near(const LiteralSlice& expected, const LiteralSlice& actual,
+            const ErrorSpec& error, bool detailed_message,
+            const MiscompareCallback& miscompare_callback) {
+  return NearHelper(expected, actual, error, detailed_message,
+                    miscompare_callback,
+                    /*shape_index=*/{});
+}
+
+string ToStringTruncated(const LiteralSlice& literal) {
+  return RecursiveElementCount(literal.shape()) < 1000
+             ? literal.ToString()
+             : "[TRUNCATED, Literal with more than 1000 values]";
+}
+
+}  // namespace literal_comparison
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/literal_comparison.h b/tensorflow/compiler/xla/literal_comparison.h
new file mode 100644
index 0000000000000000000000000000000000000000..00a13e361932e74a9a1e614d5c851d3851208852
--- /dev/null
+++ b/tensorflow/compiler/xla/literal_comparison.h
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Library for comparing literals without taking a dependency on testing
+// libraries.
+
+#ifndef TENSORFLOW_COMPILER_XLA_LITERAL_COMPARISON_H_
+#define TENSORFLOW_COMPILER_XLA_LITERAL_COMPARISON_H_
+
+#include "tensorflow/compiler/xla/error_spec.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace xla {
+namespace literal_comparison {
+
+// Returns ok if the given shapes have the same rank, dimension sizes, and
+// primitive types.
+Status EqualShapes(const Shape& expected, const Shape& actual);
+
+// Returns ok if the expected and actual literals are (bitwise) equal for all
+// elements in the literal. Also, asserts that the rank, dimensions sizes, and
+// primitive type are equal.
+Status Equal(const LiteralSlice& expected, const LiteralSlice& actual);
+
+using MiscompareCallback =
+    std::function<void(const LiteralSlice& expected, const LiteralSlice& actual,
+                       const LiteralSlice& mismatches)>;
+
+// Inspects whether the expected and actual literals are within the given error
+// bound for all elements. Also, inspects whether the rank, dimensions sizes,
+// and dimension bounds are equivalent.
+//
+// Tuples are matched recursively.
+//
+// When comparing tensors of non-floating-point type, this inspects for exact
+// equality, ignoring the ErrorSpec.
+//
+// If the shape of the literals is neither a complex/floating-point tensor nor a
+// tuple which contains a complex/floating-point tensor, Near() is equivalent to
+// Equal(). We don't raise an error in this case, because we want to allow
+// callers to call Near() even if they have no preconceptions about the shapes
+// being compared.
+//
+// If detailed_message is true, then the error message in the assertion result
+// will contain a more detailed breakdown of mismatches.
+Status Near(const LiteralSlice& expected, const LiteralSlice& actual,
+            const ErrorSpec& error, bool detailed_message,
+            const MiscompareCallback& miscompare_callback);
+
+// Calling ToString on a literal with over 100 million elements takes around
+// 3 minutes.  The utility of printing a literal with >1000 elements is
+// questionable, especially when writing the Literal proto to disk is orders
+// of magnitude faster.
+string ToStringTruncated(const LiteralSlice& literal);
+
+}  // namespace literal_comparison
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_LITERAL_COMPARISON_H_
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index b3b5e34ba220c7e9bf1cefef4b27baa6faee2c20..4c560767dc603bf805f365d594810f4df7e90ed3 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -62,8 +62,49 @@ void ConvertEndianShort(char* bytes, int64 size) {
   }
 }
 
+// Return a literal with all arrays of type FromNativeT converted to type
+// ToNativeT in the given literal.
+template <typename FromNativeT, typename ToNativeT>
+std::unique_ptr<Literal> ConvertType(LiteralSlice literal) {
+  // First construct shape of the result.
+  Shape result_shape(literal.shape());
+  ShapeUtil::ForEachMutableSubshape(
+      &result_shape, [](Shape* subshape, const ShapeIndex&) {
+        if (subshape->element_type() ==
+            primitive_util::NativeToPrimitiveType<FromNativeT>()) {
+          subshape->set_element_type(
+              primitive_util::NativeToPrimitiveType<ToNativeT>());
+        }
+      });
+  auto result = MakeUnique<Literal>(result_shape);
+
+  // Then copy over the data from 'literal' converting FromNativeT values to
+  // ToNativeT values as necessary.
+  ShapeUtil::ForEachSubshape(
+      literal.shape(),
+      [&](const Shape& subshape, const ShapeIndex& shape_index) {
+        if (ShapeUtil::IsArray(subshape)) {
+          if (subshape.element_type() ==
+              primitive_util::NativeToPrimitiveType<FromNativeT>()) {
+            auto src = literal.data<FromNativeT>(shape_index);
+            auto dest = result->data<ToNativeT>(shape_index);
+            for (int64 i = 0; i < src.size(); ++i) {
+              dest[i] = static_cast<ToNativeT>(src[i]);
+            }
+          } else {
+            TF_CHECK_OK(result->CopyFrom(literal,
+                                         /*dest_shape_index=*/shape_index,
+                                         /*src_shape_index=*/shape_index));
+          }
+        }
+      });
+  return result;
+}
+
 }  // namespace
 
+LiteralBase::~LiteralBase() {}
+
 std::ostream& operator<<(std::ostream& out, const Literal& literal) {
   out << literal.ToString();
   return out;
@@ -95,99 +136,89 @@ Literal::StrideConfig::StrideConfig(
 Literal::Literal(const Shape& shape)
     : Literal(shape, /*allocate_arrays=*/true) {}
 
-Literal::Literal(const Shape& shape, bool allocate_arrays)
-    : shape_(shape), pieces_(shape), owns_buffers_(true) {
-  CHECK(LayoutUtil::HasLayout(shape));
-  for (auto& pair : pieces_) {
-    const ShapeIndex& index = pair.first;
-    Piece& piece = pair.second;
-
-    piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index));
-    const Shape& subshape = piece.subshape();
-    if (ShapeUtil::IsArray(subshape)) {
-      if (allocate_arrays) {
-        if (LayoutUtil::IsSparseArray(subshape)) {
-          // For sparse arrays, the buffer must be of the size of the maximum
-          // number of sparse elements possible.
-          const int64 max_sparse_elements =
-              LayoutUtil::MaxSparseElements(subshape.layout());
-          piece.set_buffer(
-              new char[max_sparse_elements * ShapeUtil::ByteSizeOfPrimitiveType(
-                                                 subshape.element_type())]);
-          piece.set_sparse_indices(new SparseIndexArray(
-              max_sparse_elements, ShapeUtil::Rank(subshape)));
-        } else {
-          piece.set_buffer(new char[piece.size_bytes()]);
-        }
+void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) {
+  if (ShapeUtil::IsTuple(shape)) {
+    for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
+      const Shape& subshape = shape.tuple_shapes(i);
+
+      auto child_piece = Piece();
+      child_piece.set_subshape(&subshape);
+
+      SetPiece(subshape, &child_piece, allocate_arrays);
+
+      piece->emplace_back(std::move(child_piece));
+    }
+  } else {
+    CHECK(ShapeUtil::IsArray(shape));
+    if (allocate_arrays) {
+      if (LayoutUtil::IsSparseArray(shape)) {
+        // For sparse arrays, the buffer must be of the size of the maximum
+        // number of sparse elements possible.
+        const int64 max_sparse_elements =
+            LayoutUtil::MaxSparseElements(shape.layout());
+        piece->set_buffer(
+            new char[max_sparse_elements *
+                     ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type())]);
+        piece->set_sparse_indices(
+            new SparseIndexArray(max_sparse_elements, ShapeUtil::Rank(shape)));
       } else {
-        piece.set_buffer(nullptr);
+        piece->set_buffer(new char[piece->size_bytes()]);
       }
     }
   }
 }
 
-Literal::~Literal() { DeallocateBuffers(); }
+Literal::Literal(const Shape& shape, bool allocate_arrays)
+    : LiteralBase(), shape_(MakeUnique<Shape>(shape)) {
+  CHECK(LayoutUtil::HasLayout(*shape_));
+  root_piece_ = new Piece();
+  root_piece_->set_subshape(shape_.get());
+  CHECK(&root_piece_->subshape() == shape_.get());
 
-void Literal::DeallocateBuffers() {
-  if (owns_buffers_) {
-    for (auto& pair : pieces_) {
-      Piece& piece = pair.second;
-      if (piece.buffer() != nullptr) {
-        delete[] piece.buffer();
-        delete piece.sparse_indices();
-      }
-    }
-  }
+  SetPiece(*shape_, root_piece_, allocate_arrays);
 }
 
-Literal::Literal(Literal&& other) {
-  shape_ = std::move(other.shape_);
-  pieces_ = std::move(other.pieces_);
-  // We need to iterate through the pieces to set the subshape pointer
-  // properly. It must refer to subshapes within shape_.
-  for (auto& pair : pieces_) {
-    const ShapeIndex& index = pair.first;
-    Piece& piece = pair.second;
-    piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index));
+Literal::~Literal() {
+  if (root_piece_ != nullptr) {
+    DeallocateBuffers();
+    delete root_piece_;
   }
-  owns_buffers_ = other.owns_buffers_;
+}
 
-  other.shape_ = ShapeUtil::MakeNil();
-  other.pieces_ = ShapeTree<Piece>(other.shape_);
-  other.piece({}).set_subshape(&other.shape_);
+void Literal::DeallocateBuffers() {
+  root_piece_->ForEachMutableSubpiece(
+      [&](const ShapeIndex& index, Piece* piece) {
+        if (piece->buffer() != nullptr) {
+          delete[] piece->buffer();
+          delete piece->sparse_indices();
+        }
+      });
 }
 
+Literal::Literal(Literal&& other) : LiteralBase() { *this = std::move(other); }
+
 Literal& Literal::operator=(Literal&& other) {
-  DeallocateBuffers();
-  shape_ = std::move(other.shape_);
-  pieces_ = std::move(other.pieces_);
-  // We need to iterate through the pieces to set the subshape pointer
-  // properly. It must refer to subshapes within shape_.
-  for (auto& pair : pieces_) {
-    const ShapeIndex& index = pair.first;
-    Piece& piece = pair.second;
-    piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index));
-  }
-  owns_buffers_ = other.owns_buffers_;
-
-  other.shape_ = ShapeUtil::MakeNil();
-  other.pieces_ = ShapeTree<Piece>(other.shape_);
-  other.piece({}).set_subshape(&other.shape_);
+  DCHECK(&other.root_piece_->subshape() == other.shape_.get());
+  using std::swap;
+  swap(shape_, other.shape_);
+  swap(root_piece_, other.root_piece_);
+  DCHECK(&root_piece_->subshape() == shape_.get());
+
   return *this;
 }
 
-std::unique_ptr<Literal> Literal::CreateFromShape(const Shape& shape) {
+std::unique_ptr<Literal> LiteralBase::CreateFromShape(const Shape& shape) {
   auto literal = MakeUnique<Literal>(shape);
-  for (auto& pair : literal->pieces_) {
-    Piece& piece = pair.second;
-    if (ShapeUtil::IsArray(piece.subshape())) {
-      memset(piece.untyped_data(), 0, piece.size_bytes());
-    }
-  }
+  literal->root_piece_->ForEachMutableSubpiece(
+      [&](const ShapeIndex& index, Piece* piece) {
+        if (ShapeUtil::IsArray(piece->subshape())) {
+          memset(piece->untyped_data(), 0, piece->size_bytes());
+        }
+      });
   return literal;
 }
 
-const SparseIndexArray* Literal::sparse_indices(
+const SparseIndexArray* LiteralBase::sparse_indices(
     const ShapeIndex& shape_index) const {
   return piece(shape_index).sparse_indices();
 }
@@ -202,9 +233,19 @@ SparseIndexArray* Literal::sparse_indices(const ShapeIndex& shape_index) {
   return CreateFromShape(ShapeUtil::MakeShape(primitive_type, dimensions));
 }
 
+/* static */ std::unique_ptr<Literal> Literal::ConvertBF16ToF32(
+    const LiteralSlice& bf16_literal) {
+  return ConvertType<bfloat16, float>(bf16_literal);
+}
+
+/* static */ std::unique_ptr<Literal> Literal::ConvertF32ToBF16(
+    const LiteralSlice& f32_literal) {
+  return ConvertType<float, bfloat16>(f32_literal);
+}
+
 template <typename NativeT>
 Status Literal::CopySliceFromInternal(
-    const Literal& src_literal, tensorflow::gtl::ArraySlice<int64> src_base,
+    const LiteralBase& src_literal, tensorflow::gtl::ArraySlice<int64> src_base,
     tensorflow::gtl::ArraySlice<int64> dest_base,
     tensorflow::gtl::ArraySlice<int64> copy_size) {
   TF_RET_CHECK(ShapeUtil::Rank(src_literal.shape()) == src_base.size());
@@ -264,7 +305,7 @@ Status Literal::CopySliceFromInternal(
   return Status::OK();
 }
 
-Status Literal::CopyElementFrom(const Literal& src_literal,
+Status Literal::CopyElementFrom(const LiteralSlice& src_literal,
                                 tensorflow::gtl::ArraySlice<int64> src_index,
                                 tensorflow::gtl::ArraySlice<int64> dest_index) {
   DCHECK_EQ(shape().element_type(), src_literal.shape().element_type());
@@ -293,22 +334,21 @@ std::vector<Literal> Literal::DecomposeTuple() {
     elements.push_back(Literal(ShapeUtil::GetSubshape(shape(), {i}),
                                /*allocate_arrays=*/false));
     Literal& element = elements.back();
-    for (auto& pair : element.pieces_) {
-      const ShapeIndex& index = pair.first;
-      Piece& dest_piece = pair.second;
-      ShapeIndex src_index = {i};
-      for (int64 j : index) {
-        src_index.push_back(j);
-      }
-      Piece& src_piece = piece(src_index);
-
-      // Move the respective buffer and sparse indices over to the element
-      // Literal.
-      dest_piece.set_buffer(src_piece.buffer());
-      src_piece.set_buffer(nullptr);
-      dest_piece.set_sparse_indices(src_piece.sparse_indices());
-      src_piece.set_sparse_indices(nullptr);
-    }
+    element.root_piece_->ForEachMutableSubpiece(
+        [&](const ShapeIndex& index, Piece* dest_piece) {
+          ShapeIndex src_index = {i};
+          for (int64 j : index) {
+            src_index.push_back(j);
+          }
+          Piece& src_piece = piece(src_index);
+
+          // Move the respective buffer and sparse indices over to the element
+          // Literal.
+          dest_piece->set_buffer(src_piece.buffer());
+          src_piece.set_buffer(nullptr);
+          dest_piece->set_sparse_indices(src_piece.sparse_indices());
+          src_piece.set_sparse_indices(nullptr);
+        });
   }
   // Set this literal to be nil-shaped.
   *this = Literal();
@@ -351,7 +391,9 @@ void CopyElementsBetween(tensorflow::gtl::MutableArraySlice<NativeT> dest,
 
 }  // namespace
 
-Status Literal::Piece::CopyFrom(const Literal::Piece& src) {
+Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) {
+  CHECK(subshape_ != nullptr);
+  CHECK(src.subshape_ != nullptr);
   if (ShapeUtil::Equal(subshape(), src.subshape())) {
     // If the layouts are equal it's faster just to memcpy.
     memcpy(buffer(), src.buffer(), src.size_bytes());
@@ -388,7 +430,7 @@ Status Literal::Piece::CopyFrom(const Literal::Piece& src) {
   return Status::OK();
 }
 
-Status Literal::CopyFrom(const Literal& src_literal,
+Status Literal::CopyFrom(const LiteralSlice& src_literal,
                          const ShapeIndex& dest_shape_index,
                          const ShapeIndex& src_shape_index) {
   const Shape& dest_subshape =
@@ -401,36 +443,32 @@ Status Literal::CopyFrom(const Literal& src_literal,
         ShapeUtil::HumanString(dest_subshape).c_str(),
         ShapeUtil::HumanString(src_subshape).c_str());
   }
+  return root_piece_->ForEachMutableSubpieceWithStatus(
+      [&](const ShapeIndex& index, Piece* piece) {
+        if (!ShapeUtil::IsArray(piece->subshape())) {
+          return Status::OK();
+        }
 
-  for (auto& pair : pieces_) {
-    const ShapeIndex& index = pair.first;
-    Piece& piece = pair.second;
-    if (!ShapeUtil::IsArray(piece.subshape())) {
-      continue;
-    }
-
-    // Determine if this index is in the part of this literal that we want to
-    // copy over from src_literal.
-    bool in_subtree_to_copy = true;
-    for (int i = 0; i < dest_shape_index.size(); ++i) {
-      if (index[i] != dest_shape_index[i]) {
-        in_subtree_to_copy = false;
-        break;
-      }
-    }
-    if (!in_subtree_to_copy) {
-      continue;
-    }
-
-    // Construct the index of the corresponding piece in the source literal.
-    ShapeIndex src_piece_index = src_shape_index;
-    for (int64 i = dest_shape_index.size(); i < index.size(); ++i) {
-      src_piece_index.push_back(index[i]);
-    }
-
-    TF_RETURN_IF_ERROR(piece.CopyFrom(src_literal.piece(src_piece_index)));
-  }
-  return Status::OK();
+        // Determine if this index is in the part of this literal that we want
+        // to copy over from src_literal.
+        bool in_subtree_to_copy = true;
+        for (int i = 0; i < dest_shape_index.size(); ++i) {
+          if (index[i] != dest_shape_index[i]) {
+            in_subtree_to_copy = false;
+            break;
+          }
+        }
+        if (!in_subtree_to_copy) {
+          return Status::OK();
+        }
+        // Construct the index of the corresponding piece in the source literal.
+        ShapeIndex src_piece_index = src_shape_index;
+        for (int64 i = dest_shape_index.size(); i < index.size(); ++i) {
+          src_piece_index.push_back(index[i]);
+        }
+        TF_RETURN_IF_ERROR(piece->CopyFrom(src_literal.piece(src_piece_index)));
+        return Status::OK();
+      });
 }
 
 Status Literal::MoveFrom(Literal&& src_literal,
@@ -444,37 +482,32 @@ Status Literal::MoveFrom(Literal&& src_literal,
         ShapeUtil::HumanString(src_literal.shape()).c_str());
   }
 
-  if (!(owns_buffers_ && src_literal.owns_buffers_)) {
-    return InvalidArgument(
-        "Source and destination literals must both own their buffers (ie, not "
-        "be views)");
-  }
+  src_literal.root_piece_->ForEachSubpiece(
+      [&](const ShapeIndex& src_index, const Piece& src_piece) {
+        if (!ShapeUtil::IsArray(src_piece.subshape())) {
+          return;
+        }
 
-  for (auto& pair : src_literal.pieces_) {
-    const ShapeIndex& src_index = pair.first;
-    Piece& src_piece = pair.second;
-    if (!ShapeUtil::IsArray(src_piece.subshape())) {
-      continue;
-    }
+        ShapeIndex dest_index = dest_shape_index;
+        for (int64 i : src_index) {
+          dest_index.push_back(i);
+        }
+        Piece& dest_piece = piece(dest_index);
+        delete[] dest_piece.buffer();
+        dest_piece.set_buffer(src_piece.buffer());
+        delete dest_piece.sparse_indices();
+        dest_piece.set_sparse_indices(src_piece.sparse_indices());
+      });
 
-    ShapeIndex dest_index = dest_shape_index;
-    for (int64 i : src_index) {
-      dest_index.push_back(i);
-    }
-    Piece& dest_piece = piece(dest_index);
-    delete[] dest_piece.buffer();
-    dest_piece.set_buffer(src_piece.buffer());
-    delete dest_piece.sparse_indices();
-    dest_piece.set_sparse_indices(src_piece.sparse_indices());
-  }
+  src_literal.shape_ = MakeUnique<Shape>(ShapeUtil::MakeNil());
+  delete src_literal.root_piece_;
+  src_literal.root_piece_ = new LiteralBase::Piece();
+  src_literal.root_piece_->set_subshape(src_literal.shape_.get());
 
-  src_literal.shape_ = ShapeUtil::MakeNil();
-  src_literal.pieces_ = ShapeTree<Piece>(src_literal.shape_);
-  src_literal.piece({}).set_subshape(&src_literal.shape_);
   return Status::OK();
 }
 
-Status Literal::CopySliceFrom(const Literal& src_literal,
+Status Literal::CopySliceFrom(const LiteralSlice& src_literal,
                               tensorflow::gtl::ArraySlice<int64> src_base,
                               tensorflow::gtl::ArraySlice<int64> dest_base,
                               tensorflow::gtl::ArraySlice<int64> copy_size) {
@@ -743,7 +776,7 @@ void Literal::PopulateR1(const tensorflow::core::Bitmap& values) {
   return CreateR2FromArray2D(*value);
 }
 
-std::unique_ptr<Literal> Literal::Relayout(
+std::unique_ptr<Literal> LiteralBase::Relayout(
     const Layout& new_layout, const ShapeIndex& shape_index) const {
   // Create new shape with 'new_layout' set at the given shape index.
   Shape new_shape = shape();
@@ -755,7 +788,7 @@ std::unique_ptr<Literal> Literal::Relayout(
   return result;
 }
 
-std::unique_ptr<Literal> Literal::Relayout(
+std::unique_ptr<Literal> LiteralBase::Relayout(
     const Shape& shape_with_layout) const {
   CHECK(ShapeUtil::Compatible(shape_with_layout, shape()))
       << "Given shape_with_layout " << ShapeUtil::HumanString(shape_with_layout)
@@ -774,7 +807,7 @@ std::unique_ptr<Literal> Literal::Relayout(
   return result;
 }
 
-StatusOr<std::unique_ptr<Literal>> Literal::Reshape(
+StatusOr<std::unique_ptr<Literal>> LiteralBase::Reshape(
     tensorflow::gtl::ArraySlice<int64> dimensions) const {
   if (!ShapeUtil::IsArray(shape())) {
     return InvalidArgument("Reshape does not support tuples.");
@@ -788,7 +821,8 @@ StatusOr<std::unique_ptr<Literal>> Literal::Reshape(
   }
   // Because the layout is monotonic, we can simply reuse the same sequence of
   // values without changing their order.
-  output->shape_ = ShapeUtil::MakeShape(shape().element_type(), dimensions);
+  *output->mutable_shape_do_not_use() =
+      ShapeUtil::MakeShape(shape().element_type(), dimensions);
 
   int64 elements_before = ShapeUtil::ElementsIn(shape());
   int64 elements_after = ShapeUtil::ElementsIn(output->shape());
@@ -802,7 +836,79 @@ StatusOr<std::unique_ptr<Literal>> Literal::Reshape(
   return std::move(output);
 }
 
-std::unique_ptr<Literal> Literal::Transpose(
+/* static */ std::unique_ptr<Literal> Literal::ReshapeSlice(
+    tensorflow::gtl::ArraySlice<int64> new_dimensions,
+    tensorflow::gtl::ArraySlice<int64> minor_to_major,
+    const LiteralSlice& literal) {
+  int64 new_num_elements = 1;
+  for (int64 i = 0; i < new_dimensions.size(); ++i) {
+    new_num_elements *= new_dimensions[i];
+  }
+  CHECK_EQ(ShapeUtil::ElementsIn(literal.shape()), new_num_elements);
+  CHECK_EQ(new_dimensions.size(), minor_to_major.size());
+
+  auto new_literal = MakeUnique<Literal>(
+      ShapeUtil::MakeShape(literal.shape().element_type(), new_dimensions));
+
+  // Create a new shape with the given minor-to-major layout. This shape is used
+  // solely for converting linear address to multi-dimensional addresses when
+  // writing elements to the new literal.
+  Shape shape_with_layout = new_literal->shape();
+  *shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout(minor_to_major);
+
+  // Copy data into new literal, element-by-element.
+  for (int64 i = 0; i < ShapeUtil::ElementsIn(literal.shape()); ++i) {
+    std::vector<int64> from_multi_index =
+        IndexUtil::LinearIndexToMultidimensionalIndex(literal.shape(), i);
+    std::vector<int64> to_multi_index =
+        IndexUtil::LinearIndexToMultidimensionalIndex(shape_with_layout, i);
+    switch (literal.shape().element_type()) {
+      case PRED:
+        new_literal->Set<bool>(to_multi_index,
+                               literal.Get<bool>(from_multi_index));
+        break;
+      case U8:
+        new_literal->Set<uint8>(to_multi_index,
+                                literal.Get<uint8>(from_multi_index));
+        break;
+      case U32:
+        new_literal->Set<uint32>(to_multi_index,
+                                 literal.Get<uint32>(from_multi_index));
+        break;
+      case S32:
+        new_literal->Set<int32>(to_multi_index,
+                                literal.Get<int32>(from_multi_index));
+        break;
+      case U64:
+        new_literal->Set<uint64>(to_multi_index,
+                                 literal.Get<uint64>(from_multi_index));
+        break;
+      case S64:
+        new_literal->Set<int64>(to_multi_index,
+                                literal.Get<int64>(from_multi_index));
+        break;
+      case F32:
+        new_literal->Set<float>(to_multi_index,
+                                literal.Get<float>(from_multi_index));
+        break;
+      case F64:
+        new_literal->Set<double>(to_multi_index,
+                                 literal.Get<double>(from_multi_index));
+        break;
+      case C64:
+        new_literal->Set<complex64>(to_multi_index,
+                                    literal.Get<complex64>(from_multi_index));
+        break;
+      default:
+        LOG(FATAL) << "Unhandled primitive element type: "
+                   << PrimitiveType_Name(literal.shape().element_type());
+    }
+  }
+
+  return new_literal;
+}
+
+std::unique_ptr<Literal> LiteralBase::Transpose(
     tensorflow::gtl::ArraySlice<int64> permutation) const {
   CHECK(ShapeUtil::IsArray(shape())) << "Tuple is not supported for transpose";
   CHECK(IsPermutation(permutation, ShapeUtil::Rank(shape())))
@@ -833,15 +939,14 @@ std::unique_ptr<Literal> Literal::Transpose(
   for (auto index : LayoutUtil::MinorToMajor(shape())) {
     layout->add_minor_to_major(inverse_permutation[index]);
   }
-  std::unique_ptr<Literal> new_literal = CreateFromShape(permuted_shape);
-  DCHECK_GE(ShapeUtil::ByteSizeOf(new_literal->shape()),
+  auto new_literal = MakeUnique<Literal>(permuted_shape);
+  DCHECK_EQ(ShapeUtil::ByteSizeOf(new_literal->shape()),
             ShapeUtil::ByteSizeOf(shape()));
-  std::memcpy(new_literal->root_piece().buffer(), root_piece().buffer(),
-              root_piece().size_bytes());
+  std::memcpy(new_literal->untyped_data(), untyped_data(), size_bytes());
   return new_literal;
 }
 
-std::unique_ptr<Literal> Literal::Slice(
+std::unique_ptr<Literal> LiteralBase::Slice(
     tensorflow::gtl::ArraySlice<int64> start_indices,
     tensorflow::gtl::ArraySlice<int64> limit_indices) const {
   CHECK(ShapeUtil::IsArray(shape())) << "tuple is not supported for slice";
@@ -909,20 +1014,20 @@ std::unique_ptr<Literal> Literal::Slice(
   }
 }
 
-Literal Literal::Clone() const {
+Literal LiteralBase::Clone() const {
   Literal result(shape());
   TF_CHECK_OK(result.CopyFrom(*this));
   return result;
 }
 
-std::unique_ptr<Literal> Literal::CloneToUnique() const {
+std::unique_ptr<Literal> LiteralBase::CloneToUnique() const {
   auto result = MakeUnique<Literal>(shape());
   TF_CHECK_OK(result->CopyFrom(*this));
   return result;
 }
 
-string Literal::GetAsString(tensorflow::gtl::ArraySlice<int64> multi_index,
-                            const ShapeIndex& shape_index) const {
+string LiteralBase::GetAsString(tensorflow::gtl::ArraySlice<int64> multi_index,
+                                const ShapeIndex& shape_index) const {
   const Shape& subshape = ShapeUtil::GetSubshape(shape(), shape_index);
   CHECK(LayoutUtil::IsDenseArray(subshape));
   switch (subshape.element_type()) {
@@ -962,8 +1067,8 @@ string Literal::GetAsString(tensorflow::gtl::ArraySlice<int64> multi_index,
   }
 }
 
-string Literal::GetSparseElementAsString(int64 sparse_element_number,
-                                         const ShapeIndex& shape_index) const {
+string LiteralBase::GetSparseElementAsString(
+    int64 sparse_element_number, const ShapeIndex& shape_index) const {
   const Shape& subshape = ShapeUtil::GetSubshape(shape(), shape_index);
   CHECK(LayoutUtil::IsSparseArray(subshape));
   switch (subshape.element_type()) {
@@ -1017,7 +1122,7 @@ string Literal::GetSparseElementAsString(int64 sparse_element_number,
   }
 }
 
-StatusOr<int64> Literal::GetIntegralAsS64(
+StatusOr<int64> LiteralBase::GetIntegralAsS64(
     tensorflow::gtl::ArraySlice<int64> multi_index) const {
   CHECK(LayoutUtil::IsDenseArray(shape()));
   switch (shape().element_type()) {
@@ -1040,6 +1145,27 @@ StatusOr<int64> Literal::GetIntegralAsS64(
   }
 }
 
+size_t LiteralBase::Hash() const {
+  using tensorflow::Hash64;
+  using tensorflow::Hash64Combine;
+
+  size_t hash_value = ShapeUtil::Hash(shape());
+
+  ShapeUtil::ForEachSubshape(
+      shape(), [&](const Shape& subshape, const ShapeIndex& index) {
+        if (ShapeUtil::IsTuple(subshape)) {
+          return;
+        }
+
+        CHECK(LayoutUtil::IsDense(subshape.layout()));
+        hash_value = Hash64Combine(
+            hash_value, Hash64(static_cast<const char*>(untyped_data(index)),
+                               size_bytes(index)));
+      });
+
+  return hash_value;
+}
+
 Status Literal::SetIntegralAsS64(tensorflow::gtl::ArraySlice<int64> multi_index,
                                  int64 value) {
   CHECK(LayoutUtil::IsDenseArray(shape()));
@@ -1070,7 +1196,7 @@ Status Literal::SetIntegralAsS64(tensorflow::gtl::ArraySlice<int64> multi_index,
   return Status::OK();
 }
 
-tensorflow::gtl::ArraySlice<int64> Literal::GetSparseIndex(
+tensorflow::gtl::ArraySlice<int64> LiteralBase::GetSparseIndex(
     int64 sparse_element_number, const ShapeIndex& shape_index) const {
   const Piece& p = piece(shape_index);
   CHECK_GE(sparse_element_number, 0);
@@ -1082,10 +1208,10 @@ void Literal::SortSparseElements(const ShapeIndex& shape_index) {
   piece(shape_index).SortSparseElements();
 }
 
-Literal Literal::GetFirstScalarLiteral() const {
-  CHECK(ShapeUtil::IsArray(shape_));
-  CHECK_GT(ShapeUtil::ElementsIn(shape_), 0);
-  switch (shape_.element_type()) {
+Literal LiteralBase::GetFirstScalarLiteral() const {
+  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK_GT(ShapeUtil::ElementsIn(shape()), 0);
+  switch (shape().element_type()) {
     case PRED:
       return std::move(*Literal::CreateR0<bool>(GetFirstElement<bool>()));
     // 8 bit types.
@@ -1121,11 +1247,11 @@ Literal Literal::GetFirstScalarLiteral() const {
     case U64:
       return std::move(*Literal::CreateR0<uint64>(GetFirstElement<uint64>()));
     default:
-      LOG(FATAL) << "Unhandled primitive type " << shape_.element_type();
+      LOG(FATAL) << "Unhandled primitive type " << shape().element_type();
   }
 }
 
-void Literal::Piece::SortSparseElements() {
+void LiteralBase::Piece::SortSparseElements() {
   switch (subshape().element_type()) {
     case PRED:
       SortSparseElementsInternal<bool>();
@@ -1176,7 +1302,7 @@ void Literal::Piece::SortSparseElements() {
 }
 
 template <typename NativeT>
-void Literal::Piece::SortSparseElementsInternal() {
+void LiteralBase::Piece::SortSparseElementsInternal() {
   CHECK(LayoutUtil::IsSparseArray(subshape()));
   int64 num_elements = sparse_indices()->index_count();
   auto values = data<NativeT>();
@@ -1187,9 +1313,11 @@ void Literal::Piece::SortSparseElementsInternal() {
 
 namespace {
 
-void ToStringHelper(const Literal& literal, const ShapeIndex& shape_index,
+void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
                     bool print_layout, std::vector<string>* pieces) {
   const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
+  CHECK(LayoutUtil::HasLayout(literal.shape()));
+  CHECK(LayoutUtil::HasLayout(subshape));
 
   auto shape_to_string = [print_layout](const Shape& shape) {
     if (print_layout) {
@@ -1348,13 +1476,14 @@ void ToStringHelper(const Literal& literal, const ShapeIndex& shape_index,
 
 }  // namespace
 
-int64 Literal::sparse_element_count() const {
+int64 LiteralBase::sparse_element_count() const {
   CHECK(LayoutUtil::IsSparseArray(shape()));
   return sparse_indices()->index_count();
 }
 
-string Literal::ToString(bool print_layout) const {
+string LiteralBase::ToString(bool print_layout) const {
   std::vector<string> pieces;
+  CHECK(LayoutUtil::HasLayout(this->shape()));
   ToStringHelper(*this, {}, print_layout, &pieces);
   return tensorflow::str_util::Join(pieces, "");
 }
@@ -1362,7 +1491,7 @@ string Literal::ToString(bool print_layout) const {
 /* static */ std::unique_ptr<Literal> Literal::MakeTuple(
     tensorflow::gtl::ArraySlice<const Literal*> elements) {
   std::vector<Shape> element_shapes;
-  for (const Literal* element : elements) {
+  for (const auto* element : elements) {
     element_shapes.push_back(element->shape());
   }
   auto literal = MakeUnique<Literal>(ShapeUtil::MakeTupleShape(element_shapes));
@@ -1372,6 +1501,19 @@ string Literal::ToString(bool print_layout) const {
   return literal;
 }
 
+/* static */ std::unique_ptr<Literal> Literal::MakeTupleFromSlices(
+    tensorflow::gtl::ArraySlice<LiteralSlice> elements) {
+  std::vector<Shape> element_shapes;
+  for (const auto& element : elements) {
+    element_shapes.push_back(element.shape());
+  }
+  auto literal = MakeUnique<Literal>(ShapeUtil::MakeTupleShape(element_shapes));
+  for (int i = 0; i < elements.size(); ++i) {
+    TF_CHECK_OK(literal->CopyFrom(elements[i], /*dest_shape_index=*/{i}));
+  }
+  return literal;
+}
+
 /* static */ std::unique_ptr<Literal> Literal::MakeTupleOwned(
     std::vector<std::unique_ptr<Literal>> elements) {
   std::vector<Shape> element_shapes;
@@ -1387,7 +1529,7 @@ string Literal::ToString(bool print_layout) const {
   return literal;
 }
 
-void Literal::EachCellAsString(
+void LiteralBase::EachCellAsString(
     const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
                              const string& value)>& per_cell) const {
   if (ShapeUtil::HasZeroElements(shape())) {
@@ -1403,7 +1545,7 @@ void Literal::EachCellAsString(
 namespace {
 template <typename NativeSrcT, typename NativeDestT, typename ConverterType>
 std::unique_ptr<Literal> ConvertBetweenNativeTypesWithConverter(
-    const Literal& src_literal, const ConverterType& converter) {
+    const LiteralBase& src_literal, const ConverterType& converter) {
   CHECK(ShapeUtil::IsArray(src_literal.shape()));
   auto result_literal = MakeUnique<Literal>(ShapeUtil::ChangeElementType(
       src_literal.shape(),
@@ -1419,7 +1561,8 @@ std::unique_ptr<Literal> ConvertBetweenNativeTypesWithConverter(
 }
 
 template <typename NativeSrcT, typename NativeDestT>
-std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
+std::unique_ptr<Literal> ConvertBetweenNativeTypes(
+    const LiteralBase& src_literal) {
   auto converter = [](NativeSrcT src) { return static_cast<NativeDestT>(src); };
   return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
       src_literal, converter);
@@ -1428,7 +1571,7 @@ std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
 template <typename NativeSrcT, typename NativeDestT>
 typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT)),
                         std::unique_ptr<Literal>>::type
-BitcastBetweenNativeTypes(const Literal& src_literal) {
+BitcastBetweenNativeTypes(const LiteralBase& src_literal) {
   auto converter = [](NativeSrcT src) {
     return tensorflow::bit_cast<NativeDestT>(src);
   };
@@ -1443,12 +1586,12 @@ BitcastBetweenNativeTypes(const Literal& src_literal) {
 template <typename NativeSrcT, typename NativeDestT>
 typename std::enable_if<(sizeof(NativeSrcT) != sizeof(NativeDestT)),
                         std::unique_ptr<Literal>>::type
-BitcastBetweenNativeTypes(const Literal& src_literal) {
+BitcastBetweenNativeTypes(const LiteralBase& src_literal) {
   LOG(FATAL) << "Invalid bitcast between types of different sizes.";
 }
 
 template <PrimitiveType primitive_src_type>
-std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
+std::unique_ptr<Literal> ConvertToC64(const LiteralBase& src_literal) {
   CHECK(ShapeUtil::IsArray(src_literal.shape()));
   auto result_literal = MakeUnique<Literal>(
       ShapeUtil::ChangeElementType(src_literal.shape(), C64));
@@ -1466,7 +1609,7 @@ std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
 }
 
 template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
-std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal,
+std::unique_ptr<Literal> ConvertIfTypesMatch(const LiteralBase& src_literal,
                                              bool bitcast) {
   CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
   if (bitcast) {
@@ -1486,7 +1629,7 @@ std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal,
 
 template <PrimitiveType primitive_src_type>
 StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
-    const Literal& src_literal, PrimitiveType primitive_dest_type,
+    const LiteralBase& src_literal, PrimitiveType primitive_dest_type,
     bool bitcast) {
   switch (primitive_dest_type) {
 #define CONVERT_IF_TYPES_MATCH(type)                                    \
@@ -1521,7 +1664,8 @@ StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
 }
 
 StatusOr<std::unique_ptr<Literal>> ConvertSwitch(
-    const Literal& literal, PrimitiveType primitive_dest_type, bool bitcast) {
+    const LiteralBase& literal, PrimitiveType primitive_dest_type,
+    bool bitcast) {
   TF_RET_CHECK(ShapeUtil::IsArray(literal.shape()));
   if (literal.shape().element_type() == primitive_dest_type) {
     return literal.CloneToUnique();
@@ -1555,12 +1699,12 @@ StatusOr<std::unique_ptr<Literal>> ConvertSwitch(
 
 }  // namespace
 
-StatusOr<std::unique_ptr<Literal>> Literal::Convert(
+StatusOr<std::unique_ptr<Literal>> LiteralBase::Convert(
     PrimitiveType primitive_dest_type) const {
   return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/false);
 }
 
-StatusOr<std::unique_ptr<Literal>> Literal::BitcastConvert(
+StatusOr<std::unique_ptr<Literal>> LiteralBase::BitcastConvert(
     PrimitiveType primitive_dest_type) const {
   if (primitive_util::BitWidth(shape().element_type()) !=
       primitive_util::BitWidth(primitive_dest_type)) {
@@ -1575,7 +1719,7 @@ StatusOr<std::unique_ptr<Literal>> Literal::BitcastConvert(
   return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/true);
 }
 
-StatusOr<std::unique_ptr<Literal>> Literal::ConvertToShape(
+StatusOr<std::unique_ptr<Literal>> LiteralBase::ConvertToShape(
     const Shape& dest_shape, bool round_f32_to_bf16) const {
   if (!ShapeUtil::IsTuple(dest_shape)) {
     if (round_f32_to_bf16 && shape().element_type() == F32 &&
@@ -1590,7 +1734,7 @@ StatusOr<std::unique_ptr<Literal>> Literal::ConvertToShape(
   }
   std::vector<Literal> elements;
   for (int i = 0; i < ShapeUtil::TupleElementCount(shape()); ++i) {
-    auto element = LiteralView::Create(*this, {i});
+    auto element = LiteralSlice(*this, {i});
     TF_ASSIGN_OR_RETURN(
         auto new_element,
         element.ConvertToShape(ShapeUtil::GetSubshape(dest_shape, {i})));
@@ -1602,8 +1746,8 @@ StatusOr<std::unique_ptr<Literal>> Literal::ConvertToShape(
 }
 
 template <typename NativeT>
-bool Literal::Piece::EqualElementsInternal(
-    const Literal::Piece& other, std::vector<int64>* multi_index) const {
+bool LiteralBase::Piece::EqualElementsInternal(
+    const LiteralBase::Piece& other, std::vector<int64>* multi_index) const {
   if (multi_index->size() == ShapeUtil::Rank(subshape())) {
     return (Get<NativeT>(*multi_index) == other.Get<NativeT>(*multi_index));
   }
@@ -1617,7 +1761,7 @@ bool Literal::Piece::EqualElementsInternal(
   return true;
 }
 
-bool Literal::Piece::EqualElements(const Literal::Piece& other) const {
+bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const {
   DCHECK(ShapeUtil::Compatible(subshape(), other.subshape()));
 
   std::vector<int64> multi_index;
@@ -1645,28 +1789,28 @@ bool Literal::Piece::EqualElements(const Literal::Piece& other) const {
     case C64:
       return EqualElementsInternal<complex64>(other, &multi_index);
     default:
-      LOG(FATAL) << "Unimplemented: Literal::Piece::EqualElements for type "
+      LOG(FATAL) << "Unimplemented: LiteralBase::Piece::EqualElements for type "
                  << PrimitiveType_Name(subshape().element_type());
   }
 }
 
-bool Literal::operator==(const Literal& other) const {
+bool LiteralBase::operator==(const LiteralBase& other) const {
   if (!ShapeUtil::Compatible(shape(), other.shape())) {
     return false;
   }
-  for (const auto& pair : pieces_) {
-    const ShapeIndex& index = pair.first;
-    const Piece& piece = pair.second;
-    if (!ShapeUtil::IsArray(piece.subshape())) {
-      continue;
-    }
 
-    const Piece& other_piece = other.piece(index);
-    if (!piece.EqualElements(other_piece)) {
-      return false;
-    }
-  }
-  return true;
+  return root_piece().ForEachSubpieceWithBool(
+      [&](const ShapeIndex& index, const Piece& piece) {
+        if (!ShapeUtil::IsArray(piece.subshape())) {
+          return true;
+        }
+
+        const Piece& other_piece = other.piece(index);
+        if (!piece.EqualElements(other_piece)) {
+          return false;
+        }
+        return true;
+      });
 }
 
 namespace {
@@ -1684,11 +1828,11 @@ static bool AllElementsEqualValue(tensorflow::gtl::ArraySlice<NativeT> data,
 
 }  // namespace
 
-bool Literal::IsAll(int8 value) const {
-  for (const auto& pair : pieces_) {
-    const Piece& piece = pair.second;
+bool LiteralBase::IsAll(int8 value) const {
+  return root_piece().ForEachSubpieceWithBool([&](const ShapeIndex& index,
+                                                  const Piece& piece) {
     if (!ShapeUtil::IsArray(piece.subshape())) {
-      continue;
+      return true;
     }
 
     auto piece_is_all = [&]() {
@@ -1741,41 +1885,41 @@ bool Literal::IsAll(int8 value) const {
     if (!piece_is_all()) {
       return false;
     }
-  }
-  return true;
+    return true;
+  });
 }
 
-bool Literal::IsAllFloat(float value) const {
-  for (const auto& pair : pieces_) {
-    const Piece& piece = pair.second;
-    if (!ShapeUtil::IsArray(piece.subshape())) {
-      continue;
-    }
+bool LiteralBase::IsAllFloat(float value) const {
+  return root_piece().ForEachSubpieceWithBool(
+      [&](const ShapeIndex& index, const Piece& piece) {
+        if (!ShapeUtil::IsArray(piece.subshape())) {
+          return true;
+        }
 
-    auto piece_is_all = [&]() {
-      switch (shape().element_type()) {
-        case F32:
-          return AllElementsEqualValue<float>(piece.data<float>(), value);
-        case F64:
-          return AllElementsEqualValue<double>(piece.data<double>(), value);
-        case F16:
-          return AllElementsEqualValue<half>(piece.data<half>(),
-                                             static_cast<half>(value));
-        case BF16:
-          return AllElementsEqualValue<bfloat16>(piece.data<bfloat16>(),
-                                                 static_cast<bfloat16>(value));
-        default:
+        auto piece_is_all = [&]() {
+          switch (shape().element_type()) {
+            case F32:
+              return AllElementsEqualValue<float>(piece.data<float>(), value);
+            case F64:
+              return AllElementsEqualValue<double>(piece.data<double>(), value);
+            case F16:
+              return AllElementsEqualValue<half>(piece.data<half>(),
+                                                 static_cast<half>(value));
+            case BF16:
+              return AllElementsEqualValue<bfloat16>(
+                  piece.data<bfloat16>(), static_cast<bfloat16>(value));
+            default:
+              return false;
+          }
+        };
+        if (!piece_is_all()) {
           return false;
-      }
-    };
-    if (!piece_is_all()) {
-      return false;
-    }
-  }
-  return true;
+        }
+        return true;
+      });
 }
 
-bool Literal::IsAllComplex(complex64 value) const {
+bool LiteralBase::IsAllComplex(complex64 value) const {
   switch (shape().element_type()) {
     case C64:
       return AllElementsEqualValue<complex64>(root_piece().data<complex64>(),
@@ -1785,93 +1929,93 @@ bool Literal::IsAllComplex(complex64 value) const {
   }
 }
 
-bool Literal::IsAllFirst() const {
-  for (const auto& pair : pieces_) {
-    const Piece& piece = pair.second;
-    if (!ShapeUtil::IsArray(piece.subshape())) {
-      continue;
-    }
-
-    // Empty shapes are not all the first element since there is no first
-    // element.
-    if (ShapeUtil::HasZeroElements(piece.subshape())) {
-      return false;
-    }
-    auto piece_is_all = [&]() {
-      switch (piece.subshape().element_type()) {
-        case PRED: {
-          auto data = piece.data<bool>();
-          return AllElementsEqualValue<bool>(data, data[0]);
-        }
-        // 8 bit types
-        case S8: {
-          auto data = piece.data<int8>();
-          return AllElementsEqualValue<int8>(data, data[0]);
-        }
-        case U8: {
-          auto data = piece.data<uint8>();
-          return AllElementsEqualValue<uint8>(data, data[0]);
-        }
-        // 16 bit types
-        case BF16: {
-          auto data = piece.data<bfloat16>();
-          return AllElementsEqualValue<bfloat16>(data, data[0]);
-        }
-        case F16: {
-          auto data = piece.data<half>();
-          return AllElementsEqualValue<half>(data, data[0]);
-        }
-        case S16: {
-          auto data = piece.data<int16>();
-          return AllElementsEqualValue<int16>(data, data[0]);
-        }
-        case U16: {
-          auto data = piece.data<uint16>();
-          return AllElementsEqualValue<uint16>(data, data[0]);
-        }
-        // 32 bit types
-        case F32: {
-          auto data = piece.data<float>();
-          return AllElementsEqualValue<float>(data, data[0]);
+bool LiteralBase::IsAllFirst() const {
+  return root_piece().ForEachSubpieceWithBool(
+      [&](const ShapeIndex& index, const Piece& piece) {
+        if (!ShapeUtil::IsArray(piece.subshape())) {
+          return true;
         }
-        case U32: {
-          auto data = piece.data<uint32>();
-          return AllElementsEqualValue<uint32>(data, data[0]);
-        }
-        case S32: {
-          auto data = piece.data<int32>();
-          return AllElementsEqualValue<int32>(data, data[0]);
-        }
-        // 64 bit types
-        case C64: {
-          auto data = piece.data<complex64>();
-          return AllElementsEqualValue<complex64>(data, data[0]);
-        }
-        case F64: {
-          auto data = piece.data<double>();
-          return AllElementsEqualValue<double>(data, data[0]);
-        }
-        case S64: {
-          auto data = piece.data<int64>();
-          return AllElementsEqualValue<int64>(data, data[0]);
-        }
-        case U64: {
-          auto data = piece.data<uint64>();
-          return AllElementsEqualValue<uint64>(data, data[0]);
-        }
-        default:
+
+        // Empty shapes are not all the first element since there is no first
+        // element.
+        if (ShapeUtil::HasZeroElements(piece.subshape())) {
           return false;
-      }
-    };
+        }
+        auto piece_is_all = [&]() {
+          switch (piece.subshape().element_type()) {
+            case PRED: {
+              auto data = piece.data<bool>();
+              return AllElementsEqualValue<bool>(data, data[0]);
+            }
+            // 8 bit types
+            case S8: {
+              auto data = piece.data<int8>();
+              return AllElementsEqualValue<int8>(data, data[0]);
+            }
+            case U8: {
+              auto data = piece.data<uint8>();
+              return AllElementsEqualValue<uint8>(data, data[0]);
+            }
+            // 16 bit types
+            case BF16: {
+              auto data = piece.data<bfloat16>();
+              return AllElementsEqualValue<bfloat16>(data, data[0]);
+            }
+            case F16: {
+              auto data = piece.data<half>();
+              return AllElementsEqualValue<half>(data, data[0]);
+            }
+            case S16: {
+              auto data = piece.data<int16>();
+              return AllElementsEqualValue<int16>(data, data[0]);
+            }
+            case U16: {
+              auto data = piece.data<uint16>();
+              return AllElementsEqualValue<uint16>(data, data[0]);
+            }
+            // 32 bit types
+            case F32: {
+              auto data = piece.data<float>();
+              return AllElementsEqualValue<float>(data, data[0]);
+            }
+            case U32: {
+              auto data = piece.data<uint32>();
+              return AllElementsEqualValue<uint32>(data, data[0]);
+            }
+            case S32: {
+              auto data = piece.data<int32>();
+              return AllElementsEqualValue<int32>(data, data[0]);
+            }
+            // 64 bit types
+            case C64: {
+              auto data = piece.data<complex64>();
+              return AllElementsEqualValue<complex64>(data, data[0]);
+            }
+            case F64: {
+              auto data = piece.data<double>();
+              return AllElementsEqualValue<double>(data, data[0]);
+            }
+            case S64: {
+              auto data = piece.data<int64>();
+              return AllElementsEqualValue<int64>(data, data[0]);
+            }
+            case U64: {
+              auto data = piece.data<uint64>();
+              return AllElementsEqualValue<uint64>(data, data[0]);
+            }
+            default:
+              return false;
+          }
+        };
 
-    if (!piece_is_all()) {
-      return false;
-    }
-  }
-  return true;
+        if (!piece_is_all()) {
+          return false;
+        }
+        return true;
+      });
 }
 
-bool Literal::IsZero(tensorflow::gtl::ArraySlice<int64> indices) const {
+bool LiteralBase::IsZero(tensorflow::gtl::ArraySlice<int64> indices) const {
   CHECK(ShapeUtil::IsArray(shape()));
   switch (shape().element_type()) {
     case U8:
@@ -1913,7 +2057,7 @@ void CopyToRepeatedField(RepeatedFieldT* dest,
 
 }  // namespace
 
-void Literal::Piece::WriteToProto(LiteralProto* proto) const {
+void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const {
   *proto->mutable_shape() = subshape();
   switch (subshape().element_type()) {
     case PRED:
@@ -1969,12 +2113,12 @@ void Literal::Piece::WriteToProto(LiteralProto* proto) const {
   }
 }
 
-const void* Literal::Piece::untyped_data() const {
+const void* LiteralBase::Piece::untyped_data() const {
   CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
   return buffer();
 }
 
-void* Literal::Piece::untyped_data() {
+void* LiteralBase::Piece::untyped_data() {
   CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
   return buffer();
 }
@@ -1995,7 +2139,7 @@ Status CopyFromRepeatedField(tensorflow::gtl::MutableArraySlice<NativeT> dest,
 
 }  // namespace
 
-Status Literal::Piece::CopyFromProto(const LiteralProto& proto) {
+Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
   // These conditions should have been checked in Literal::CreateFromProto.
   TF_RET_CHECK(proto.has_shape());
   TF_RET_CHECK(LayoutUtil::HasLayout(proto.shape()));
@@ -2062,21 +2206,19 @@ Status Literal::Piece::CopyFromProto(const LiteralProto& proto) {
   return Status::OK();
 }
 
-LiteralProto Literal::ToProto() const {
+LiteralProto LiteralBase::ToProto() const {
   LiteralProto proto;
-  for (const auto& pair : pieces_) {
-    const ShapeIndex& index = pair.first;
-    const Piece& piece = pair.second;
-
-    LiteralProto* proto_piece = &proto;
-    for (int64 i : index) {
-      while (proto_piece->tuple_literals_size() <= i) {
-        proto_piece->add_tuple_literals();
-      }
-      proto_piece = proto_piece->mutable_tuple_literals(i);
-    }
-    piece.WriteToProto(proto_piece);
-  }
+  root_piece().ForEachSubpiece(
+      [&](const ShapeIndex& index, const Piece& piece) {
+        LiteralProto* proto_piece = &proto;
+        for (int64 i : index) {
+          while (proto_piece->tuple_literals_size() <= i) {
+            proto_piece->add_tuple_literals();
+          }
+          proto_piece = proto_piece->mutable_tuple_literals(i);
+        }
+        piece.WriteToProto(proto_piece);
+      });
 
   if (LayoutUtil::IsSparseArray(shape())) {
     CopyToRepeatedField(proto.mutable_sparse_indices(),
@@ -2098,33 +2240,40 @@ StatusOr<std::unique_ptr<Literal>> Literal::CreateFromProto(
 
   auto literal = MakeUnique<Literal>(proto.shape());
 
-  for (auto& pair : literal->pieces_) {
-    const ShapeIndex& index = pair.first;
-    Piece& piece = pair.second;
-    const LiteralProto* proto_element = &proto;
-    for (int64 i : index) {
-      TF_RET_CHECK(i < proto_element->tuple_literals_size());
-      proto_element = &proto_element->tuple_literals(i);
-    }
+  TF_RETURN_IF_ERROR(literal->root_piece_->ForEachMutableSubpieceWithStatus(
+      [&](const ShapeIndex& index, Piece* piece) {
+        const LiteralProto* proto_element = &proto;
+        for (int64 i : index) {
+          CHECK(i < proto_element->tuple_literals_size());
+          proto_element = &proto_element->tuple_literals(i);
+        }
 
-    if (ShapeUtil::IsTuple(piece.subshape())) {
-      if (proto_element->tuple_literals_size() !=
-          ShapeUtil::TupleElementCount(piece.subshape())) {
-        return InvalidArgument(
-            "Expected %lld tuple elements in LiteralProto, has %d",
-            ShapeUtil::TupleElementCount(piece.subshape()),
-            proto_element->tuple_literals_size());
-      }
-      continue;
-    }
+        if (ShapeUtil::IsTuple(piece->subshape())) {
+          if (proto_element->tuple_literals_size() !=
+              ShapeUtil::TupleElementCount(piece->subshape())) {
+            return InvalidArgument(
+                "Expected %lld tuple elements in LiteralProto, has %d",
+                ShapeUtil::TupleElementCount(piece->subshape()),
+                proto_element->tuple_literals_size());
+          }
+          return Status::OK();
+        }
+
+        CHECK(ShapeUtil::IsArray(piece->subshape()));
+        TF_RETURN_IF_ERROR(piece->CopyFromProto(*proto_element));
+
+        return Status::OK();
+      }));
 
-    TF_RET_CHECK(ShapeUtil::IsArray(piece.subshape()));
-    TF_RETURN_IF_ERROR(piece.CopyFromProto(*proto_element));
-  }
   return std::move(literal);
 }
 
-const void* Literal::untyped_data(const ShapeIndex& shape_index) const {
+/* static */ string Literal::MultiIndexAsString(
+    tensorflow::gtl::ArraySlice<int64> multi_index) {
+  return StrCat("{", tensorflow::str_util::Join(multi_index, ","), "}");
+}
+
+const void* LiteralBase::untyped_data(const ShapeIndex& shape_index) const {
   return piece(shape_index).untyped_data();
 }
 
@@ -2132,11 +2281,11 @@ void* Literal::untyped_data(const ShapeIndex& shape_index) {
   return piece(shape_index).untyped_data();
 }
 
-int64 Literal::size_bytes(const ShapeIndex& shape_index) const {
+int64 LiteralBase::size_bytes(const ShapeIndex& shape_index) const {
   return piece(shape_index).size_bytes();
 }
 
-string Literal::GetR1U8AsString() const {
+string LiteralBase::GetR1U8AsString() const {
   CHECK(ShapeUtil::IsArray(shape()));
   CHECK_EQ(ShapeUtil::Rank(shape()), 1);
   CHECK_EQ(shape().element_type(), U8);
@@ -2144,72 +2293,55 @@ string Literal::GetR1U8AsString() const {
                 ShapeUtil::ElementsIn(shape()));
 }
 
-/* static */ const LiteralView LiteralView::Create(
-    const Literal& literal, const ShapeIndex& view_root) {
-  return LiteralView(literal, view_root);
-}
-
-size_t Literal::Hash() const {
-  using tensorflow::Hash64;
-  using tensorflow::Hash64Combine;
-
-  size_t hash_value = ShapeUtil::Hash(shape());
+void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) {
+  CHECK(ShapeUtil::IsTuple(shape));
+  for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
+    const Shape& subshape = shape.tuple_shapes(i);
 
-  ShapeUtil::ForEachSubshape(
-      shape(), [&](const Shape& subshape, const ShapeIndex& index) {
-        if (ShapeUtil::IsTuple(subshape)) {
-          return;
-        }
+    auto child_piece = Piece();
+    child_piece.set_subshape(&subshape);
 
-        CHECK(LayoutUtil::IsDense(subshape.layout()));
-        hash_value = Hash64Combine(
-            hash_value, Hash64(static_cast<const char*>(untyped_data(index)),
-                               size_bytes(index)));
-      });
-
-  return hash_value;
-}
-
-LiteralView::LiteralView(const Literal& literal, const ShapeIndex& view_root) {
-  shape_ = ShapeUtil::GetSubshape(literal.shape(), view_root);
-  pieces_ = ShapeTree<Piece>(shape_);
-  owns_buffers_ = false;
-  for (auto& pair : pieces_) {
-    const ShapeIndex& index = pair.first;
-    Piece& piece = pair.second;
-
-    ShapeIndex src_index = view_root;
-    for (int64 i : index) {
-      src_index.push_back(i);
+    if (ShapeUtil::IsTuple(subshape)) {
+      BuildPieceSubtree(subshape, &child_piece);
     }
-    const Piece& src_piece = literal.piece(src_index);
-    piece.set_buffer(src_piece.buffer());
-    piece.set_sparse_indices(src_piece.sparse_indices());
-    piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index));
+
+    piece->emplace_back(std::move(child_piece));
   }
 }
 
-LiteralView::~LiteralView() {}
+LiteralSlice::LiteralSlice(const LiteralBase& literal)
+    : LiteralBase(), root_piece_(&literal.root_piece()) {}
 
-LiteralView::LiteralView(const LiteralView& other) { CopyFrom(other); }
+LiteralSlice::LiteralSlice(const LiteralBase& literal,
+                           const ShapeIndex& view_root)
+    : LiteralBase(), root_piece_(&literal.piece(view_root)) {}
 
-LiteralView& LiteralView::operator=(const LiteralView& other) {
-  CopyFrom(other);
-  return *this;
+BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape)
+    : LiteralBase(), shape_(shape) {
+  CHECK(ShapeUtil::IsArray(shape_));
+  CHECK_NE(src_buf_ptr, nullptr);
+  CHECK(LayoutUtil::HasLayout(shape_));
+
+  root_piece_ = Piece();
+  root_piece_.set_buffer(const_cast<char*>(src_buf_ptr));
+  root_piece_.set_subshape(&shape_);
 }
 
-void LiteralView::CopyFrom(const LiteralView& other) {
-  // We can't use the default copy-constructor/copy-assignment because
-  // Piece::subshape_ points to subshapes within the Shape of the owning
-  // Literal/LiteralView.
-  shape_ = other.shape();
-  pieces_ = other.pieces_;
-  for (auto& pair : pieces_) {
-    const ShapeIndex& index = pair.first;
-    Piece& piece = pair.second;
-    piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index));
+BorrowingLiteral::BorrowingLiteral(
+    tensorflow::gtl::ArraySlice<const char*> src_buf_ptrs, const Shape& shape)
+    : LiteralBase(), shape_(shape) {
+  CHECK(ShapeUtil::IsTuple(shape_));
+  CHECK(!ShapeUtil::IsNestedTuple(shape_));
+  CHECK_EQ(src_buf_ptrs.size(), ShapeUtil::TupleElementCount(shape_));
+  root_piece_ = Piece();
+  root_piece_.set_subshape(&shape_);
+  BuildPieceSubtree(shape_, &root_piece_);
+
+  for (int i = 0; i < src_buf_ptrs.size(); ++i) {
+    const auto& src_shape = shape_.tuple_shapes(i);
+    CHECK(ShapeUtil::IsArray(src_shape));
+    root_piece_.child(i).set_buffer(const_cast<char*>(src_buf_ptrs[i]));
   }
-  owns_buffers_ = false;
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index c6bd03bf21ac8dc88e96856cffe02c758e7b996d..609dc7a3aca646a5bb787487de101ac115df8ea5 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -34,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/sparse_index_array.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -52,14 +51,497 @@ limitations under the License.
 
 namespace xla {
 
+// Forward declare Literal and LiteralSlice class to be used by the creation
+// methods in the base class.
+class Literal;
+class LiteralSlice;
+
+// Abstract base class for literals.
+class LiteralBase {
+ public:
+  virtual ~LiteralBase() = 0;
+
+  // Literals are equal if they have compatible shapes and the same data
+  // values. Layout is not compared.
+  bool operator==(const LiteralBase& other) const;
+  bool operator!=(const LiteralBase& other) const { return !(*this == other); }
+
+  // Returns the shape of the literal.
+  const Shape& shape() const { return root_piece().subshape(); }
+
+  // Serialize to proto.
+  LiteralProto ToProto() const;
+
+  // Returns an ArraySlice of the array for this literal for the given NativeT
+  // (e.g., float). CHECKs if the subshape of the literal at the given
+  // ShapeIndex is not array. See primitive_util.h for the mapping from XLA type
+  // to native type.
+  template <typename NativeT>
+  tensorflow::gtl::ArraySlice<NativeT> data(
+      const ShapeIndex& shape_index = {}) const;
+
+  // Returns a const pointer to the sparse index array. Returns nullptr if the
+  // literal is not a sparse array.
+  const SparseIndexArray* sparse_indices(
+      const ShapeIndex& shape_index = {}) const;
+
+  // Returns a const pointer to (or size of) the underlying buffer holding the
+  // array at the given shape index. CHECKs if the subshape of the literal at
+  // the given ShapeIndex is not array.
+  const void* untyped_data(const ShapeIndex& shape_index = {}) const;
+  int64 size_bytes(const ShapeIndex& shape_index = {}) const;
+
+  // Returns this literal's data as a string. This literal must be a rank-1 U8
+  // array.
+  string GetR1U8AsString() const;
+
+  // Returns a string representation of the literal value.
+  // Warning: this function can take minutes for multi-million element Literals.
+  string ToString(bool print_layout = false) const;
+
+  // Gets an element in the literal at the given index. The multi_index is
+  // CHECKed against the dimension sizes.
+  template <typename NativeT>
+  NativeT Get(tensorflow::gtl::ArraySlice<int64> multi_index,
+              const ShapeIndex& shape_index) const;
+  // Overloads of Get for array literals. CHECKs if the literal is not
+  // array-shaped and dense.
+  template <typename NativeT>
+  NativeT Get(tensorflow::gtl::ArraySlice<int64> multi_index) const;
+
+  // Returns the element value at index (0, ..., 0), however many zeroes are
+  // required for that index.
+  template <typename NativeT>
+  NativeT GetFirstElement() const;
+
+  // As Get(), but determines the correct type and converts the value
+  // into text.
+  string GetAsString(tensorflow::gtl::ArraySlice<int64> multi_index,
+                     const ShapeIndex& shape_index = {}) const;
+  // As GetSparseElement(), but determines the correct type and converts the
+  // value into text.
+  string GetSparseElementAsString(int64 sparse_element_number,
+                                  const ShapeIndex& shape_index = {}) const;
+  // As Get(), but determines the correct type and converts the value into
+  // int64.  This literal must be an array.
+  StatusOr<int64> GetIntegralAsS64(
+      tensorflow::gtl::ArraySlice<int64> multi_index) const;
+
+  // Returns the multi-index of the element in a sparse literal at the given
+  // sparse element number.  The sparse element number is the position with in
+  // the sparse array's list of (index, value) pairs, and is checked against the
+  // total number of (index, value) pairs in the sparse array.
+  tensorflow::gtl::ArraySlice<int64> GetSparseIndex(
+      int64 sparse_element_number, const ShapeIndex& shape_index = {}) const;
+
+  // Returns the value of the element in a sparse literal at the given sparse
+  // element number.  The sparse element number is the position with in the
+  // sparse array's list of (index, value) pairs, and is checked against the
+  // total number of (index, value) pairs in the sparse array.
+  template <typename NativeT>
+  NativeT GetSparseElement(int64 sparse_element_number,
+                           const ShapeIndex& shape_index = {}) const;
+
+  // Invokes the "per cell" callback for each element in the provided
+  // literal with the element's indices and a string representation of
+  // the element's value.
+  //
+  // This function is useful if you want a polymorphic representation
+  // of the tensor's elements (turning it to a string for something
+  // like representation in a protobuf).
+  //
+  // This literal must have a dense layout.
+  void EachCellAsString(
+      const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
+                               const string& value)>& per_cell) const;
+  template <typename NativeT>
+  void EachCell(std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
+                                   NativeT value)>
+                    per_cell) const;
+
+  // Returns whether every element in this literal is equal to value.
+  //
+  // value is an int8 because we expect this to be called with small
+  // compile-time constants (0, -1, etc.) and so that whatever value you pass
+  // can be represented exactly by floating-point types as small as 16 bits.
+  //
+  // If value doesn't fit in this literal's type, returns false.  Values of 1/0
+  // are considered equal to true/false; other values are not considered equal
+  // to true. Also if this literal is not array-shaped false is returned.
+  bool IsAll(int8 value) const;
+
+  // Like IsAll(const Literal&, int8), except we check whether the literal is
+  // equal to a particular floating-point number.
+  //
+  // If the literal is not a floating-point value, this always returns false.
+  //
+  // This casts value to the type of literal, then compares using ==.  The usual
+  // admonishments about floating-point equality checks apply.  We expect you to
+  // use this to check for values that can be expressed precisely as a float,
+  // e.g. -0.5.  Also if this literal is not array-shaped false is returned.
+  bool IsAllFloat(float value) const;
+
+  // Like IsAll(const Literal&, int8), except we check whether the literal is
+  // equal to a particular complex number.
+  //
+  // If the literal is not a complex value, this always returns false.
+  //
+  // This casts value to the type of literal, then compares using ==.  The usual
+  // admonishments about floating-point equality checks apply.  We expect you to
+  // use this to check for complex values that can be expressed precisely as
+  // float pairs e.g. (-0.5, 1.0).
+  //
+  // This literal must have a dense layout.
+  bool IsAllComplex(complex64 value) const;
+
+  // Literal consists entirely of the first element of the literal.
+  bool IsAllFirst() const;
+
+  // Returns whether this literal is zero at the specified index. This literal
+  // must be an array with a dense layout.
+  bool IsZero(tensorflow::gtl::ArraySlice<int64> indices) const;
+
+  // Returns the count of the elements in the array at the given shape index in
+  // this literal.
+  int64 element_count(const ShapeIndex& index = {}) const {
+    return ShapeUtil::ElementsIn(ShapeUtil::GetSubshape(shape(), index));
+  }
+
+  // Returns the count of the elements in the sparse array at the given shape
+  // index in this literal, which will be no larger than
+  // LayoutUtil::MaxSparseElements(SetSubshape(shape(), index).layout()).
+  int64 sparse_element_count() const;
+
+  // Compute a hash for this literal.  This literal must not be a sparse tensor
+  // or a tuple containing a sparse tensor.
+  size_t Hash() const;
+
+  // Converts this literal to the given shape. Returns an error is the
+  // conversion is not possible.
+  //
+  // round_f32_to_bf16: if true, converting F32 elements to BF16 uses rounding
+  // instead of truncation; otherwise, truncation is used.
+  //
+  // TODO(b/69266521): remove the round_to_bfloat16 flag when rounding becomes
+  // the default behavior.
+  StatusOr<std::unique_ptr<Literal>> ConvertToShape(
+      const Shape& dest_shape, bool round_f32_to_bf16 = false) const;
+
+  // Converts this literal to another primitive type using a bitcast
+  // conversion. The to and from primitive types must have the same bit
+  // width. Returns an error if the conversion is not possible. This literal
+  // must be array-shaped.
+  StatusOr<std::unique_ptr<Literal>> BitcastConvert(
+      PrimitiveType primitive_dest_type) const;
+
+  // Converts this literal to another primitive type. Returns an error if the
+  // conversion is not possible. This literal must be array-shaped.
+  StatusOr<std::unique_ptr<Literal>> Convert(
+      PrimitiveType primitive_dest_type) const;
+
+  // Returns a literal scalar representing the first element.
+  Literal GetFirstScalarLiteral() const;
+
+  // Clones the underlying buffers into a new Literal, or new
+  // std::unique_ptr<Literal>.
+  Literal Clone() const;
+  std::unique_ptr<Literal> CloneToUnique() const;
+
+  // TODO(b/67651157): The methods below which perform computation on Literals
+  // (Reshape, Slice, etc) should be moved elsewhere, and perhaps combined with
+  // evaluator code which operates on Literals.
+  //
+  // Creates a new value that has the equivalent value as this
+  // literal, but conforms to new_layout; e.g. a literal matrix that was in {0,
+  // 1} minor-to-major dimension layout can be re-layed-out as {1, 0}
+  // minor-to-major dimension layout and the value in the cell at any given
+  // logical index (i0, i1) will be the same.
+  //
+  // For tuple shaped literals, shape_index should be used to select the inner
+  // array that the new layout applies to.
+  //
+  // Note: this is useful when the client wants to ensure that a value placed in
+  // the XLA allocation tracker has a particular layout; for efficiency
+  // purposes or avoiding unimplemented operation/layout combinations.
+  std::unique_ptr<Literal> Relayout(const Layout& new_layout,
+                                    const ShapeIndex& shape_index = {}) const;
+
+  // An overload of Relayout which changes the layout of the entire shape rather
+  // than being limited to a single array within the shape.
+  std::unique_ptr<Literal> Relayout(const Shape& shape_with_layout) const;
+
+  // Creates a new literal by reshaping this literal to have the given
+  // dimensions. The total number of elements must not change; The
+  // implementation currently only supports monotonic dim0-major layouts.
+  // This literal must be an array.
+  StatusOr<std::unique_ptr<Literal>> Reshape(
+      tensorflow::gtl::ArraySlice<int64> dimensions) const;
+
+  // Creates a new literal by reordering the dimensions of this literal.
+  // The given `permutation` must be a permutation of the dimension numbers
+  // in the original literal, and it specifies the order of the new dimensions
+  // in the result literal (i.e., new_order[i] = old_order[permutation[i]]).
+  // For example, a transpose call on a literal of shape [3 x 8 x 4] and
+  // `permutation` = {2, 0, 1} returns a new literal of shape [4 x 3 x 8].
+  // This literal must be an array.
+  std::unique_ptr<Literal> Transpose(
+      tensorflow::gtl::ArraySlice<int64> permutation) const;
+
+  // Creates a sub-array from this literal by extracting the indices
+  // [start_index, limit_index) of each dimension. The result literal has the
+  // same rank and layout as for the given literal. The number of indices in
+  // start_indices and limit_indices must be the rank of the literal, and the
+  // indices follow the order of the dimensions.
+  // This literal must be an array.
+  std::unique_ptr<Literal> Slice(
+      tensorflow::gtl::ArraySlice<int64> start_indices,
+      tensorflow::gtl::ArraySlice<int64> limit_indices) const;
+
+  // Creates a literal with a prepended dimension with bound "times"; e.g. a
+  // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from this
+  // literal replicated four times.
+  // This literal must be an array.
+  template <typename NativeT>
+  std::unique_ptr<Literal> Replicate(int64 times) const;
+
+  // Creates a new Literal object with the shape specified as parameter.
+  // The content of the literal values is the default value of the primitive
+  // type of literal itself (0 for numeric types, and false for predicates).
+  //
+  // Note: It's an antipattern to use this method then immediately call
+  // Literal::Populate on the result (since that results in zero initialization,
+  // then reinitialization. Conside if a call to MakeUnique<Literal>(shape),
+  // followed by the call to Literal::Populate can be used instead.
+  static std::unique_ptr<Literal> CreateFromShape(const Shape& shape);
+
+ protected:
+  // A data structure representing a subshape at a particular ShapeIndex within
+  // the literal. For array-shaped ShapeIndexes, this data structure holds the
+  // pointer to the memory allocated for the array data.
+  class Piece {
+   public:
+    // Returns the buffer holding the array data for this piece as an array
+    // slice. This piece must be array-shaped.
+    template <typename NativeT>
+    tensorflow::gtl::ArraySlice<NativeT> data() const;
+    template <typename NativeT>
+    tensorflow::gtl::MutableArraySlice<NativeT> data();
+
+    // Returns the buffer holding the array data for this piece as a void*. This
+    // piece must be array-shaped.
+    void* untyped_data();
+    const void* untyped_data() const;
+
+    // Gets or sets an element in the array at the given index. The multi_index
+    // is CHECKed against the dimension sizes of the array.  This piece must be
+    // array-shaped.
+    template <typename NativeT>
+    NativeT Get(tensorflow::gtl::ArraySlice<int64> index) const;
+    template <typename NativeT>
+    void Set(tensorflow::gtl::ArraySlice<int64> index, NativeT value);
+
+    // Gets/sets the buffer holding the array data.
+    char* buffer() const { return buffer_; }
+    void set_buffer(char* buffer) { buffer_ = buffer; }
+
+    // The array of multi-indices that provide the locations of non-zero
+    // elements in a sparse array.  Only used if
+    // LayoutUtil::IsSparseArray(shape()) is true.
+    SparseIndexArray* sparse_indices() const { return sparse_indices_; }
+    void set_sparse_indices(SparseIndexArray* sparse_indices) {
+      sparse_indices_ = sparse_indices;
+    }
+
+    // Gets or sets the subshape of this piece. This reference points to a
+    // subshape within the shape in the containing Literal (Literal::shape_).
+    const Shape& subshape() const { return *subshape_; }
+    void set_subshape(const Shape* subshape) { subshape_ = subshape; }
+
+    // Returns the size in bytes of the buffer holding the array data.
+    int64 size_bytes() const { return ShapeUtil::ByteSizeOf(subshape()); }
+
+    // Returns the number of elements in this piece's array.
+    int64 element_count() const {
+      // If this is a sparse array, use the number of elements represented by
+      // the indices in the associated SparseIndexArray.
+      return LayoutUtil::IsSparseArray(subshape())
+                 ? sparse_indices()->index_count()
+                 : ShapeUtil::ElementsIn(subshape());
+    }
+
+    // Returns the child piece at 'index' of this piece.
+    Piece& child(int64 index) { return children_[index]; }
+
+    // Adds a child piece to this piece's children.
+    void emplace_back(Piece child_piece) {
+      children_.emplace_back(std::move(child_piece));
+    }
+
+    // Returns the size of children pieces of this piece.
+    int64 children_size() { return children_.size(); }
+
+    // Visitor functions that recursively traverses the piece and calls the
+    // given function at each child piece. The function has the type:
+    //    void (const ShapeIndex& index, const Piece& piece)
+    template <typename Fn>
+    void ForEachSubpiece(const Fn& func) const {
+      ShapeIndex index;
+      return ForEachHelper(
+                 [&func](const ShapeIndex& index, const Piece& piece) {
+                   func(index, piece);
+                   return Status::OK();
+                 },
+                 *this, &index)
+          .IgnoreError();
+    }
+    // Same as above, but the function has the type:
+    //    Status (const ShapeIndex& index, const Piece& piece)
+    // The first non-OK return value is returned by the function.
+    template <typename Fn>
+    Status ForEachSubpieceWithStatus(const Fn& func) const {
+      ShapeIndex index;
+      return ForEachHelper(func, *this, &index);
+    }
+    // Same as above, but the function has the type:
+    //    Bool (const ShapeIndex& index, const Piece& piece)
+    // The first non-true return value is returned by the function.
+    template <typename Fn>
+    bool ForEachSubpieceWithBool(const Fn& func) const {
+      ShapeIndex index;
+      return ForEachHelperBool(func, *this, &index);
+    }
+    // Same as above, but the function has the type:
+    //    Void (const ShapeIndex& index, Piece& piece)
+    template <typename Fn>
+    void ForEachMutableSubpiece(const Fn& func) {
+      ShapeIndex index;
+      return ForEachMutableHelper(
+                 [&func](const ShapeIndex& index, Piece* piece) {
+                   func(index, piece);
+                   return Status::OK();
+                 },
+                 const_cast<xla::LiteralBase::Piece*>(this), &index)
+          .IgnoreError();
+    }
+    // Same as above, but the function has the type:
+    //    Status (const ShapeIndex& index, Piece& piece)
+    // The first non-OK return value is returned by the function.
+    template <typename Fn>
+    Status ForEachMutableSubpieceWithStatus(const Fn& func) {
+      ShapeIndex index;
+      return ForEachMutableHelper(
+          func, const_cast<xla::LiteralBase::Piece*>(this), &index);
+    }
+
+    // Returns true if this piece and 'other' contain the same data. This piece
+    // and 'other' must be array-shaped and compatible.
+    bool EqualElements(const Piece& other) const;
+
+    // Writes the shape and data (if array-shaped) into the given proto.
+    void WriteToProto(LiteralProto* proto) const;
+
+    // Copy the data from 'src' into this piece's buffer. Shapes of this piece
+    // and src must be compatible.
+    Status CopyFrom(const Piece& src);
+
+    // Copies the data from the given proto into this piece. The shape of this
+    // piece must be equal (not just compatible) to the shape of the proto.
+    Status CopyFromProto(const LiteralProto& proto);
+
+    // Sorts the elements in a sparse array.
+    void SortSparseElements();
+
+   private:
+    // Helpers for traversing the piece via ForEachSubpiece rooted at 'index'.
+    // The first non-OK (or non-true) value is returned by the function.
+    // The callable 'func' has the same signature as described above in
+    // ForEachSubpiece*.
+    template <typename Fn>
+    Status ForEachHelper(const Fn& func, const Piece& piece,
+                         ShapeIndex* index) const {
+      TF_RETURN_IF_ERROR(func(*index, piece));
+      for (int64 i = 0; i < piece.children_.size(); ++i) {
+        index->push_back(i);
+        TF_RETURN_IF_ERROR(ForEachHelper(func, piece.children_[i], index));
+        index->pop_back();
+      }
+      return Status::OK();
+    }
+    template <typename Fn>
+    bool ForEachHelperBool(const Fn& func, const Piece& piece,
+                           ShapeIndex* index) const {
+      if (!func(*index, piece)) {
+        return false;
+      }
+      for (int64 i = 0; i < piece.children_.size(); ++i) {
+        index->push_back(i);
+        if (!ForEachHelperBool(func, piece.children_[i], index)) {
+          return false;
+        }
+        index->pop_back();
+      }
+      return true;
+    }
+    template <typename Fn>
+    Status ForEachMutableHelper(const Fn& func, Piece* piece,
+                                ShapeIndex* index) {
+      TF_RETURN_IF_ERROR(func(*index, piece));
+      for (int64 i = 0; i < piece->children_.size(); ++i) {
+        index->push_back(i);
+        TF_RETURN_IF_ERROR(
+            ForEachMutableHelper(func, &piece->children_[i], index));
+        index->pop_back();
+      }
+      return Status::OK();
+    }
+
+    // Recursive helper for EqualElements.
+    template <typename NativeT>
+    bool EqualElementsInternal(const Piece& other,
+                               std::vector<int64>* multi_index) const;
+
+    // Helper for SortSparseElements that has the element type as a template
+    // parameter.
+    template <typename NativeT>
+    void SortSparseElementsInternal();
+
+    // For array-shaped pieces, this is the buffer holding the literal data.
+    char* buffer_ = nullptr;
+
+    // For sparse arrays, this is the array of indices.
+    SparseIndexArray* sparse_indices_ = nullptr;
+
+    // The shape of piece. This points into the shape of the containing Literal
+    // (Literal::shape_).
+    const Shape* subshape_ = nullptr;
+
+    // Children pieces for tuple shaped pieces.
+    std::vector<Piece> children_ = {};
+  };  // class Piece
+
+  const Piece& piece(const ShapeIndex& shape_index) const {
+    Piece* piece = &const_cast<Piece&>(root_piece());
+    for (const auto i : shape_index) {
+      DCHECK_GE(i, 0);
+      DCHECK_LT(i, piece->children_size());
+      piece = &piece->child(i);
+    }
+    return *piece;
+  }
+
+  // Returns the piece at the root of the shape.
+  virtual const Piece& root_piece() const = 0;
+
+  // LiteralSlice and Literal must access Pieces of other Literals.
+  friend class Literal;
+  friend class LiteralSlice;
+  friend class BorrowingLiteral;
+};
+
 // Class representing literal values in XLA.
 //
-// TODO(b/67651157): The methods in this class should be reduced to a minimal
-// set of methods which construct Literals and accessors methods. Other methods
-// which perform computation on Literals (Reshape, Slice, etc) should be moved
-// elsewhere, and perhaps combined with evaluator code which operates on
-// Literals.
-class Literal {
+// The underlying buffer and shape is always owned by this class.
+class Literal : public LiteralBase {
  public:
   Literal() : Literal(ShapeUtil::MakeNil()) {}
 
@@ -80,46 +562,156 @@ class Literal {
   Literal(const Shape& shape, bool allocate_arrays);
   Literal& operator=(Literal&& other);
 
-  // Literals are equal if they have compatible shapes and the same data
-  // values. Layout is not compared.
-  bool operator==(const Literal& other) const;
-  bool operator!=(const Literal& other) const { return !(*this == other); }
+  // TODO(b/67651157): Remove this accessor. Literal users should not be able to
+  // mutate the shape as this can produce malformed Literals.
+  Shape* mutable_shape_do_not_use() { return shape_.get(); }
 
-  // Serialize to and from a proto.
-  static StatusOr<std::unique_ptr<Literal>> CreateFromProto(
-      const LiteralProto& proto);
-  LiteralProto ToProto() const;
+  // Returns a MutableArraySlice view of the array for this literal for the
+  // given NativeT (e.g., float). CHECKs if the subshape of the literal at the
+  // given ShapeIndex is not array. See primitive_util.h for the mapping from
+  // XLA type to native type.
+  template <typename NativeT>
+  tensorflow::gtl::MutableArraySlice<NativeT> data(
+      const ShapeIndex& shape_index = {});
+  // Unhide const method from parent class.
+  using LiteralBase::data;
+
+  // Returns a pointer to the sparse index array. Returns nullptr if the literal
+  // is not a sparse array.
+  SparseIndexArray* sparse_indices(const ShapeIndex& shape_index = {});
+
+  // Returns a pointer to the underlying buffer holding the array at the given
+  // shape index. CHECKs if the subshape of the literal at the given ShapeIndex
+  // is not array.
+  void* untyped_data(const ShapeIndex& shape_index = {});
+  // Unhide const method from parent class.
+  using LiteralBase::untyped_data;
+
+  // Populates a literal with a sparse layout with the given indices and values.
+  // Each index in the indices array is CHECKed against the dimensions in the
+  // literal's shape.  If sort is true, then the indices and values will be
+  // sorted.  If sort is false, then the indices and values are assumed to
+  // already be in sorted order.  See CreateSparse for an example of how data
+  // are populated.
+  template <typename NativeT>
+  void PopulateSparse(SparseIndexArray indices,
+                      tensorflow::gtl::ArraySlice<NativeT> values,
+                      bool sort = true);
+
+  // Copy values from 'src_literal' rooted at 'src_shape_index' into this
+  // literal rooted at 'dest_shape_index'. The subshape of this literal rooted
+  // at 'dest_shape_index' must be compatible with the subshape of 'src_literal'
+  // rooted at 'src_shape_index', but need not be arrays.
+  Status CopyFrom(const LiteralSlice& src_literal,
+                  const ShapeIndex& dest_shape_index = {},
+                  const ShapeIndex& src_shape_index = {});
+
+  // Similar to CopyFrom, but with move semantincs. The subshape of this literal
+  // rooted at 'dest_shape_index' must be *equal* to the shape 'src_literal'
+  // (layouts and shapes must match), but need not be arrays. The memory
+  // allocated in this literal for the subshape at dest_shape_index is
+  // deallocated, and the respective buffers are replaced with those in
+  // src_literal. Upon return, src_literal is set to a nil shape (empty tuple).
+  Status MoveFrom(Literal&& src_literal,
+                  const ShapeIndex& dest_shape_index = {});
+
+  // Copies the values from src_literal, starting at src_base shape indexes,
+  // to this literal, starting at dest_base, where the copy size in each
+  // dimension is specified by copy_size.
+  // The src_literal and this literal must have the same primitive type,
+  // src_base+copy_size must fit the source literal dimensions, as well as
+  // dest_base+copy_size must fit the destination literal dimensions.
+  // Note: if either src_literal or this literal contains dimensions with zero
+  // element, then copy_size must be 0 in these dimensions while the
+  // corresponding base indices being 0.
+  // This literal and 'src_literal' must be arrays.
+  Status CopySliceFrom(const LiteralSlice& src_literal,
+                       tensorflow::gtl::ArraySlice<int64> src_base,
+                       tensorflow::gtl::ArraySlice<int64> dest_base,
+                       tensorflow::gtl::ArraySlice<int64> copy_size);
 
-  // Return the shape of the literal.
-  const Shape& shape() const { return shape_; }
+  // Copies one element from src_literal[src_index] to (*this)[dest_index].
+  Status CopyElementFrom(const LiteralSlice& src_literal,
+                         tensorflow::gtl::ArraySlice<int64> src_index,
+                         tensorflow::gtl::ArraySlice<int64> dest_index);
+
+  // Sets an element in the literal at the given index. The multi_index is
+  // CHECKed against the dimension sizes.
+  template <typename NativeT>
+  void Set(tensorflow::gtl::ArraySlice<int64> multi_index,
+           const ShapeIndex& shape_index, NativeT value);
+  // Overloads of Set for array literals. CHECKs if the literal is not
+  // array-shaped and dense.
+  template <typename NativeT>
+  void Set(tensorflow::gtl::ArraySlice<int64> multi_index, NativeT value);
+
+  // Appends the given element to the literal.  If the elements are not appended
+  // in sorted order, then SortSparseElements should be called before calling
+  // other methods.  This literal must have a sparse layout.
+  template <typename NativeT>
+  void AppendSparseElement(tensorflow::gtl::ArraySlice<int64> multi_index,
+                           NativeT value, const ShapeIndex& shape_index = {});
+
+  // Sorts the elements in a sparse array.
+  void SortSparseElements(const ShapeIndex& shape_index = {});
+
+  // As Set(), but truncates `value` to the literal element type before storing.
+  // This literal must be an array.
+  Status SetIntegralAsS64(tensorflow::gtl::ArraySlice<int64> multi_index,
+                          int64 value);
+
+  // Populate this literal with the given values. Examples:
+  //
+  //   // Populate with floats.
+  //   Array2D<float> float_values = ...
+  //   literal.PopulateR2FromArray2D(values);
+  //
+  //   // Populate with int32s.
+  //   literal.PopulateR2<int32>({{1, 2}, {3, 4}});
+  //
+  // The shape and element type of this literal must match given values. For
+  // example, in the call above to literal.PopulateR2(), 'literal' must be a 2x2
+  // array of S32.
+  template <typename NativeT>
+  void PopulateR1(tensorflow::gtl::ArraySlice<NativeT> values);
+  void PopulateR1(const tensorflow::core::Bitmap& values);
+  template <typename NativeT>
+  void PopulateR2(std::initializer_list<std::initializer_list<NativeT>> values);
+  template <typename NativeT>
+  void PopulateFromArray(const Array<NativeT>& values);
+  template <typename NativeT>
+  void PopulateR2FromArray2D(const Array2D<NativeT>& values);
+  template <typename NativeT>
+  void PopulateR3FromArray3D(const Array3D<NativeT>& values);
+  template <typename NativeT>
+  void PopulateR4FromArray4D(const Array4D<NativeT>& values);
+
+  // Populates literal values by calling the generator function for every cell
+  // in this literal object.
+  //
+  // generator must be a callable of the type
+  // NativeT(tensorflow::gtl::ArraySlice<int64> indexes) or compatible.
+  //
+  // This literal must have a dense layout.
+  template <typename NativeT, typename FnType>
+  Status Populate(const FnType& generator);
 
-  // TODO(b/67651157): Remove this accessor. Literal users should not be able to
-  // mutate the shape as this can produce malformed Literals.
-  Shape* mutable_shape_do_not_use() { return &shape_; }
+  // A parallel version of Populate(). This can be used if the generator is
+  // thread-safe and the values for the shape's different elements are
+  // independent.
+  template <typename NativeT, typename FnType>
+  Status PopulateParallel(const FnType& generator);
 
-  // Returns a (Mutable)ArraySlice view of the array for this literal for the
-  // given NativeT (e.g., float). CHECKs if the subshape of the literal at the
-  // given ShapeIndex is not array. See primitive_util.h for the mapping from
-  // XLA type to native type.
-  template <typename NativeT>
-  tensorflow::gtl::ArraySlice<NativeT> data(
-      const ShapeIndex& shape_index = {}) const;
+  // Fills this literal with the given value.
   template <typename NativeT>
-  tensorflow::gtl::MutableArraySlice<NativeT> data(
-      const ShapeIndex& shape_index = {});
+  void PopulateWithValue(NativeT value);
 
-  // Returns a pointer to the sparse index array. Returns nullptr if the literal
-  // is not a sparse array.
-  const SparseIndexArray* sparse_indices(
-      const ShapeIndex& shape_index = {}) const;
-  SparseIndexArray* sparse_indices(const ShapeIndex& shape_index = {});
+  // Factory methods below.
+  //
 
-  // Returns a pointer to (or size of) the underlying buffer holding the array
-  // at the given shape index. CHECKs if the subshape of the literal at the
-  // given ShapeIndex is not array.
-  const void* untyped_data(const ShapeIndex& shape_index = {}) const;
-  void* untyped_data(const ShapeIndex& shape_index = {});
-  int64 size_bytes(const ShapeIndex& shape_index = {}) const;
+  // Serialize from a proto.
+  static StatusOr<std::unique_ptr<Literal>> CreateFromProto(
+      const LiteralProto& proto);
 
   // Creates a new literal of a given rank. To minimize ambiguity (for users
   // and the compiler) these CreateR[0-2] methods should explicitly specify the
@@ -167,10 +759,6 @@ class Literal {
           values,
       const Layout& layout);
 
-  // Returns this literal's data as a string. This literal must be a rank-1 U8
-  // array.
-  string GetR1U8AsString() const;
-
   // Creates a literal with a sparse layout and the given indices and values.
   // The shape is initialized from the given dimensions.  The minor dimension of
   // the indices array must equal the rank of the shape (i.e. size of the
@@ -210,171 +798,16 @@ class Literal {
       tensorflow::gtl::ArraySlice<int64> dimensions, SparseIndexArray indices,
       tensorflow::gtl::ArraySlice<NativeT> values, bool sort = true);
 
-  // Populates a literal with a sparse layout with the given indices and values.
-  // Each index in the indices array is CHECKed against the dimensions in the
-  // literal's shape.  If sort is true, then the indices and values will be
-  // sorted.  If sort is false, then the indices and values are assumed to
-  // already be in sorted order.  See CreateSparse for an example of how data
-  // are populated.
-  template <typename NativeT>
-  void PopulateSparse(SparseIndexArray indices,
-                      tensorflow::gtl::ArraySlice<NativeT> values,
-                      bool sort = true);
-
-  // Creates a new Literal object with the shape specified as parameter.
-  // The content of the literal values is the default value of the primitive
-  // type of literal itself (0 for numeric types, and false for predicates).
-  static std::unique_ptr<Literal> CreateFromShape(const Shape& shape);
-
-  // Creates a new Literal object with its values havings the primitive_type
-  // type, and with dimensions defined by the dimensions parameter.
-  // The content of the literal values is the default value of the primitive
-  // type of literal itself (0 for numeric types, and false for predicates).
-  static std::unique_ptr<Literal> CreateFromDimensions(
-      PrimitiveType primitive_type,
-      tensorflow::gtl::ArraySlice<int64> dimensions);
-
-  // Copy values from 'src_literal' rooted at 'src_shape_index' into this
-  // literal rooted at 'dest_shape_index'. The subshape of this literal rooted
-  // at 'dest_shape_index' must be compatible with the subshape of 'src_literal'
-  // rooted at 'src_shape_index', but need not be arrays.
-  Status CopyFrom(const Literal& src_literal,
-                  const ShapeIndex& dest_shape_index = {},
-                  const ShapeIndex& src_shape_index = {});
-
-  // Similar to CopyFrom, but with move semantincs. The subshape of this literal
-  // rooted at 'dest_shape_index' must be *equal* to the shape 'src_literal'
-  // (layouts and shapes must match), but need not be arrays. The memory
-  // allocated in this literal for the subshape at dest_shape_index is
-  // deallocated, and the respective buffers are replaced with those in
-  // src_literal. Upon return, src_literal is set to a nil shape (empty tuple).
-  Status MoveFrom(Literal&& src_literal,
-                  const ShapeIndex& dest_shape_index = {});
-
-  // Copies the values from src_literal, starting at src_base shape indexes,
-  // to this literal, starting at dest_base, where the copy size in each
-  // dimension is specified by copy_size.
-  // The src_literal and this literal must have the same primitive type,
-  // src_base+copy_size must fit the source literal dimensions, as well as
-  // dest_base+copy_size must fit the destination literal dimensions.
-  // Note: if either src_literal or this literal contains dimensions with zero
-  // element, then copy_size must be 0 in these dimensions while the
-  // corresponding base indices being 0.
-  // This literal and 'src_literal' must be arrays.
-  Status CopySliceFrom(const Literal& src_literal,
-                       tensorflow::gtl::ArraySlice<int64> src_base,
-                       tensorflow::gtl::ArraySlice<int64> dest_base,
-                       tensorflow::gtl::ArraySlice<int64> copy_size);
-
-  // Copies one element from src_literal[src_index] to (*this)[dest_index].
-  Status CopyElementFrom(const Literal& src_literal,
-                         tensorflow::gtl::ArraySlice<int64> src_index,
-                         tensorflow::gtl::ArraySlice<int64> dest_index);
-
-  // Returns a vector containing the tuple elements of this Literal as separate
-  // Literals. This Literal must be tuple-shaped and can be a nested tuple. The
-  // elements are moved into the new Literals; no data is copied. Upon return
-  // this Literal is set to a nil shape (empty tuple)
-  std::vector<Literal> DecomposeTuple();
-
-  // This operation is the inverse of DecomposeTuple. The given elements are
-  // moved into the tuple elements of a new tuple-shaped Literal which is
-  // returned. Upon return, each of the Literals in 'elements' is set to a nil
-  // shape (empty tuple).
-  static Literal MoveIntoTuple(
-      tensorflow::gtl::MutableArraySlice<Literal> elements);
-
-  // Creates a new value that has the equivalent value as this literal, but
-  // conforms to new_layout; e.g. a literal matrix that was in {0, 1}
-  // minor-to-major dimension layout can be re-laid-out as {1, 0}
-  // minor-to-major dimension layout and the value in the cell at any given
-  // logical index (i0, i1) will be the same.
-  //
-  // For tuple shaped literals, shape_index should be used to select the inner
-  // array that the new layout applies to.
-  //
-  // Note: this is useful when the client wants to ensure that a value placed in
-  // the XLA allocation tracker has a particular layout; for efficiency
-  // purposes or avoiding unimplemented operation/layout combinations.
-  std::unique_ptr<Literal> Relayout(const Layout& new_layout,
-                                    const ShapeIndex& shape_index = {}) const;
-
-  // An overload of Relayout which changes the layout of the entire shape rather
-  // than being limited to a single array within the shape.
-  std::unique_ptr<Literal> Relayout(const Shape& shape_with_layout) const;
-
-  // Creates a new literal by reshaping this literal to have the given
-  // dimensions. The total number of elements must not change; The
-  // implementation currently only supports monotonic dim0-major layouts.
-  // This literal must be an array.
-  StatusOr<std::unique_ptr<Literal>> Reshape(
-      tensorflow::gtl::ArraySlice<int64> dimensions) const;
-
-  // Creates a new literal by reordering the dimensions of this literal.
-  // The given `permutation` must be a permutation of the dimension numbers
-  // in the original literal, and it specifies the order of the new dimensions
-  // in the result literal (i.e., new_order[i] = old_order[permutation[i]]).
-  // For example, a transpose call on a literal of shape [3 x 8 x 4] and
-  // `permutation` = {2, 0, 1} returns a new literal of shape [4 x 3 x 8].
-  // This literal must be an array.
-  std::unique_ptr<Literal> Transpose(
-      tensorflow::gtl::ArraySlice<int64> permutation) const;
-
-  // Creates a sub-array from this literal by extracting the indices
-  // [start_index, limit_index) of each dimension. The result literal has the
-  // same rank and layout as for the given literal. The number of indices in
-  // start_indices and limit_indices must be the rank of the literal, and the
-  // indices follow the order of the dimensions.
-  // This literal must be an array.
-  std::unique_ptr<Literal> Slice(
-      tensorflow::gtl::ArraySlice<int64> start_indices,
-      tensorflow::gtl::ArraySlice<int64> limit_indices) const;
-
-  // Creates a literal with a prepended dimension with bound "times"; e.g. a
-  // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from this
-  // literal replicated four times.
-  // This literal must be an array.
-  template <typename NativeT>
-  std::unique_ptr<Literal> Replicate(int64 times) const;
-
-  // Converts this literal to another primitive type using
-  // static_cast<>. Returns an error if the conversion is not possible. This
-  // literal must be array-shaped.
-  StatusOr<std::unique_ptr<Literal>> Convert(
-      PrimitiveType primitive_dest_type) const;
-
-  // Converts this literal to another primitive type using a bitcast
-  // conversion. The to and from primitive types must have the same bit
-  // width. Returns an error if the conversion is not possible. This literal
-  // must be array-shaped.
-  StatusOr<std::unique_ptr<Literal>> BitcastConvert(
-      PrimitiveType primitive_dest_type) const;
-
-  // Converts this literal to the given shape. Returns an error is the
-  // conversion is not possible.
-  //
-  // round_f32_to_bf16: if true, converting F32 elements to BF16 uses rounding
-  // instead of truncation; otherwise, truncation is used.
-  //
-  // TODO(b/69266521): remove the round_to_bfloat16 flag when rounding becomes
-  // the default behavior.
-  StatusOr<std::unique_ptr<Literal>> ConvertToShape(
-      const Shape& dest_shape, bool round_f32_to_bf16 = false) const;
-
   // Creates a scalar literal value zero of the given primitive type.
   static Literal Zero(PrimitiveType primitive_type);
-
   // Creates a scalar literal value one of the given primitive type.
   static Literal One(PrimitiveType primitive_type);
-
   // Creates a scalar literal value containing the minimum value of the given
   // primitive type. For floating-point types, returns -inf.
   static Literal MinValue(PrimitiveType primitive_type);
-
   // Creates a scalar literal value containing the maximum value of the given
   // primitive type. For floating-point types, returns inf.
   static Literal MaxValue(PrimitiveType primitive_type);
-
   // Creates a literal of the given shape where each element is `value`.
   template <typename NativeT>
   static std::unique_ptr<Literal> CreateFullWithDescendingLayout(
@@ -423,84 +856,11 @@ class Literal {
       int64 projection);
 
   // Creates a literal that projects the (x, y) dimensions given in values into
-  // the z and p dimensions given.
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR4Projected(
-      std::initializer_list<std::initializer_list<NativeT>> values,
-      int64 projection_p, int64 projection_z);
-
-  // Clones this literal into a new Literal, or new std::unique_ptr<Literal>.
-  Literal Clone() const;
-  std::unique_ptr<Literal> CloneToUnique() const;
-
-  // Gets or sets an element in the literal at the given index. The multi_index
-  // is CHECKed against the dimension sizes.
-  template <typename NativeT>
-  NativeT Get(tensorflow::gtl::ArraySlice<int64> multi_index,
-              const ShapeIndex& shape_index) const;
-  template <typename NativeT>
-  void Set(tensorflow::gtl::ArraySlice<int64> multi_index,
-           const ShapeIndex& shape_index, NativeT value);
-
-  // Overloads of Get and Set for array literals. CHECKs if the literal is not
-  // array-shaped and dense.
-  template <typename NativeT>
-  NativeT Get(tensorflow::gtl::ArraySlice<int64> multi_index) const;
-  template <typename NativeT>
-  void Set(tensorflow::gtl::ArraySlice<int64> multi_index, NativeT value);
-
-  // Returns the multi-index of the element in a sparse literal at the given
-  // sparse element number.  The sparse element number is the position with in
-  // the sparse array's list of (index, value) pairs, and is checked against the
-  // total number of (index, value) pairs in the sparse array.
-  tensorflow::gtl::ArraySlice<int64> GetSparseIndex(
-      int64 sparse_element_number, const ShapeIndex& shape_index = {}) const;
-
-  // Returns the value of the element in a sparse literal at the given sparse
-  // element number.  The sparse element number is the position with in the
-  // sparse array's list of (index, value) pairs, and is checked against the
-  // total number of (index, value) pairs in the sparse array.
-  template <typename NativeT>
-  NativeT GetSparseElement(int64 sparse_element_number,
-                           const ShapeIndex& shape_index = {}) const;
-
-  // Appends the given element to the literal.  If the elements are not appended
-  // in sorted order, then SortSparseElements should be called before calling
-  // other methods.  This literal must have a sparse layout.
-  template <typename NativeT>
-  void AppendSparseElement(tensorflow::gtl::ArraySlice<int64> multi_index,
-                           NativeT value, const ShapeIndex& shape_index = {});
-
-  // Sorts the elements in a sparse array.
-  void SortSparseElements(const ShapeIndex& shape_index = {});
-
-  // Returns the element value at index (0, ..., 0), however many zeroes are
-  // required for that index.
-  template <typename NativeT>
-  NativeT GetFirstElement() const;
-
-  // Returns a literal scalar representing the first element.
-  Literal GetFirstScalarLiteral() const;
-
-  // As Get(), but determines the correct type and converts the value
-  // into text.
-  string GetAsString(tensorflow::gtl::ArraySlice<int64> multi_index,
-                     const ShapeIndex& shape_index = {}) const;
-
-  // As GetSparseElement(), but determines the correct type and converts the
-  // value into text.
-  string GetSparseElementAsString(int64 sparse_element_number,
-                                  const ShapeIndex& shape_index = {}) const;
-
-  // As Get(), but determines the correct type and converts the value into
-  // int64.  This literal must be an array.
-  StatusOr<int64> GetIntegralAsS64(
-      tensorflow::gtl::ArraySlice<int64> multi_index) const;
-
-  // As Set(), but truncates `value` to the literal element type before storing.
-  // This literal must be an array.
-  Status SetIntegralAsS64(tensorflow::gtl::ArraySlice<int64> multi_index,
-                          int64 value);
+  // the z and p dimensions given.
+  template <typename NativeT>
+  static std::unique_ptr<Literal> CreateR4Projected(
+      std::initializer_list<std::initializer_list<NativeT>> values,
+      int64 projection_p, int64 projection_z);
 
   // Returns an identity matrix (rank 2) with the given row and column count.
   template <typename NativeT>
@@ -511,6 +871,9 @@ class Literal {
   static std::unique_ptr<Literal> MakeTuple(
       tensorflow::gtl::ArraySlice<const Literal*> elements);
 
+  static std::unique_ptr<Literal> MakeTupleFromSlices(
+      tensorflow::gtl::ArraySlice<LiteralSlice> elements);
+
   // As above, but intended to be invoked with move semantics; i.e.
   //
   //  std::vector<std::unique_ptr<Literal>> elements = ...;
@@ -542,135 +905,104 @@ class Literal {
     return MakeTupleOwned(std::move(v));
   }
 
-  // Returns a string representation of the literal value.
-  // Warning: this function can take minutes for multi-million element Literals.
-  string ToString(bool print_layout = false) const;
-
-  // Invokes the "per cell" callback for each element in the provided
-  // literal with the element's indices and a string representation of
-  // the element's value.
-  //
-  // This function is useful if you want a polymorphic representation
-  // of the tensor's elements (turning it to a string for something
-  // like representation in a protobuf).
-  //
-  // This literal must have a dense layout.
-  void EachCellAsString(
-      const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
-                               const string& value)>& per_cell) const;
-  template <typename NativeT>
-  void EachCell(std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
-                                   NativeT value)>
-                    per_cell) const;
-
-  // Populate this literal with the given values. Examples:
-  //
-  //   // Populate with floats.
-  //   Array2D<float> float_values = ...
-  //   literal.PopulateR2FromArray2D(values);
-  //
-  //   // Populate with int32s.
-  //   literal.PopulateR2<int32>({{1, 2}, {3, 4}});
-  //
-  // The shape and element type of this literal must match given values. For
-  // example, in the call above to literal.PopulateR2(), 'literal' must be a 2x2
-  // array of S32.
-  template <typename NativeT>
-  void PopulateR1(tensorflow::gtl::ArraySlice<NativeT> values);
-  void PopulateR1(const tensorflow::core::Bitmap& values);
-  template <typename NativeT>
-  void PopulateR2(std::initializer_list<std::initializer_list<NativeT>> values);
-  template <typename NativeT>
-  void PopulateFromArray(const Array<NativeT>& values);
-  template <typename NativeT>
-  void PopulateR2FromArray2D(const Array2D<NativeT>& values);
-  template <typename NativeT>
-  void PopulateR3FromArray3D(const Array3D<NativeT>& values);
-  template <typename NativeT>
-  void PopulateR4FromArray4D(const Array4D<NativeT>& values);
-
-  // Populates literal values by calling the generator function for every cell
-  // in this literal object.
-  //
-  // generator must be a callable of the type
-  // NativeT(tensorflow::gtl::ArraySlice<int64> indexes) or compatible.
-  //
-  // This literal must have a dense layout.
-  template <typename NativeT, typename FnType>
-  Status Populate(const FnType& generator);
-
-  // A parallel version of Populate(). This can be used if the generator is
-  // thread-safe and the values for the shape's different elements are
-  // independent.
-  template <typename NativeT, typename FnType>
-  Status PopulateParallel(const FnType& generator);
+  // Returns a vector containing the tuple elements of this Literal as separate
+  // Literals. This Literal must be tuple-shaped and can be a nested tuple. The
+  // elements are moved into the new Literals; no data is copied. Upon return
+  // this Literal is set to a nil shape (empty tuple)
+  std::vector<Literal> DecomposeTuple();
 
-  // Fills this literal with the given value.
-  template <typename NativeT>
-  void PopulateWithValue(NativeT value);
+  // This operation is the inverse of DecomposeTuple. The given elements are
+  // moved into the tuple elements of a new tuple-shaped Literal which is
+  // returned. Upon return, each of the Literals in 'elements' is set to a nil
+  // shape (empty tuple).
+  static Literal MoveIntoTuple(
+      tensorflow::gtl::MutableArraySlice<Literal> elements);
 
-  // Returns whether every element in this literal is equal to value.
-  //
-  // value is an int8 because we expect this to be called with small
-  // compile-time constants (0, -1, etc.) and so that whatever value you pass
-  // can be represented exactly by floating-point types as small as 16 bits.
-  //
-  // If value doesn't fit in this literal's type, returns false.  Values of 1/0
-  // are considered equal to true/false; other values are not considered equal
-  // to true. Also if this literal is not array-shaped false is returned.
-  bool IsAll(int8 value) const;
+  // Creates a new Literal object with its values havings the primitive_type
+  // type, and with dimensions defined by the dimensions parameter.
+  // The content of the literal values is the default value of the primitive
+  // type of literal itself (0 for numeric types, and false for predicates).
+  static std::unique_ptr<Literal> CreateFromDimensions(
+      PrimitiveType primitive_type,
+      tensorflow::gtl::ArraySlice<int64> dimensions);
 
-  // Like IsAll(const Literal&, int8), except we check whether the literal is
-  // equal to a particular floating-point number.
-  //
-  // If the literal is not a floating-point value, this always returns false.
-  //
-  // This casts value to the type of literal, then compares using ==.  The usual
-  // admonishments about floating-point equality checks apply.  We expect you to
-  // use this to check for values that can be expressed precisely as a float,
-  // e.g. -0.5.  Also if this literal is not array-shaped false is returned.
-  bool IsAllFloat(float value) const;
+  // If the given literal's data type is bfloat16, converts it to a float
+  // literal; otherwise, returns a copy of it. If the literal is a tuple,
+  // recursively converts its elements.
+  static std::unique_ptr<Literal> ConvertBF16ToF32(
+      const LiteralSlice& bf16_literal);
+
+  // If the given literal's data type is float, converts it to a bfloat16
+  // literal; otherwise, returns a copy of it. If the literal is a tuple,
+  // recursively converts its elements.
+  static std::unique_ptr<Literal> ConvertF32ToBF16(
+      const LiteralSlice& f32_literal);
+
+  // Creates a literal with a new shape with the given new dimensions using the
+  // data in the given input literal. For reshaping purposes the (flat) data
+  // buffer of the input literal is assumed to have the given minor_to_major
+  // layout order.
+  static std::unique_ptr<Literal> ReshapeSlice(
+      tensorflow::gtl::ArraySlice<int64> new_dimensions,
+      tensorflow::gtl::ArraySlice<int64> minor_to_major,
+      const LiteralSlice& literal);
+
+  // Creates a literal with the supplied shape, and uses the provided value
+  // generator to populate the literal's values.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape,
+      const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator);
+
+  // Creates a literal with the supplied shape, and initializes the literal
+  // values using a normal distribution with given mean and stddev standard
+  // deviation, and using the engine as entropy generator.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type, typename E,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape, E* engine, T mean, T stddev);
+
+  // Creates a literal with the supplied shape, and initializes the literal
+  // values using a normal distribution with given mean and stddev standard
+  // deviation.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape, T mean, T stddev);
 
-  // Like IsAll(const Literal&, int8), except we check whether the literal is
-  // equal to a particular complex number.
-  //
-  // If the literal is not a complex value, this always returns false.
-  //
-  // This casts value to the type of literal, then compares using ==.  The usual
-  // admonishments about floating-point equality checks apply.  We expect you to
-  // use this to check for complex values that can be expressed precisely as
-  // float pairs e.g. (-0.5, 1.0).
   //
-  // This literal must have a dense layout.
-  bool IsAllComplex(complex64 value) const;
+  // End of factory methods.
 
-  // Literal consists entirely of the first element of the literal.
-  bool IsAllFirst() const;
+  // Returns a multi-dimensional index as a string. For example: '{7, 8}' will
+  // be returned for a 2-dimensional index with dimension 0 index equal to 7,
+  // dimension 1 equal to 8.
+  static string MultiIndexAsString(
+      tensorflow::gtl::ArraySlice<int64> multi_index);
 
-  // Returns whether this literal is zero at the specified index. This literal
-  // must be an array with a dense layout.
-  bool IsZero(tensorflow::gtl::ArraySlice<int64> indices) const;
+ private:
+  // Recursively sets the subshapes and buffers of all subpieces rooted at
+  // 'piece'. If 'allocate_array' is true, memory is allocated for the arrays in
+  // the shape.
+  void SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays);
 
-  // Return the count of the elements in the array at the given shape index in
-  // this literal.
-  int64 element_count(const ShapeIndex& index = {}) const {
-    return ShapeUtil::ElementsIn(ShapeUtil::GetSubshape(shape(), index));
+  // Returns the piece at the given ShapeIndex.
+  Piece& piece(const ShapeIndex& shape_index) {
+    return const_cast<Piece&>(LiteralBase::piece(shape_index));
   }
 
-  // Return the count of the elements in the sparse array at the given shape
-  // index in this literal, which will be no larger than
-  // LayoutUtil::MaxSparseElements(SetSubshape(shape(), index).layout()).
-  int64 sparse_element_count() const;
-
-  // Compute a hash for this literal.  This literal must not be a sparse tensor
-  // or a tuple containing a sparse tensor.
-  size_t Hash() const;
+  Piece& root_piece() const override { return *root_piece_; };
 
- protected:
   // Internal template helper for the Literal::CopySliceFrom(), matching its
   // arguments one by one.
   template <typename NativeT>
-  Status CopySliceFromInternal(const Literal& src_literal,
+  Status CopySliceFromInternal(const LiteralBase& src_literal,
                                tensorflow::gtl::ArraySlice<int64> src_base,
                                tensorflow::gtl::ArraySlice<int64> dest_base,
                                tensorflow::gtl::ArraySlice<int64> copy_size);
@@ -698,162 +1030,69 @@ class Literal {
     int64 minor_loop_size = 1;
   };
 
-  // A data structure representing a subshape at a particular ShapeIndex within
-  // the literal. For array-shaped ShapeIndexes, this data structure holds the
-  // pointer to the memory allocated for the array data.
-  class Piece {
-   public:
-    // Return the buffer holding the array data for this piece as an array
-    // slice. This piece must be array-shaped.
-    template <typename NativeT>
-    tensorflow::gtl::ArraySlice<NativeT> data() const;
-    template <typename NativeT>
-    tensorflow::gtl::MutableArraySlice<NativeT> data();
-
-    // Return the buffer holding the array data for this piece as a void*. This
-    // piece must be array-shaped.
-    void* untyped_data();
-    const void* untyped_data() const;
-
-    // Gets or sets an element in the array at the given index. The multi_index
-    // is CHECKed against the dimension sizes of the array.  This piece must be
-    // array-shaped.
-    template <typename NativeT>
-    NativeT Get(tensorflow::gtl::ArraySlice<int64> index) const;
-    template <typename NativeT>
-    void Set(tensorflow::gtl::ArraySlice<int64> index, NativeT value);
-
-    // Gets/sets the buffer holding the array data.
-    char* buffer() const { return buffer_; }
-    void set_buffer(char* buffer) { buffer_ = buffer; }
-
-    // The array of multi-indices that provide the locations of non-zero
-    // elements in a sparse array.  Only used if
-    // LayoutUtil::IsSparseArray(shape()) is true.
-    SparseIndexArray* sparse_indices() const { return sparse_indices_; }
-    void set_sparse_indices(SparseIndexArray* sparse_indices) {
-      sparse_indices_ = sparse_indices;
-    }
-
-    // Gets or sets the subshape of this piece. This reference points to a
-    // subshape within the shape in the containing Literal (Literal::shape_).
-    const Shape& subshape() const { return *subshape_; }
-    void set_subshape(const Shape* subshape) { subshape_ = subshape; }
-
-    // Returns the size in bytes of the buffer holding the array data.
-    int64 size_bytes() const { return ShapeUtil::ByteSizeOf(subshape()); }
-
-    // Returns the number of elements in this piece's array.
-    int64 element_count() const {
-      // If this is a sparse array, use the number of elements represented by
-      // the indices in the associated SparseIndexArray.
-      return LayoutUtil::IsSparseArray(subshape())
-                 ? sparse_indices()->index_count()
-                 : ShapeUtil::ElementsIn(subshape());
-    }
-
-    // Copy the data from 'src' into this piece's buffer. Shapes of this piece
-    // and src must be compatible.
-    Status CopyFrom(const Piece& src);
-
-    // Returns true if this piece and 'other' contain the same data. This piece
-    // and 'other' must be array-shaped and compatible.
-    bool EqualElements(const Piece& other) const;
-
-    // Writes the shape and data (if array-shaped) into the given proto.
-    void WriteToProto(LiteralProto* proto) const;
-
-    // Copies the data from the given proto into this piece. The shape of this
-    // piece must be equal (not just compatible) to the shape of the proto.
-    Status CopyFromProto(const LiteralProto& proto);
-
-    // Sorts the elements in a sparse array.
-    void SortSparseElements();
-
-   private:
-    // Recursive helper for EqualElements.
-    template <typename NativeT>
-    bool EqualElementsInternal(const Piece& other,
-                               std::vector<int64>* multi_index) const;
-
-    // Helper for SortSparseElements that has the element type as a template
-    // parameter.
-    template <typename NativeT>
-    void SortSparseElementsInternal();
-
-    // For array-shaped pieces, this is the buffer holding the literal data.
-    char* buffer_ = nullptr;
-
-    // For sparse arrays, this is the array of indices.
-    SparseIndexArray* sparse_indices_ = nullptr;
-
-    // The shape of piece. This points into the shape of the containing Literal
-    // (Literal::shape_).
-    const Shape* subshape_ = nullptr;
-  };
-
-  // Returns the piece at the given ShapeIndex.
-  Piece& piece(const ShapeIndex& shape_index) {
-    return *pieces_.mutable_element(shape_index);
-  }
-  const Piece& piece(const ShapeIndex& shape_index) const {
-    return pieces_.element(shape_index);
-  }
-
-  // Returns the piece at the root of the shape (empty ShapeIndex).
-  Piece& root_piece() { return piece({}); }
-  const Piece& root_piece() const { return piece({}); }
+  // Literal class always owns the shape. The parent class borrows this shape.
+  std::unique_ptr<Shape> shape_;
 
-  // Deallocate the buffers held by this literal (if the literal owns the
-  // buffer).
-  void DeallocateBuffers();
+  Piece* root_piece_ = nullptr;
 
   // Implementation details shared between Populate() and PopulateParallel()
   template <typename NativeT, typename FnType>
   Status PopulateInternal(const FnType& generator, bool parallel);
 
-  Shape shape_;
-  ShapeTree<Piece> pieces_;
-
-  // Whether the buffers held in pieces_ are owned by this Literal.
-  bool owns_buffers_;
-
-  // LiteralView must access and manipulate Pieces of other Literals.
-  friend class LiteralView;
-};  // namespace xla
+  // Deallocate the buffers held by this literal.
+  void DeallocateBuffers();
 
+  friend class LiteralBase;
+};
 std::ostream& operator<<(std::ostream& out, const Literal& literal);
 
-// A read-only view of a Literal. A LiteralView contains pointers to buffers
-// owned by the viewed Literal.
-//
-// TODO(b/71550060): Replace LiteralView with Literal slice classes (immutable
-// and mutable) similar to (Mutable)ArraySlice.
-class LiteralView : public Literal {
+// A read-only view of a Literal. A LiteralSlice contains pointers to shape and
+// literal buffers always owned by others.
+class LiteralSlice : public LiteralBase {
  public:
-  // Create and return a view of the given literal rooted at the given shape
-  // index within the given literal. A factory is used rather than a public
-  // constructor because only const LiteralViews are supported. It's still
-  // possible to create non-const LiteralViews via the copy constructors, but
-  // the factory method makes it a bit less likely. Implementing literal slices
-  // will fix this undesirable situation (b/71550060).
-  static const LiteralView Create(const Literal& literal,
-                                  const ShapeIndex& view_root = {});
+  LiteralSlice() : LiteralBase() {}
+
+  // Implicit conversion constructors.
+  LiteralSlice(const LiteralBase& literal);
+  LiteralSlice(const LiteralBase& literal, const ShapeIndex& view_root);
 
-  LiteralView(const LiteralView& other);
-  LiteralView& operator=(const LiteralView& other);
+ private:
+  const Piece& root_piece() const override { return *root_piece_; };
+
+  const Piece* root_piece_;  // Not owned.
+};
 
-  virtual ~LiteralView();
+// A read-only Literal where the underlying buffers are never owned by this
+// class.
+class BorrowingLiteral : public LiteralBase {
+ public:
+  BorrowingLiteral() : LiteralBase() {}
+
+  // 'src_buf_ptr' is not owned by this class and must outlive the
+  // lifetime of this class. It points to an appropirately sized buffer with
+  // data interpretered as indicated by 'shape'.
+  // This constructor is only used for array shapes.
+  BorrowingLiteral(const char* src_buf_ptr, const Shape& shape);
+  // Similar as above, except to be used for constructing non-nested tuples.
+  BorrowingLiteral(tensorflow::gtl::ArraySlice<const char*> src_buf_ptrs,
+                   const Shape& shape);
+  // TODO(b/79707221): adding constructors for nested tuples as well.
 
  private:
-  LiteralView(const Literal& literal, const ShapeIndex& view_root);
+  // Recursively builds the subtree for the given piece and sets the subshapes
+  // of the given piece with the given shape.
+  void BuildPieceSubtree(const Shape& shape, Piece* piece);
 
-  // Helper for the copy constructor and copy assignment operator.
-  void CopyFrom(const LiteralView& other);
+  // Accessor for the root piece of this literal.
+  const Piece& root_piece() const override { return root_piece_; };
+  Piece root_piece_;
+
+  // Shape of this literal.
+  const Shape shape_;
 };
 
 template <typename NativeT>
-tensorflow::gtl::ArraySlice<NativeT> Literal::Piece::data() const {
+tensorflow::gtl::ArraySlice<NativeT> LiteralBase::Piece::data() const {
   CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
   CHECK_EQ(subshape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>())
@@ -866,7 +1105,7 @@ tensorflow::gtl::ArraySlice<NativeT> Literal::Piece::data() const {
 }
 
 template <typename NativeT>
-tensorflow::gtl::MutableArraySlice<NativeT> Literal::Piece::data() {
+tensorflow::gtl::MutableArraySlice<NativeT> LiteralBase::Piece::data() {
   CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
   CHECK_EQ(subshape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>())
@@ -879,7 +1118,7 @@ tensorflow::gtl::MutableArraySlice<NativeT> Literal::Piece::data() {
 }
 
 template <typename NativeT>
-NativeT Literal::Piece::Get(
+NativeT LiteralBase::Piece::Get(
     tensorflow::gtl::ArraySlice<int64> multi_index) const {
   CHECK(LayoutUtil::IsDenseArray(subshape()));
   return data<NativeT>()[IndexUtil::MultidimensionalIndexToLinearIndex(
@@ -887,15 +1126,15 @@ NativeT Literal::Piece::Get(
 }
 
 template <typename NativeT>
-void Literal::Piece::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
-                         NativeT value) {
+void LiteralBase::Piece::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
+                             NativeT value) {
   CHECK(LayoutUtil::IsDenseArray(subshape()));
   data<NativeT>()[IndexUtil::MultidimensionalIndexToLinearIndex(
       subshape(), multi_index)] = value;
 }
 
 template <typename NativeT>
-tensorflow::gtl::ArraySlice<NativeT> Literal::data(
+tensorflow::gtl::ArraySlice<NativeT> LiteralBase::data(
     const ShapeIndex& shape_index) const {
   return piece(shape_index).data<NativeT>();
 }
@@ -907,13 +1146,13 @@ tensorflow::gtl::MutableArraySlice<NativeT> Literal::data(
 }
 
 template <typename NativeT>
-inline NativeT Literal::Get(tensorflow::gtl::ArraySlice<int64> multi_index,
-                            const ShapeIndex& shape_index) const {
+inline NativeT LiteralBase::Get(tensorflow::gtl::ArraySlice<int64> multi_index,
+                                const ShapeIndex& shape_index) const {
   return piece(shape_index).Get<NativeT>(multi_index);
 }
 
 template <typename NativeT>
-inline NativeT Literal::Get(
+inline NativeT LiteralBase::Get(
     tensorflow::gtl::ArraySlice<int64> multi_index) const {
   return root_piece().Get<NativeT>(multi_index);
 }
@@ -1160,13 +1399,13 @@ template <typename NativeT>
 }
 
 template <typename NativeT>
-NativeT Literal::GetFirstElement() const {
+NativeT LiteralBase::GetFirstElement() const {
   return data<NativeT>().at(0);
 }
 
 template <typename NativeT>
-NativeT Literal::GetSparseElement(int64 sparse_element_number,
-                                  const ShapeIndex& shape_index) const {
+NativeT LiteralBase::GetSparseElement(int64 sparse_element_number,
+                                      const ShapeIndex& shape_index) const {
   CHECK(
       LayoutUtil::IsSparseArray(ShapeUtil::GetSubshape(shape(), shape_index)));
   return data<NativeT>(shape_index)[sparse_element_number];
@@ -1199,7 +1438,7 @@ template <typename NativeT>
 }
 
 template <typename NativeT>
-void Literal::EachCell(
+void LiteralBase::EachCell(
     std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
                        NativeT value)>
         per_cell) const {
@@ -1375,7 +1614,7 @@ template <typename NativeT>
 }
 
 template <typename NativeT>
-std::unique_ptr<Literal> Literal::Replicate(int64 times) const {
+std::unique_ptr<Literal> LiteralBase::Replicate(int64 times) const {
   DimensionVector bounds = {times};
   bounds.reserve(shape().dimensions_size() + 1);
   for (int64 bound : shape().dimensions()) {
@@ -1410,6 +1649,38 @@ std::unique_ptr<Literal> Literal::Replicate(int64 times) const {
   return literal;
 }
 
+template <PrimitiveType type, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>> Literal::CreateRandomLiteral(
+    const Shape& shape,
+    const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator) {
+  using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
+  TF_RET_CHECK(shape.element_type() == type);
+  auto literal = MakeUnique<Literal>(shape);
+  TF_RETURN_IF_ERROR(literal.get()->Populate<NativeT>(
+      [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+        return generator(indexes);
+      }));
+  return std::move(literal);
+}
+
+template <PrimitiveType type, typename E, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>> Literal::CreateRandomLiteral(
+    const Shape& shape, E* engine, T mean, T stddev) {
+  using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
+  std::normal_distribution<NativeT> generator(mean, stddev);
+  return CreateRandomLiteral<type, NativeT>(
+      shape, [&](tensorflow::gtl::ArraySlice<int64> /*indexes*/) {
+        return generator(*engine);
+      });
+}
+
+template <PrimitiveType type, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>> Literal::CreateRandomLiteral(
+    const Shape& shape, T mean, T stddev) {
+  std::minstd_rand0 engine;
+  return CreateRandomLiteral<type>(shape, &engine, mean, stddev);
+}
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_LITERAL_UTIL_H_
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 61046784e05623cd3117c24ecc6d6c474739bbd5..77f979a0d701f09162e112b69f6128008872aa18 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/layout_util.h"
@@ -974,7 +975,7 @@ TEST_F(LiteralUtilTest, CopyFromTuples) {
                                    Literal::CreateR1<double>({2.0, 4.0}).get(),
                                    &nil_literal});
 
-  EXPECT_EQ(*matrix, LiteralView::Create(*nested_tuple, {0}));
+  EXPECT_EQ(*matrix, LiteralSlice(*nested_tuple, {0}));
   EXPECT_EQ(nested_tuple->Get<int32>({}, {1, 0}), 42);
   EXPECT_EQ(nested_tuple->Get<double>({0}, {1, 1}), 23.0);
   EXPECT_EQ(nested_tuple->Get<double>({1}, {1, 1}), 44.0);
@@ -985,7 +986,7 @@ TEST_F(LiteralUtilTest, CopyFromTuples) {
                                       /*src_shape_index=*/{}));
 
   // The matrix element should be unchanged.
-  EXPECT_EQ(*matrix, LiteralView::Create(*nested_tuple, {0}));
+  EXPECT_EQ(*matrix, LiteralSlice(*nested_tuple, {0}));
 
   // The tuple element should have been copied from 'tuple'.
   EXPECT_EQ(nested_tuple->Get<int32>({}, {1, 0}), -5);
@@ -1065,7 +1066,7 @@ TEST_F(LiteralUtilTest, Populate) {
     Shape shape = ShapeUtil::MakeShapeWithLayout(
         primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
         data.layout);
-    auto literal = Literal::CreateFromShape(shape);
+    auto literal = MakeUnique<Literal>(shape);
     auto generator = [&](ArraySlice<int64> indexes) -> uint32 {
       // Offsets from linear index just to avoid R0 literals to be initialized
       // with zero.
@@ -1107,7 +1108,7 @@ TEST_F(LiteralUtilTest, PopulateParallel) {
     Shape shape = ShapeUtil::MakeShapeWithLayout(
         primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
         data.layout);
-    auto literal = Literal::CreateFromShape(shape);
+    auto literal = MakeUnique<Literal>(shape);
     auto generator = [&](ArraySlice<int64> indexes) -> uint32 {
       // Offsets from linear index just to avoid R0 literals to be initialized
       // with zero.
@@ -1373,36 +1374,36 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) {
   ASSERT_EQ(h1, r[3]);
 }
 
-TEST_F(LiteralUtilTest, LiteralViewTest) {
+TEST_F(LiteralUtilTest, LiteralSliceTest) {
   auto scalar = Literal::CreateR0<float>(1.0);
   auto matrix = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
   auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()});
   auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()});
   Literal nil(ShapeUtil::MakeNil());
 
-  EXPECT_EQ(LiteralView::Create(*scalar, {}), *scalar);
-  EXPECT_EQ(LiteralView::Create(*matrix, {}), *matrix);
-  EXPECT_EQ(LiteralView::Create(*tuple, {}), *tuple);
-  EXPECT_EQ(LiteralView::Create(*nested_tuple, {}), *nested_tuple);
-  EXPECT_EQ(LiteralView::Create(nil, {}), nil);
+  EXPECT_EQ(LiteralSlice(*scalar, {}), *scalar);
+  EXPECT_EQ(LiteralSlice(*matrix, {}), *matrix);
+  EXPECT_EQ(LiteralSlice(*tuple, {}), *tuple);
+  EXPECT_EQ(LiteralSlice(*nested_tuple, {}), *nested_tuple);
+  EXPECT_EQ(LiteralSlice(nil, {}), nil);
 
-  EXPECT_EQ(LiteralView::Create(*tuple, {0}), *scalar);
-  EXPECT_EQ(LiteralView::Create(*tuple, {1}), *matrix);
+  EXPECT_EQ(LiteralSlice(*tuple, {0}), *scalar);
+  EXPECT_EQ(LiteralSlice(*tuple, {1}), *matrix);
 
-  EXPECT_EQ(LiteralView::Create(*nested_tuple, {0}), *tuple);
-  EXPECT_EQ(LiteralView::Create(*nested_tuple, {0, 0}), *scalar);
-  EXPECT_EQ(LiteralView::Create(*nested_tuple, {0, 1}), *matrix);
-  EXPECT_EQ(LiteralView::Create(*nested_tuple, {1}), *scalar);
+  EXPECT_EQ(LiteralSlice(*nested_tuple, {0}), *tuple);
+  EXPECT_EQ(LiteralSlice(*nested_tuple, {0, 0}), *scalar);
+  EXPECT_EQ(LiteralSlice(*nested_tuple, {0, 1}), *matrix);
+  EXPECT_EQ(LiteralSlice(*nested_tuple, {1}), *scalar);
 }
 
-TEST_F(LiteralUtilTest, MutatingLiteralView) {
+TEST_F(LiteralUtilTest, MutatingLiteralSlice) {
   auto scalar = Literal::CreateR0<float>(1.0);
   auto matrix = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
   auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()});
   auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()});
   // Verify that changing the underlying data beneath the view changes the
   // data of the view itself.
-  const auto nested_tuple_view = LiteralView::Create(*nested_tuple);
+  const auto nested_tuple_view = LiteralSlice(*nested_tuple);
   EXPECT_EQ(
       nested_tuple->Get<float>(/*multi_index=*/{}, /*shape_index=*/{0, 0}),
       1.0f);
@@ -1418,19 +1419,57 @@ TEST_F(LiteralUtilTest, MutatingLiteralView) {
             555.0f);
 }
 
-TEST_F(LiteralUtilTest, LiteralViewOfALiteralView) {
+TEST_F(LiteralUtilTest, LiteralSliceOfALiteralSlice) {
   auto scalar = Literal::CreateR0<float>(1.0);
   auto matrix = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
   auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()});
   auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()});
 
-  const auto nested_tuple_view = LiteralView::Create(*nested_tuple);
-  const auto tuple_view =
-      LiteralView::Create(nested_tuple_view, /*view_root=*/{0});
-  const auto matrix_view = LiteralView::Create(tuple_view, /*view_root=*/{1});
+  const auto nested_tuple_view = LiteralSlice(*nested_tuple);
+  const auto tuple_view = LiteralSlice(nested_tuple_view, /*view_root=*/{0});
+  const auto matrix_view = LiteralSlice(tuple_view, /*view_root=*/{1});
   EXPECT_EQ(matrix_view, *Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}}));
 }
 
+TEST_F(LiteralUtilTest, BorrowingLiteralFromOneBufferPtrTest) {
+  std::vector<int64> int64_values = {1, 2, 3};
+  const Shape literal_shape = ShapeUtil::MakeShape(S64, {3});
+
+  BorrowingLiteral literal(reinterpret_cast<const char*>(int64_values.data()),
+                           literal_shape);
+
+  EXPECT_EQ(literal.Get<int64>({0}), 1);
+  EXPECT_EQ(literal.Get<int64>({1}), 2);
+  EXPECT_EQ(literal.Get<int64>({2}), 3);
+}
+
+TEST_F(LiteralUtilTest, BorrowingLiteralFromMultipleBufferPtrsTest) {
+  std::vector<int64> one_two_three = {1, 2, 3};
+  const Shape one_two_three_shape = ShapeUtil::MakeShape(S64, {3});
+
+  std::vector<int64> hundred = {100};
+  const Shape hundred_shape = ShapeUtil::MakeShape(S64, {1});
+
+  std::vector<const char*> src_buf_ptrs;
+  src_buf_ptrs.emplace_back(
+      reinterpret_cast<const char*>(one_two_three.data()));
+  src_buf_ptrs.emplace_back(reinterpret_cast<const char*>(hundred.data()));
+  auto literal_tuple = BorrowingLiteral(
+      src_buf_ptrs,
+      ShapeUtil::MakeTupleShape({one_two_three_shape, hundred_shape}));
+
+  EXPECT_EQ(literal_tuple.Get<int64>(/*multi_index=*/{0}, /*shape_index=*/{0}),
+            1);
+  EXPECT_EQ(literal_tuple.Get<int64>(/*multi_index=*/{0}, /*shape_index=*/{1}),
+            100);
+
+  EXPECT_EQ(literal_tuple.Get<int64>(/*multi_index=*/{1}, /*shape_index=*/{0}),
+            2);
+
+  EXPECT_EQ(literal_tuple.Get<int64>(/*multi_index=*/{2}, /*shape_index=*/{0}),
+            3);
+}
+
 TEST_F(LiteralUtilTest, LiteralMove) {
   std::unique_ptr<Literal> matrix =
       Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
@@ -1533,11 +1572,11 @@ TEST_F(LiteralUtilTest, LiteralMoveAssignment) {
   EXPECT_EQ(literal.Get<float>({1, 1}), 4.0);
 }
 
-TEST_F(LiteralUtilTest, LiteralViewCopy) {
+TEST_F(LiteralUtilTest, LiteralSliceCopy) {
   std::unique_ptr<Literal> matrix =
       Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
-  const auto matrix_view = LiteralView::Create(*matrix);
-  LiteralView matrix_view_copy(matrix_view);
+  const auto matrix_view = LiteralSlice(*matrix);
+  LiteralSlice matrix_view_copy(matrix_view);
 
   EXPECT_EQ(matrix_view_copy.Get<float>({0, 0}), 1.0);
   EXPECT_EQ(matrix_view_copy.Get<float>({0, 1}), 2.0);
diff --git a/tensorflow/compiler/xla/map_util.h b/tensorflow/compiler/xla/map_util.h
index 8db8c6f3de84a6c46625eadbb6b0f83d2262e5f7..3c74e070da529b7f1431e01fbaf31932f582db44 100644
--- a/tensorflow/compiler/xla/map_util.h
+++ b/tensorflow/compiler/xla/map_util.h
@@ -86,11 +86,10 @@ const typename Collection::value_type::second_type& FindOrDefault(
 
 // Inserts the key-value pair into the collection. Dies if key was already
 // present.
-template <class Collection>
-void InsertOrDie(Collection* const collection,
-                 const typename Collection::value_type::first_type& key,
-                 const typename Collection::value_type::second_type& data) {
-  auto p = collection->insert(std::make_pair(key, data));
+template <class Collection, class Key, class Value>
+void InsertOrDie(Collection* const collection, Key&& key, Value&& value) {
+  auto p = collection->insert(
+      std::make_pair(std::forward<Key>(key), std::forward<Value>(value)));
   CHECK(p.second) << "duplicate key: " << key;
 }
 
@@ -101,9 +100,10 @@ bool ContainsKey(const Collection& collection, const Key& key) {
 }
 
 // Inserts `value` into `set`. Dies if it was already present.
-template <class Set>
-void InsertOrDie(Set* const set, const typename Set::value_type& value) {
-  CHECK(set->insert(value).second) << "duplicate value: " << value;
+template <class Set, class Value>
+void InsertOrDie(Set* const set, Value&& value) {
+  CHECK(set->insert(std::forward<Value>(value)).second)
+      << "duplicate value: " << value;
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index ecb87bd8893276fbb9ecffaa0f8a3233d2e0043f..932cce943f7c046a85984e6e5ed6b59dae371473 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -49,9 +49,10 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:executable_build_options",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 044458164ff89c554262e1bdbbd4dbed120a2ff4..cb4dc1782b680fca1485e883343fbb262b86b1d1 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -15,8 +15,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/python/local_computation_builder.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/platform/default/thread_annotations.h"
+#include "tensorflow/core/platform/thread_annotations.h"
 
 namespace xla {
 
@@ -248,7 +249,7 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers(
   return new LocalShapedBuffer(std::move(result_buffer));
 }
 
-LocalComputation::LocalComputation(Computation computation)
+LocalComputation::LocalComputation(XlaComputation computation)
     : computation_(std::move(computation)) {}
 
 StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
@@ -271,7 +272,7 @@ StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
   return new CompiledLocalComputation(std::move(local_executable));
 }
 
-const Computation& LocalComputation::computation() const {
+const XlaComputation& LocalComputation::computation() const {
   return computation_;
 }
 
@@ -281,8 +282,12 @@ StatusOr<Shape> LocalComputation::GetReturnValueShape() const {
   return std::move(*program_shape.mutable_result());
 }
 
+LocalOp::LocalOp(const XlaOp& op) : op_(op) {}
+
+const XlaOp& LocalOp::op() const { return op_; }
+
 LocalComputationBuilder::LocalComputationBuilder(const string& computation_name)
-    : builder_(GetOrCreateLocalClient(), computation_name) {}
+    : builder_(computation_name) {}
 
 void LocalComputationBuilder::SetOpMetadata(const OpMetadata& metadata) {
   builder_.SetOpMetadata(metadata);
@@ -291,19 +296,21 @@ void LocalComputationBuilder::SetOpMetadata(const OpMetadata& metadata) {
 void LocalComputationBuilder::ClearOpMetadata() { builder_.ClearOpMetadata(); }
 
 StatusOr<LocalComputation*> LocalComputationBuilder::Build() {
-  TF_ASSIGN_OR_RETURN(Computation computation, builder_.Build());
+  TF_ASSIGN_OR_RETURN(XlaComputation computation, builder_.Build());
   return new LocalComputation(std::move(computation));
 }
 
-ComputationDataHandle LocalComputationBuilder::Parameter(int64 parameter_number,
-                                                         const Shape& shape,
-                                                         const string& name) {
+LocalOp LocalComputationBuilder::Parameter(int64 parameter_number,
+                                           const Shape& shape,
+                                           const string& name) {
   return builder_.Parameter(parameter_number, shape, name);
 }
 
 std::unique_ptr<Shape> LocalComputationBuilder::GetShape(
-    const ComputationDataHandle& operand) {
-  return builder_.GetShape(operand).ConsumeValueOrDie();
+    const LocalOp& operand) {
+  auto result = MakeUnique<Shape>();
+  *result = builder_.GetShape(operand.op()).ValueOrDie();
+  return result;
 }
 
 StatusOr<Shape> LocalComputationBuilder::GetReturnValueShape() {
@@ -311,222 +318,236 @@ StatusOr<Shape> LocalComputationBuilder::GetReturnValueShape() {
   return program_shape.result();
 }
 
-ComputationDataHandle LocalComputationBuilder::Infeed(const Shape& shape) {
+LocalOp LocalComputationBuilder::Infeed(const Shape& shape) {
   return builder_.Infeed(shape);
 }
 
-void LocalComputationBuilder::Outfeed(const ComputationDataHandle& operand,
+void LocalComputationBuilder::Outfeed(const LocalOp& operand,
                                       const Shape& shape,
                                       const string& outfeed_config) {
-  builder_.Outfeed(operand, shape, outfeed_config);
+  builder_.Outfeed(operand.op(), shape, outfeed_config);
 }
 
-ComputationDataHandle LocalComputationBuilder::ConstantLiteral(
-    const Literal& literal) {
+LocalOp LocalComputationBuilder::ConstantLiteral(const Literal& literal) {
   return builder_.ConstantLiteral(literal);
 }
 
-ComputationDataHandle LocalComputationBuilder::Broadcast(
-    const ComputationDataHandle& operand,
+LocalOp LocalComputationBuilder::Broadcast(
+    const LocalOp& operand,
     tensorflow::gtl::ArraySlice<int64> broadcast_sizes) {
-  return builder_.Broadcast(operand, broadcast_sizes);
+  return builder_.Broadcast(operand.op(), broadcast_sizes);
 }
 
-ComputationDataHandle LocalComputationBuilder::Pad(
-    const ComputationDataHandle& operand,
-    const ComputationDataHandle& padding_value,
-    const PaddingConfig& padding_config) {
-  return builder_.Pad(operand, padding_value, padding_config);
+LocalOp LocalComputationBuilder::Pad(const LocalOp& operand,
+                                     const LocalOp& padding_value,
+                                     const PaddingConfig& padding_config) {
+  return builder_.Pad(operand.op(), padding_value.op(), padding_config);
 }
 
-ComputationDataHandle LocalComputationBuilder::Reshape(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> dimensions,
+LocalOp LocalComputationBuilder::Reshape(
+    const LocalOp& operand, tensorflow::gtl::ArraySlice<int64> dimensions,
     tensorflow::gtl::ArraySlice<int64> new_sizes) {
-  return builder_.Reshape(operand, dimensions, new_sizes);
+  return builder_.Reshape(operand.op(), dimensions, new_sizes);
 }
 
-ComputationDataHandle LocalComputationBuilder::Collapse(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return builder_.Collapse(operand, dimensions);
+LocalOp LocalComputationBuilder::Collapse(
+    const LocalOp& operand, tensorflow::gtl::ArraySlice<int64> dimensions) {
+  return builder_.Collapse(operand.op(), dimensions);
 }
 
-ComputationDataHandle LocalComputationBuilder::CrossReplicaSum(
-    const ComputationDataHandle& operand) {
-  return builder_.CrossReplicaSum(operand);
+LocalOp LocalComputationBuilder::CrossReplicaSum(const LocalOp& operand) {
+  return builder_.CrossReplicaSum(operand.op());
 }
 
-ComputationDataHandle LocalComputationBuilder::Slice(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> start_indices,
+LocalOp LocalComputationBuilder::Slice(
+    const LocalOp& operand, tensorflow::gtl::ArraySlice<int64> start_indices,
     tensorflow::gtl::ArraySlice<int64> limit_indices,
     tensorflow::gtl::ArraySlice<int64> strides) {
-  return builder_.Slice(operand, start_indices, limit_indices, strides);
+  return builder_.Slice(operand.op(), start_indices, limit_indices, strides);
 }
 
-ComputationDataHandle LocalComputationBuilder::SliceInDim(
-    const ComputationDataHandle& operand, int64 start_index, int64 limit_index,
-    int64 stride, int64 dimno) {
-  return builder_.SliceInDim(operand, start_index, limit_index, stride, dimno);
+LocalOp LocalComputationBuilder::SliceInDim(const LocalOp& operand,
+                                            int64 start_index,
+                                            int64 limit_index, int64 stride,
+                                            int64 dimno) {
+  return builder_.SliceInDim(operand.op(), start_index, limit_index, stride,
+                             dimno);
 }
 
-ComputationDataHandle LocalComputationBuilder::DynamicSlice(
-    const ComputationDataHandle& operand,
-    const ComputationDataHandle& start_indices,
+LocalOp LocalComputationBuilder::DynamicSlice(
+    const LocalOp& operand, const LocalOp& start_indices,
     tensorflow::gtl::ArraySlice<int64> slice_sizes) {
-  return builder_.DynamicSlice(operand, start_indices, slice_sizes);
+  return builder_.DynamicSlice(operand.op(), start_indices.op(), slice_sizes);
 }
 
-ComputationDataHandle LocalComputationBuilder::DynamicUpdateSlice(
-    const ComputationDataHandle& operand, const ComputationDataHandle& update,
-    const ComputationDataHandle& start_indices) {
-  return builder_.DynamicUpdateSlice(operand, update, start_indices);
+LocalOp LocalComputationBuilder::DynamicUpdateSlice(
+    const LocalOp& operand, const LocalOp& update,
+    const LocalOp& start_indices) {
+  return builder_.DynamicUpdateSlice(operand.op(), update.op(),
+                                     start_indices.op());
 }
 
-ComputationDataHandle LocalComputationBuilder::ConcatInDim(
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-    int64 dimension) {
-  return builder_.ConcatInDim(operands, dimension);
+LocalOp LocalComputationBuilder::ConcatInDim(
+    tensorflow::gtl::ArraySlice<LocalOp> operands, int64 dimension) {
+  std::vector<XlaOp> xla_ops;
+  xla_ops.reserve(operands.size());
+  for (const auto& op : operands) {
+    xla_ops.push_back(op.op());
+  }
+  return builder_.ConcatInDim(xla_ops, dimension);
 }
 
-ComputationDataHandle
-LocalComputationBuilder::SelectAndScatterWithGeneralPadding(
-    const ComputationDataHandle& operand, const LocalComputation& select,
+LocalOp LocalComputationBuilder::SelectAndScatterWithGeneralPadding(
+    const LocalOp& operand, const LocalComputation& select,
     tensorflow::gtl::ArraySlice<int64> window_dimensions,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-    const ComputationDataHandle& source,
-    const ComputationDataHandle& init_value, const LocalComputation& scatter) {
+    const LocalOp& source, const LocalOp& init_value,
+    const LocalComputation& scatter) {
   return builder_.SelectAndScatterWithGeneralPadding(
-      operand, select.computation(), window_dimensions, window_strides, padding,
-      source, init_value, scatter.computation());
+      operand.op(), select.computation(), window_dimensions, window_strides,
+      padding, source.op(), init_value.op(), scatter.computation());
 }
 
-ComputationDataHandle LocalComputationBuilder::Tuple(
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> elements) {
-  return builder_.Tuple(elements);
+LocalOp LocalComputationBuilder::Tuple(
+    tensorflow::gtl::ArraySlice<LocalOp> elements) {
+  std::vector<XlaOp> xla_ops;
+  xla_ops.reserve(elements.size());
+  for (const auto& op : elements) {
+    xla_ops.push_back(op.op());
+  }
+
+  return builder_.Tuple(xla_ops);
 }
 
-ComputationDataHandle LocalComputationBuilder::GetTupleElement(
-    const ComputationDataHandle& tuple_data, int64 index) {
-  return builder_.GetTupleElement(tuple_data, index);
+LocalOp LocalComputationBuilder::GetTupleElement(const LocalOp& tuple_data,
+                                                 int64 index) {
+  return builder_.GetTupleElement(tuple_data.op(), index);
 }
 
-ComputationDataHandle LocalComputationBuilder::Dot(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) {
-  return builder_.Dot(lhs, rhs);
+LocalOp LocalComputationBuilder::Dot(const LocalOp& lhs, const LocalOp& rhs) {
+  return builder_.Dot(lhs.op(), rhs.op());
 }
 
-ComputationDataHandle LocalComputationBuilder::DotGeneral(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+LocalOp LocalComputationBuilder::DotGeneral(
+    const LocalOp& lhs, const LocalOp& rhs,
     const DotDimensionNumbers& dimension_numbers) {
-  return builder_.DotGeneral(lhs, rhs, dimension_numbers);
+  return builder_.DotGeneral(lhs.op(), rhs.op(), dimension_numbers);
 }
 
-ComputationDataHandle LocalComputationBuilder::ConvGeneralDilated(
-    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+LocalOp LocalComputationBuilder::ConvGeneralDilated(
+    const LocalOp& lhs, const LocalOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     tensorflow::gtl::ArraySlice<int64> lhs_dilation,
     tensorflow::gtl::ArraySlice<int64> rhs_dilation,
     const ConvolutionDimensionNumbers& dimension_numbers) {
-  return builder_.ConvGeneralDilated(lhs, rhs, window_strides, padding,
-                                     lhs_dilation, rhs_dilation,
+  return builder_.ConvGeneralDilated(lhs.op(), rhs.op(), window_strides,
+                                     padding, lhs_dilation, rhs_dilation,
                                      dimension_numbers);
 }
 
-ComputationDataHandle LocalComputationBuilder::ConvertElementType(
-    const ComputationDataHandle& operand, PrimitiveType new_element_type) {
-  return builder_.ConvertElementType(operand, new_element_type);
+LocalOp LocalComputationBuilder::ConvertElementType(
+    const LocalOp& operand, PrimitiveType new_element_type) {
+  return builder_.ConvertElementType(operand.op(), new_element_type);
 }
 
-ComputationDataHandle LocalComputationBuilder::Call(
+LocalOp LocalComputationBuilder::Call(
     const LocalComputation& local_computation,
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands) {
-  return builder_.Call(local_computation.computation(), operands);
+    tensorflow::gtl::ArraySlice<LocalOp> operands) {
+  std::vector<XlaOp> xla_ops;
+  xla_ops.reserve(operands.size());
+  for (const auto& op : operands) {
+    xla_ops.push_back(op.op());
+  }
+  return builder_.Call(local_computation.computation(), xla_ops);
 }
 
-ComputationDataHandle LocalComputationBuilder::Transpose(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> permutation) {
-  return builder_.Transpose(operand, permutation);
+LocalOp LocalComputationBuilder::Transpose(
+    const LocalOp& operand, tensorflow::gtl::ArraySlice<int64> permutation) {
+  return builder_.Transpose(operand.op(), permutation);
 }
 
-ComputationDataHandle LocalComputationBuilder::Rev(
-    const ComputationDataHandle& operand,
-    tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return builder_.Rev(operand, dimensions);
+LocalOp LocalComputationBuilder::Rev(
+    const LocalOp& operand, tensorflow::gtl::ArraySlice<int64> dimensions) {
+  return builder_.Rev(operand.op(), dimensions);
 }
 
-ComputationDataHandle LocalComputationBuilder::Map(
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
+LocalOp LocalComputationBuilder::Map(
+    tensorflow::gtl::ArraySlice<LocalOp> operands,
     const LocalComputation& local_computation,
     tensorflow::gtl::ArraySlice<int64> dimensions,
-    tensorflow::gtl::ArraySlice<ComputationDataHandle> static_operands) {
-  return builder_.Map(operands, local_computation.computation(), dimensions,
-                      static_operands);
+    tensorflow::gtl::ArraySlice<LocalOp> static_operands) {
+  std::vector<XlaOp> xla_ops;
+  xla_ops.reserve(operands.size());
+  for (const auto& op : operands) {
+    xla_ops.push_back(op.op());
+  }
+
+  std::vector<XlaOp> static_xla_ops;
+  static_xla_ops.reserve(static_operands.size());
+  for (const auto& op : static_operands) {
+    static_xla_ops.push_back(op.op());
+  }
+
+  return builder_.Map(xla_ops, local_computation.computation(), dimensions,
+                      static_xla_ops);
 }
 
-ComputationDataHandle LocalComputationBuilder::Reduce(
-    const ComputationDataHandle& operand,
-    const ComputationDataHandle& init_value,
+LocalOp LocalComputationBuilder::Reduce(
+    const LocalOp& operand, const LocalOp& init_value,
     const LocalComputation& local_computation,
     tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce) {
-  return builder_.Reduce(operand, init_value, local_computation.computation(),
-                         dimensions_to_reduce);
+  return builder_.Reduce(operand.op(), init_value.op(),
+                         local_computation.computation(), dimensions_to_reduce);
 }
 
-ComputationDataHandle LocalComputationBuilder::ReduceWindowWithGeneralPadding(
-    const ComputationDataHandle& operand,
-    const ComputationDataHandle& init_value,
+LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding(
+    const LocalOp& operand, const LocalOp& init_value,
     const LocalComputation& local_computation,
     tensorflow::gtl::ArraySlice<int64> window_dimensions,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
   return builder_.ReduceWindowWithGeneralPadding(
-      operand, init_value, local_computation.computation(), window_dimensions,
-      window_strides, padding);
+      operand.op(), init_value.op(), local_computation.computation(),
+      window_dimensions, window_strides, padding);
 }
 
-ComputationDataHandle LocalComputationBuilder::RngNormal(
-    const ComputationDataHandle& mu, const ComputationDataHandle& sigma,
-    const Shape& shape) {
-  return builder_.RngNormal(mu, sigma, shape);
+LocalOp LocalComputationBuilder::RngNormal(const LocalOp& mu,
+                                           const LocalOp& sigma,
+                                           const Shape& shape) {
+  return builder_.RngNormal(mu.op(), sigma.op(), shape);
 }
 
-ComputationDataHandle LocalComputationBuilder::RngUniform(
-    const ComputationDataHandle& a, const ComputationDataHandle& b,
-    const Shape& shape) {
-  return builder_.RngUniform(a, b, shape);
+LocalOp LocalComputationBuilder::RngUniform(const LocalOp& a, const LocalOp& b,
+                                            const Shape& shape) {
+  return builder_.RngUniform(a.op(), b.op(), shape);
 }
 
-ComputationDataHandle LocalComputationBuilder::While(
-    const LocalComputation& condition, const LocalComputation& body,
-    const ComputationDataHandle& init) {
-  return builder_.While(condition.computation(), body.computation(), init);
+LocalOp LocalComputationBuilder::While(const LocalComputation& condition,
+                                       const LocalComputation& body,
+                                       const LocalOp& init) {
+  return builder_.While(condition.computation(), body.computation(), init.op());
 }
 
-ComputationDataHandle LocalComputationBuilder::Conditional(
-    const ComputationDataHandle& predicate,
-    const ComputationDataHandle& true_operand,
-    const LocalComputation& true_computation,
-    const ComputationDataHandle& false_operand,
+LocalOp LocalComputationBuilder::Conditional(
+    const LocalOp& predicate, const LocalOp& true_operand,
+    const LocalComputation& true_computation, const LocalOp& false_operand,
     const LocalComputation& false_computation) {
-  return builder_.Conditional(predicate, true_operand,
-                              true_computation.computation(), false_operand,
-                              false_computation.computation());
+  return builder_.Conditional(
+      predicate.op(), true_operand.op(), true_computation.computation(),
+      false_operand.op(), false_computation.computation());
 }
 
-StatusOr<bool> LocalComputationBuilder::IsConstant(
-    const ComputationDataHandle& operand, int64 num_parameters) {
-  return builder_.IsConstant(operand, num_parameters);
+StatusOr<bool> LocalComputationBuilder::IsConstant(const LocalOp& operand) {
+  return builder_.IsConstant(operand.op());
 }
 
-StatusOr<std::unique_ptr<Literal>> LocalComputationBuilder::ComputeConstant(
-    const ComputationDataHandle& operand, const Layout* output_layout,
-    tensorflow::gtl::ArraySlice<Literal> parameters) {
-  return builder_.ComputeConstant(operand, output_layout, parameters);
+StatusOr<LocalComputation*> LocalComputationBuilder::BuildConstantSubGraph(
+    const LocalOp& operand) {
+  TF_ASSIGN_OR_RETURN(XlaComputation computation,
+                      builder_.BuildConstantSubGraph(operand.op()));
+  return new LocalComputation(std::move(computation));
 }
 
 #define _FORWARD(method_name, return_sig, args_sig, args)    \
@@ -534,23 +555,19 @@ StatusOr<std::unique_ptr<Literal>> LocalComputationBuilder::ComputeConstant(
     return builder_.method_name args;                        \
   }
 
-#define _FORWARD_UNOP(method_name)             \
-  _FORWARD(method_name, ComputationDataHandle, \
-           (const ComputationDataHandle& operand), (operand))
-
-#define _FORWARD_BINOP(method_name)                                        \
-  _FORWARD(                                                                \
-      method_name, ComputationDataHandle,                                  \
-      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
-       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions),           \
-      (lhs, rhs, broadcast_dimensions))
-
-#define _FORWARD_TRIOP(method_name)                                        \
-  _FORWARD(                                                                \
-      method_name, ComputationDataHandle,                                  \
-      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
-       const ComputationDataHandle& ehs),                                  \
-      (lhs, rhs, ehs))
+#define _FORWARD_UNOP(method_name) \
+  _FORWARD(method_name, LocalOp, (const LocalOp& operand), (operand.op()))
+
+#define _FORWARD_BINOP(method_name)                                   \
+  _FORWARD(method_name, LocalOp,                                      \
+           (const LocalOp& lhs, const LocalOp& rhs,                   \
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions), \
+           (lhs.op(), rhs.op(), broadcast_dimensions))
+
+#define _FORWARD_TRIOP(method_name)                                      \
+  _FORWARD(method_name, LocalOp,                                         \
+           (const LocalOp& lhs, const LocalOp& rhs, const LocalOp& ehs), \
+           (lhs.op(), rhs.op(), ehs.op()))
 
 _FORWARD_TRIOP(Select)
 _FORWARD_TRIOP(Clamp)
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 5ec097846a59fdc7dd51f04f011225495643930d..a06b85b4ea28c4f386598901138930eaaed12079 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -17,9 +17,10 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_
 
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -97,25 +98,37 @@ class CompiledLocalComputation {
   std::unique_ptr<LocalExecutable> executable_;
 };
 
-// Wraps a Computation produced by a LocalComputationBuilder. The
+// Wraps a XlaComputation produced by a LocalComputationBuilder. The
 // Compile method compiles the computation to a (local) executable via
 // the client library's local client. This class is intended to be
 // made available to Python via SWIG.
 class LocalComputation {
  public:
-  LocalComputation(Computation computation);
+  LocalComputation(XlaComputation computation);
 
   StatusOr<CompiledLocalComputation*> Compile(
       const std::vector<Shape>& argument_shapes,
       const ExecutableBuildOptions* build_options);
 
-  const Computation& computation() const;
+  const XlaComputation& computation() const;
 
   // Returns the return-value shape for this computation.
   StatusOr<Shape> GetReturnValueShape() const;
 
  private:
-  Computation computation_;
+  XlaComputation computation_;
+};
+
+// Wraps a XlaOp produced by a LocalComputationBuilder. This class is intended
+// to be made available to Python via SWIG.
+class LocalOp {
+ public:
+  LocalOp(const XlaOp& op);
+
+  const XlaOp& op() const;
+
+ private:
+  XlaOp op_;
 };
 
 // Wraps the ComputationBuilder API in order to:
@@ -135,166 +148,137 @@ class LocalComputationBuilder {
   // Returns an owned LocalComputation to the caller on success.
   StatusOr<LocalComputation*> Build();
 
-  ComputationDataHandle Parameter(int64 parameter_number, const Shape& shape,
-                                  const string& name);
+  LocalOp Parameter(int64 parameter_number, const Shape& shape,
+                    const string& name);
 
-  std::unique_ptr<Shape> GetShape(const ComputationDataHandle& operand);
+  std::unique_ptr<Shape> GetShape(const LocalOp& operand);
 
   // Returns the shape of the current return value for the computation.
   StatusOr<Shape> GetReturnValueShape();
 
-  ComputationDataHandle Infeed(const Shape& shape);
+  LocalOp Infeed(const Shape& shape);
 
-  void Outfeed(const ComputationDataHandle& operand, const Shape& shape,
+  void Outfeed(const LocalOp& operand, const Shape& shape,
                const string& outfeed_config);
 
-  ComputationDataHandle ConstantLiteral(const Literal& literal);
+  LocalOp ConstantLiteral(const Literal& literal);
 
-  ComputationDataHandle Broadcast(
-      const ComputationDataHandle& operand,
-      tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
+  LocalOp Broadcast(const LocalOp& operand,
+                    tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
 
-  ComputationDataHandle Pad(const ComputationDataHandle& operand,
-                            const ComputationDataHandle& padding_value,
-                            const PaddingConfig& padding_config);
+  LocalOp Pad(const LocalOp& operand, const LocalOp& padding_value,
+              const PaddingConfig& padding_config);
 
-  ComputationDataHandle Reshape(const ComputationDataHandle& operand,
-                                tensorflow::gtl::ArraySlice<int64> dimensions,
-                                tensorflow::gtl::ArraySlice<int64> new_sizes);
+  LocalOp Reshape(const LocalOp& operand,
+                  tensorflow::gtl::ArraySlice<int64> dimensions,
+                  tensorflow::gtl::ArraySlice<int64> new_sizes);
 
-  ComputationDataHandle Collapse(const ComputationDataHandle& operand,
-                                 tensorflow::gtl::ArraySlice<int64> dimensions);
+  LocalOp Collapse(const LocalOp& operand,
+                   tensorflow::gtl::ArraySlice<int64> dimensions);
 
-  ComputationDataHandle CrossReplicaSum(const ComputationDataHandle& operand);
+  LocalOp CrossReplicaSum(const LocalOp& operand);
 
-  ComputationDataHandle Slice(const ComputationDataHandle& operand,
-                              tensorflow::gtl::ArraySlice<int64> start_indices,
-                              tensorflow::gtl::ArraySlice<int64> limit_indices,
-                              tensorflow::gtl::ArraySlice<int64> strides);
+  LocalOp Slice(const LocalOp& operand,
+                tensorflow::gtl::ArraySlice<int64> start_indices,
+                tensorflow::gtl::ArraySlice<int64> limit_indices,
+                tensorflow::gtl::ArraySlice<int64> strides);
 
-  ComputationDataHandle SliceInDim(const ComputationDataHandle& operand,
-                                   int64 start_index, int64 limit_index,
-                                   int64 stride, int64 dimno);
+  LocalOp SliceInDim(const LocalOp& operand, int64 start_index,
+                     int64 limit_index, int64 stride, int64 dimno);
 
-  ComputationDataHandle DynamicSlice(
-      const ComputationDataHandle& operand,
-      const ComputationDataHandle& start_indices,
-      tensorflow::gtl::ArraySlice<int64> slice_sizes);
+  LocalOp DynamicSlice(const LocalOp& operand, const LocalOp& start_indices,
+                       tensorflow::gtl::ArraySlice<int64> slice_sizes);
 
-  ComputationDataHandle DynamicUpdateSlice(
-      const ComputationDataHandle& operand, const ComputationDataHandle& update,
-      const ComputationDataHandle& start_indices);
+  LocalOp DynamicUpdateSlice(const LocalOp& operand, const LocalOp& update,
+                             const LocalOp& start_indices);
 
-  ComputationDataHandle ConcatInDim(
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-      int64 dimension);
+  LocalOp ConcatInDim(tensorflow::gtl::ArraySlice<LocalOp> operands,
+                      int64 dimension);
 
-  ComputationDataHandle SelectAndScatterWithGeneralPadding(
-      const ComputationDataHandle& operand, const LocalComputation& select,
+  LocalOp SelectAndScatterWithGeneralPadding(
+      const LocalOp& operand, const LocalComputation& select,
       tensorflow::gtl::ArraySlice<int64> window_dimensions,
       tensorflow::gtl::ArraySlice<int64> window_strides,
       tensorflow::gtl::ArraySlice<std::pair<int64, int64> > padding,
-      const ComputationDataHandle& source,
-      const ComputationDataHandle& init_value, const LocalComputation& scatter);
+      const LocalOp& source, const LocalOp& init_value,
+      const LocalComputation& scatter);
 
-  ComputationDataHandle Tuple(
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> elements);
+  LocalOp Tuple(tensorflow::gtl::ArraySlice<LocalOp> elements);
 
-  ComputationDataHandle GetTupleElement(const ComputationDataHandle& tuple_data,
-                                        int64 index);
+  LocalOp GetTupleElement(const LocalOp& tuple_data, int64 index);
 
-  ComputationDataHandle Dot(const ComputationDataHandle& lhs,
-                            const ComputationDataHandle& rhs);
+  LocalOp Dot(const LocalOp& lhs, const LocalOp& rhs);
 
-  ComputationDataHandle DotGeneral(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      const DotDimensionNumbers& dimension_numbers);
+  LocalOp DotGeneral(const LocalOp& lhs, const LocalOp& rhs,
+                     const DotDimensionNumbers& dimension_numbers);
 
-  ComputationDataHandle ConvGeneralDilated(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+  LocalOp ConvGeneralDilated(
+      const LocalOp& lhs, const LocalOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
       tensorflow::gtl::ArraySlice<std::pair<int64, int64> > padding,
       tensorflow::gtl::ArraySlice<int64> lhs_dilation,
       tensorflow::gtl::ArraySlice<int64> rhs_dilation,
       const ConvolutionDimensionNumbers& dimension_numbers);
 
-  ComputationDataHandle ConvertElementType(const ComputationDataHandle& operand,
-                                           PrimitiveType new_element_type);
+  LocalOp ConvertElementType(const LocalOp& operand,
+                             PrimitiveType new_element_type);
 
-  ComputationDataHandle Call(
-      const LocalComputation& local_computation,
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands);
+  LocalOp Call(const LocalComputation& local_computation,
+               tensorflow::gtl::ArraySlice<LocalOp> operands);
 
-  ComputationDataHandle Transpose(
-      const ComputationDataHandle& operand,
-      tensorflow::gtl::ArraySlice<int64> permutation);
+  LocalOp Transpose(const LocalOp& operand,
+                    tensorflow::gtl::ArraySlice<int64> permutation);
 
-  ComputationDataHandle Rev(const ComputationDataHandle& operand,
-                            tensorflow::gtl::ArraySlice<int64> dimensions);
+  LocalOp Rev(const LocalOp& operand,
+              tensorflow::gtl::ArraySlice<int64> dimensions);
 
-  ComputationDataHandle Map(
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-      const LocalComputation& local_computation,
-      tensorflow::gtl::ArraySlice<int64> dimensions,
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> static_operands);
+  LocalOp Map(tensorflow::gtl::ArraySlice<LocalOp> operands,
+              const LocalComputation& local_computation,
+              tensorflow::gtl::ArraySlice<int64> dimensions,
+              tensorflow::gtl::ArraySlice<LocalOp> static_operands);
 
-  ComputationDataHandle Reduce(
-      const ComputationDataHandle& operand,
-      const ComputationDataHandle& init_value,
-      const LocalComputation& local_computation,
-      tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
+  LocalOp Reduce(const LocalOp& operand, const LocalOp& init_value,
+                 const LocalComputation& local_computation,
+                 tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
 
-  ComputationDataHandle ReduceWindowWithGeneralPadding(
-      const ComputationDataHandle& operand,
-      const ComputationDataHandle& init_value,
+  LocalOp ReduceWindowWithGeneralPadding(
+      const LocalOp& operand, const LocalOp& init_value,
       const LocalComputation& local_computation,
       tensorflow::gtl::ArraySlice<int64> window_dimensions,
       tensorflow::gtl::ArraySlice<int64> window_strides,
       tensorflow::gtl::ArraySlice<std::pair<int64, int64> > padding);
 
-  ComputationDataHandle RngNormal(const ComputationDataHandle& mu,
-                                  const ComputationDataHandle& sigma,
-                                  const Shape& shape);
+  LocalOp RngNormal(const LocalOp& mu, const LocalOp& sigma,
+                    const Shape& shape);
 
-  ComputationDataHandle RngUniform(const ComputationDataHandle& a,
-                                   const ComputationDataHandle& b,
-                                   const Shape& shape);
+  LocalOp RngUniform(const LocalOp& a, const LocalOp& b, const Shape& shape);
 
-  ComputationDataHandle While(const LocalComputation& condition,
-                              const LocalComputation& body,
-                              const ComputationDataHandle& init);
+  LocalOp While(const LocalComputation& condition, const LocalComputation& body,
+                const LocalOp& init);
 
-  ComputationDataHandle Conditional(const ComputationDataHandle& predicate,
-                                    const ComputationDataHandle& true_operand,
-                                    const LocalComputation& true_computation,
-                                    const ComputationDataHandle& false_operand,
-                                    const LocalComputation& false_computation);
+  LocalOp Conditional(const LocalOp& predicate, const LocalOp& true_operand,
+                      const LocalComputation& true_computation,
+                      const LocalOp& false_operand,
+                      const LocalComputation& false_computation);
 
-  StatusOr<bool> IsConstant(const ComputationDataHandle& operand,
-                            int64 num_parameters);
+  StatusOr<bool> IsConstant(const LocalOp& operand);
 
-  StatusOr<std::unique_ptr<Literal> > ComputeConstant(
-      const ComputationDataHandle& operand, const Layout* output_layout,
-      tensorflow::gtl::ArraySlice<Literal> parameters);
+  StatusOr<LocalComputation*> BuildConstantSubGraph(const LocalOp& operand);
 
 #define _FORWARD(method_name, return_sig, args_sig) \
   return_sig method_name args_sig;
 
-#define _FORWARD_UNOP(method_name)             \
-  _FORWARD(method_name, ComputationDataHandle, \
-           (const ComputationDataHandle& operand))
+#define _FORWARD_UNOP(method_name) \
+  _FORWARD(method_name, LocalOp, (const LocalOp& operand))
 
-#define _FORWARD_BINOP(method_name)                                        \
-  _FORWARD(                                                                \
-      method_name, ComputationDataHandle,                                  \
-      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
-       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions))
+#define _FORWARD_BINOP(method_name)                 \
+  _FORWARD(method_name, LocalOp,                    \
+           (const LocalOp& lhs, const LocalOp& rhs, \
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions))
 
-#define _FORWARD_TRIOP(method_name)                                        \
-  _FORWARD(                                                                \
-      method_name, ComputationDataHandle,                                  \
-      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
-       const ComputationDataHandle& ehs))
+#define _FORWARD_TRIOP(method_name) \
+  _FORWARD(method_name, LocalOp,    \
+           (const LocalOp& lhs, const LocalOp& rhs, const LocalOp& ehs))
 
   _FORWARD_TRIOP(Select)
   _FORWARD_TRIOP(Clamp)
@@ -338,7 +322,7 @@ class LocalComputationBuilder {
 #undef _FORWARD_TRIOP
 
  private:
-  ComputationBuilder builder_;
+  XlaBuilder builder_;
 };
 
 // Functions for freeing resources from the Python side.
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index b8cce5a5f7105ef59d6f02ac7c1995bb81df4d58..04c56bbba95fbf3248df6c49700ff563c8b253c0 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -22,9 +22,8 @@ limitations under the License.
 //
 //    C++                                  Python
 // -------------------------------------+---------------------------------------
-//  ComputationDataHandle              <-> int
 //  ArraySlice<int64>                  <-  sequence of int
-//  ArraySlice<ComputationDataHandle>  <-  sequence of int
+//  ArraySlice<LocalOp>                <-  sequence of LocalOp
 //  Literal                            <-> (nested tuple of) numpy ndarray
 //  std::vector<Literal>               <-  sequence of (nested tuple of) ndarray
 //  Shape                               -> pair holding (dtype, dimensions)
@@ -91,12 +90,9 @@ limitations under the License.
 // One central reason for the Python-side indirection is that the
 // Python-side objects produced by the typemaps in this file are
 // further packaged up by xla_client before being passed on. For
-// instance, xla_client wraps the long produced for a C++
-// ComputationDataHandle in a Python ComputationDataHandle proto,
-// rather than exposing a raw long outside of the client. Similarly,
-// the Python pair produced for a C++ Shape is further wrapped in a
-// Python class (xla_client.Shape) so as not to expose the raw pair
-// externally.
+// instance, the Python pair produced for a C++ Shape is further
+// wrapped in a Python class (xla_client.Shape) so as not to expose
+// the raw pair externally.
 //
 // Other SWIG object wrappers (e.g. of LocalComputation) are further
 // wrapped by xla_client in order to set up a custom destructor that
@@ -124,6 +120,7 @@ using namespace xla;
 using namespace xla::swig;
 
 namespace xla {
+
 namespace swig {
 
 bool GetIntAttr(PyObject* o, const char* field, int64* result) {
@@ -177,21 +174,6 @@ bool HandleStringAttribute(PyObject* o,
 tensorflow::ImportNumpy();
 %}
 
-// ComputationDataHandle
-
-%typemap(in) const ComputationDataHandle& (ComputationDataHandle temp) {
-  const int64 handle = numpy::PyIntOrPyLongToLong($input);
-  if (handle == -1 && PyErr_Occurred()) {
-    SWIG_fail;
-  }
-  temp.set_handle(handle);
-  $1 = &temp;
-}
-
-%typemap(out) ComputationDataHandle {
-  $result = numpy::LongToPyIntOrPyLong($1.handle());
-}
-
 %typemap(out) StatusOr<xla::swig::CompiledLocalComputation*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
@@ -301,33 +283,23 @@ tensorflow::ImportNumpy();
   $1 = temps;
 }
 
-// ComputationDataHandle
+// ArraySlice<LocalOp>
 
-%typemap(in) tensorflow::gtl::ArraySlice<ComputationDataHandle>
-    (std::vector<ComputationDataHandle> temps) {
+%typemap(in) tensorflow::gtl::ArraySlice<xla::swig::LocalOp>(
+      std::vector<LocalOp> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
     SWIG_fail;
   }
   const int size = PySequence_Size($input);
-  temps.resize(size);
   for (int i = 0; i < size; ++i) {
     PyObject* o = PySequence_GetItem($input, i);
-    PyObject* py_int = numpy::PyNumberToPyInt(o);
-    if (!py_int) {
-      PyErr_SetString(
-          PyExc_TypeError,
-          "Argument sequence element cannot be converted to int");
-      SWIG_fail;
-    }
-    const int64 handle = numpy::PyIntOrPyLongToLong(py_int);
-    if (handle == -1 && PyErr_Occurred()) {
-      Py_DECREF(py_int);
-      Py_DECREF(o);
+    LocalOp* op;
+    if ((SWIG_ConvertPtr(o, (void**)&op, $descriptor(xla::swig::LocalOp*),
+                         SWIG_POINTER_EXCEPTION)) == -1) {
       SWIG_fail;
     }
-    temps[i].set_handle(handle);
-    Py_DECREF(py_int);
+    temps.push_back(*op);
     Py_DECREF(o);
   }
   $1 = temps;
@@ -934,6 +906,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputation;
 %unignore xla::swig::LocalComputation::Compile;
 %unignore xla::swig::LocalComputation::GetReturnValueShape;
+%unignore xla::swig::LocalOp;
 %unignore xla::swig::LocalComputationBuilder;
 %unignore xla::swig::LocalComputationBuilder::LocalComputationBuilder;
 %unignore xla::swig::LocalComputationBuilder::Build;
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index dc6f5fe5fcc067c99ced01812f9f2388a00766d0..68648a3a176363de69a56ecb8070f82862874e94 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -340,13 +340,13 @@ StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o) {
   return result;
 }
 
-PyObject* PyObjectFromXlaLiteral(const Literal& literal) {
+PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal) {
   if (ShapeUtil::IsTuple(literal.shape())) {
     int num_elements = ShapeUtil::TupleElementCount(literal.shape());
     PyObject* tuple = PyTuple_New(num_elements);
     for (int i = 0; i < num_elements; i++) {
-      PyTuple_SET_ITEM(
-          tuple, i, PyObjectFromXlaLiteral(LiteralView::Create(literal, {i})));
+      PyTuple_SET_ITEM(tuple, i,
+                       PyObjectFromXlaLiteral(LiteralSlice(literal, {i})));
     }
     return tuple;
   } else {
@@ -431,7 +431,7 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
   return Status::OK();
 }
 
-void CopyLiteralToNumpyArray(int np_type, const Literal& literal,
+void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
                              PyArrayObject* py_array) {
   switch (np_type) {
     case NPY_BOOL:
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h
index 9656cb1c31c39dbe54293700c2765d0723255657..64f0aae0f9790f0199ac6cb931a5c9f6dc356f4c 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.h
+++ b/tensorflow/compiler/xla/python/numpy_bridge.h
@@ -74,7 +74,7 @@ StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o);
 // array data.
 //
 // The return value is a new reference.
-PyObject* PyObjectFromXlaLiteral(const Literal& literal);
+PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal);
 
 // Converts a Numpy ndarray or a nested Python tuple thereof to a
 // corresponding XLA literal.
@@ -90,7 +90,7 @@ StatusOr<std::unique_ptr<Literal> > XlaLiteralFromPyObject(PyObject* o);
 Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
                                Literal* literal);
 
-void CopyLiteralToNumpyArray(int np_type, const Literal& literal,
+void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
                              PyArrayObject* py_array);
 
 template <typename NativeT>
@@ -101,7 +101,8 @@ void CopyNumpyArrayToLiteral(PyArrayObject* py_array, Literal* literal) {
 }
 
 template <typename NativeT>
-void CopyLiteralToNumpyArray(const Literal& literal, PyArrayObject* py_array) {
+void CopyLiteralToNumpyArray(const LiteralSlice& literal,
+                             PyArrayObject* py_array) {
   NativeT* dest = static_cast<NativeT*>(PyArray_DATA(py_array));
   auto source = literal.data<NativeT>();
   std::copy(source.begin(), source.end(), dest);
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index f6809b6b871d7e246dd43811c7e8c08378d53989..1d5b75d1bee2dcee3e448d0bcb72103b539efac6 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -335,20 +335,6 @@ def _wrap_shape(shape_info):
     return Shape.array_shape(dtype, dims)
 
 
-def _wrap_data_handle(handle):
-  cdh = xla_data_pb2.ComputationDataHandle()
-  cdh.handle = handle
-  return cdh
-
-
-def _unwrap_data_handle(handle_proto):
-  return handle_proto.handle
-
-
-def _unwrap_data_handles(handle_protos):
-  return [_unwrap_data_handle(cdh) for cdh in handle_protos]
-
-
 def require_numpy_array_layout(value):
   if isinstance(value, tuple):
     return tuple(require_numpy_array_layout(x) for x in value)
@@ -535,9 +521,9 @@ class ComputationBuilder(object):
     queue for subsequent use in the computation.
 
     Returns:
-      A  ComputationDataHandle message.
+      A LocalOp.
     """
-    return _wrap_data_handle(self._client.Infeed(shape))
+    return self._client.Infeed(shape)
 
   def Outfeed(self, operand):
     """Enqueues an outfeed op onto the computation.
@@ -545,9 +531,7 @@ class ComputationBuilder(object):
     Outfeed operations enqueue data, using the given operand, onto the XLA
     outfeed queue for subsequent dequeue via the client API.
     """
-    self._client.Outfeed(
-        _unwrap_data_handle(operand), self.GetShape(operand),
-        ''.encode('utf-8'))
+    self._client.Outfeed(operand, self.GetShape(operand), ''.encode('utf-8'))
 
   def Constant(self, value):
     """Enqueues a constant op onto the computation.
@@ -557,10 +541,10 @@ class ComputationBuilder(object):
              to one of the supported types.
 
     Returns:
-      A ComputationDataHandle message.
+      A LocalOp.
     """
     value = require_numpy_array_layout(value)
-    return _wrap_data_handle(self._client.ConstantLiteral(value))
+    return self._client.ConstantLiteral(value)
 
   def ConstantF32Scalar(self, value):
     """Convenience method to enqueue a scalar F32 constant op.
@@ -569,7 +553,7 @@ class ComputationBuilder(object):
       value: a floating-point number.
 
     Returns:
-      A ComputationDataHandle message.
+      A LocalOp.
     """
     return self.Constant(np.array(value, dtype=np.float32))
 
@@ -580,7 +564,7 @@ class ComputationBuilder(object):
       value: a floating-point number.
 
     Returns:
-      A ComputationDataHandle message.
+      A LocalOp.
     """
     return self.Constant(np.array(value, dtype=np.float64))
 
@@ -591,7 +575,7 @@ class ComputationBuilder(object):
       value: a floating-point number.
 
     Returns:
-      A ComputationDataHandle message.
+      A LocalOp.
     """
     return self.Constant(np.array(value, dtype=np.int32))
 
@@ -602,7 +586,7 @@ class ComputationBuilder(object):
       value: a floating-point number.
 
     Returns:
-      A ComputationDataHandle message.
+      A LocalOp.
     """
     return self.Constant(np.array(value, dtype=np.int64))
 
@@ -613,7 +597,7 @@ class ComputationBuilder(object):
       value: a boolean value.
 
     Returns:
-      A ComputationDataHandle message.
+      A LocalOp.
     """
     return self.Constant(np.array(value, dtype=np.bool))
 
@@ -629,15 +613,14 @@ class ComputationBuilder(object):
         parameters, use it for *all* parameters to avoid clashes.
 
     Returns:
-      A ComputationDataHandle message.
+      A LocalOp.
     """
     if name is None:
       name = ''
     if parameter_num is None:
       parameter_num = next(self._parameter_numbering)
 
-    return _wrap_data_handle(
-        self._client.Parameter(parameter_num, shape, name.encode('utf8')))
+    return self._client.Parameter(parameter_num, shape, name.encode('utf8'))
 
   def ParameterFromNumpy(self, value, name=None, parameter_num=None):
     """Enqueues a Parameter op onto the computation.
@@ -649,7 +632,7 @@ class ComputationBuilder(object):
       parameter_num: as in ParameterWithShape.
 
     Returns:
-      A ComputationDataHandle message.
+      A LocalOp.
     """
     return self.ParameterWithShape(
         Shape.from_pyval(value), name=name, parameter_num=parameter_num)
@@ -658,14 +641,13 @@ class ComputationBuilder(object):
     """Enqueues a broadcast operation onto the computation.
 
     Args:
-      operand: the operand ComputationDataHandle to broadcast.
+      operand: the operand LocalOp to broadcast.
       sizes: an iterable of broadcast sizes.
 
     Returns:
-      A ComputationDataHandle representing the added broadcast op.
+      A LocalOp representing the added broadcast op.
     """
-    return _wrap_data_handle(
-        self._client.Broadcast(_unwrap_data_handle(operand), sizes))
+    return self._client.Broadcast(operand, sizes)
 
   def Concatenate(self, operands, dimension):
     """Enqueues a concatenate operation onto the computation.
@@ -675,10 +657,9 @@ class ComputationBuilder(object):
       dimension: the dimension in which to perform the concatenation.
 
     Returns:
-      A ComputationDataHandle representing the added concatenate op.
+      A LocalOp representing the added concatenate op.
     """
-    return _wrap_data_handle(
-        self._client.ConcatInDim(_unwrap_data_handles(operands), dimension))
+    return self._client.ConcatInDim(operands, dimension)
 
   def ConvertElementType(self, operand, new_element_type):
     """Enqueues an element type conversion operation onto the computation.
@@ -688,14 +669,12 @@ class ComputationBuilder(object):
       new_element_type: the target primitive type.
 
     Returns:
-      A ComputationDataHandle representing the added conversion op.
+      A LocalOp representing the added conversion op.
     """
-    return _wrap_data_handle(
-        self._client.ConvertElementType(
-            _unwrap_data_handle(operand), new_element_type))
+    return self._client.ConvertElementType(operand, new_element_type)
 
   def GetShape(self, operand):
-    return _wrap_shape(self._client.GetShape(_unwrap_data_handle(operand)))
+    return _wrap_shape(self._client.GetShape(operand))
 
   def GetReturnValueShape(self):
     return _wrap_shape(self._client.GetReturnValueShape())
@@ -707,40 +686,35 @@ class ComputationBuilder(object):
     """Enqueues a Pad operation onto the computation.
 
     Args:
-      operand: ComputationDataHandle representing the array to pad.
-      padding_value: ComputationDataHandle representing the scalar pad value.
+      operand: LocalOp representing the array to pad.
+      padding_value: LocalOp representing the scalar pad value.
       padding_config: either an xla_data_pb2.PaddingConfig or a list of integer
         triples (edge_padding_low, edge_padding_high, interior_padding)
         representing the configuration of the padding operation.
 
     Returns:
-      A ComputationDataHandle representing the added Pad op.
+      A LocalOp representing the added Pad op.
     """
     if not isinstance(padding_config, xla_data_pb2.PaddingConfig):
       padding_config = GetPaddingConfigFromTriples(padding_config)
-    return _wrap_data_handle(
-        self._client.Pad(_unwrap_data_handle(operand),
-                         _unwrap_data_handle(padding_value),
-                         padding_config))
+    return self._client.Pad(operand, padding_value, padding_config)
 
   def Reshape(self, operand, dimensions, new_sizes):
     """Enqueues a reshape op onto the computation.
 
     Args:
-      operand: ComputationDataHandle representing the array to be reshaped.
+      operand: LocalOp representing the array to be reshaped.
       dimensions: sequence of integers encoding the order in which dimensions
         are collapsed or None, in which case dimensions are flattened in order.
       new_sizes: sequence of integers encoding the new dimension sizes (shape).
 
     Returns:
-      A ComputationDataHandle representing the added Reshape op.
+      A LocalOp representing the added Reshape op.
     """
     if dimensions is None:
       ndim = len(self.GetShape(operand).dimensions())
       dimensions = tuple(range(ndim))
-    return _wrap_data_handle(
-        self._client.Reshape(
-            _unwrap_data_handle(operand), dimensions, new_sizes))
+    return self._client.Reshape(operand, dimensions, new_sizes)
 
   def CrossReplicaSum(self, operand):
     """CrossReplicaSum op.
@@ -749,67 +723,56 @@ class ComputationBuilder(object):
       operand: the operand to sum across replica instances.
 
     Returns:
-      A ComputationDataHandle that has the sum of the value among all replicas.
+      A LocalOp that has the sum of the value among all replicas.
     """
-    return _wrap_data_handle(
-        self._client.CrossReplicaSum(_unwrap_data_handle(operand)))
+    return self._client.CrossReplicaSum(operand)
 
   def Collapse(self, operand, dimensions):
     """Collapse op."""
-    return _wrap_data_handle(
-        self._client.Collapse(_unwrap_data_handle(operand), dimensions))
+    return self._client.Collapse(operand, dimensions)
 
   def Trans(self, operand):
     """Specialized matrix transpose op."""
-    return _wrap_data_handle(
-        self._client.Transpose(_unwrap_data_handle(operand), [1, 0]))
+    return self._client.Transpose(operand, [1, 0])
 
   def Transpose(self, operand, permutation):
     """Transpose op."""
-    return _wrap_data_handle(
-        self._client.Transpose(_unwrap_data_handle(operand), permutation))
+    return self._client.Transpose(operand, permutation)
 
   def Rev(self, operand, dimensions):
     """Rev op."""
-    return _wrap_data_handle(
-        self._client.Rev(_unwrap_data_handle(operand), dimensions))
+    return self._client.Rev(operand, dimensions)
 
   def Clamp(self, min, operand, max):  # pylint: disable=redefined-builtin
     """Clamp op."""
-    return _wrap_data_handle(
-        self._client.Clamp(_unwrap_data_handle(min),
-                           _unwrap_data_handle(operand),
-                           _unwrap_data_handle(max)))
+    return self._client.Clamp(min, operand, max)
 
   def SelectAndScatter(self, operand, select, window_dimensions, window_strides,
                        padding, source, init_value, scatter):
     """Select and scatter op, used by the gradient of ReduceWindow.
 
     Args:
-      operand: ComputationDataHandle for array of dimension N and type T over
+      operand: LocalOp for array of dimension N and type T over
         which the windows slide.
       select: Computation of type (T, T) -> Pred to apply to the elements of
         each window to indicate which element is selected.
       window_dimensions: sequence of N integers for dimensions of the window.
       window_strides: sequence of N integers for the strides of the window.
       padding: PaddingType representing either 'SAME' or 'VALID ' padding.
-      source: ComputationDataHandle for array of type T with values to scatter.
-      init_value: ComputationDataHandle of scalar type T for initial out value.
+      source: LocalOp for array of type T with values to scatter.
+      init_value: LocalOp of scalar type T for initial out value.
       scatter: Computation of type (T, T) -> T to apply to each scatter source
         element with its destination element.
 
     Returns:
-      A ComputationDataHandle representing the added SelectAndScatter op.
+      A LocalOp representing the added SelectAndScatter op.
     """
     pads = _convert_padding_type_to_pad_values(
         padding, self.GetShape(operand).dimensions(),
         window_dimensions, window_strides)
-    return _wrap_data_handle(
-        self._client.SelectAndScatterWithGeneralPadding(
-            _unwrap_data_handle(operand), select.c_local_computation,
-            window_dimensions, window_strides, pads,
-            _unwrap_data_handle(source), _unwrap_data_handle(init_value),
-            scatter.c_local_computation))
+    return self._client.SelectAndScatterWithGeneralPadding(
+        operand, select.c_local_computation, window_dimensions, window_strides,
+        pads, source, init_value, scatter.c_local_computation)
 
   def Select(self, pred, on_true, on_false):
     """Element-wise selection op.
@@ -817,17 +780,13 @@ class ComputationBuilder(object):
     Constructs an output array from elements of two input arrays, based on the
     values of a predicate array.
     """
-    return _wrap_data_handle(
-        self._client.Select(
-            _unwrap_data_handle(pred),
-            _unwrap_data_handle(on_true),
-            _unwrap_data_handle(on_false)))
+    return self._client.Select(pred, on_true, on_false)
 
   def Slice(self, operand, start_indices, limit_indices, strides=None):
     """Enqueues a slice operation onto the computation.
 
     Args:
-      operand: ComputationDataHandle for the N dimensional array to be sliced.
+      operand: LocalOp for the N dimensional array to be sliced.
       start_indices: iterable of N integers containing the starting indices of
         the slice for each dimension.
       limit_indices: iterable of N integers containing the ending indices
@@ -836,207 +795,177 @@ class ComputationBuilder(object):
         each dimension.
 
     Returns:
-      A ComputationDataHandle representing the added Slice op.
+      A LocalOp representing the added Slice op.
     """
     if strides is None:
       start_indices = list(start_indices)
       strides = [1] * len(start_indices)
-    return _wrap_data_handle(
-        self._client.Slice(
-            _unwrap_data_handle(operand), start_indices, limit_indices,
-            strides))
+    return self._client.Slice(operand, start_indices, limit_indices, strides)
 
   def SliceInDim(self, operand, start_index, limit_index, stride, dimno):
     """Enqueues a slice-in-dimension operation onto the computation.
 
     Args:
-      operand: ComputationDataHandle for the N dimensional array to be sliced.
+      operand: LocalOp for the N dimensional array to be sliced.
       start_index: an integer containing the start index of the slice.
       limit_index: an integer containing the end index of the slice.
       stride: an integer containing the stride size for the slice.
       dimno: an integer indicating the dimension along which to slice.
 
     Returns:
-      A ComputationDataHandle representing the added Slice op.
+      A LocalOp representing the added Slice op.
     """
-    return _wrap_data_handle(
-        self._client.SliceInDim(
-            _unwrap_data_handle(operand), start_index, limit_index, stride,
-            dimno))
+    return self._client.SliceInDim(operand, start_index, limit_index, stride,
+                                   dimno)
 
   def DynamicSlice(self, operand, start_indices, slice_sizes):
     """Enqueues a slice op with dynamic start indices onto the computation.
 
     Args:
-      operand: ComputationDataHandle for the N dimensional array to be sliced.
-      start_indices: ComputationDataHandle for the 1D array of N integers
+      operand: LocalOp for the N dimensional array to be sliced.
+      start_indices: LocalOp for the 1D array of N integers
         containing the starting indices of the slice.
       slice_sizes: iterable of N integers containing the slice sizes in each
         dimension.
 
     Returns:
-      A ComputationDataHandle representing the added DynamicSlice op.
+      A LocalOp representing the added DynamicSlice op.
     """
-    return _wrap_data_handle(
-        self._client.DynamicSlice(
-            _unwrap_data_handle(operand),
-            _unwrap_data_handle(start_indices),
-            slice_sizes))
+    return self._client.DynamicSlice(operand, start_indices, slice_sizes)
 
   def DynamicUpdateSlice(self, operand, update, start_indices):
     """Enqueues a dynamic update slice operation onto the computation.
 
     Args:
-      operand: ComputationDataHandle for the N dimensional array to be updated.
+      operand: LocalOp for the N dimensional array to be updated.
       update: N dimensional array comprising the slice update.
       start_indices: Rank-1 array of N integers comprising the starting indices
         of the slice along each dimension.
     Returns:
-      A ComputationDataHandle representing the added DynamicUpdateSlice op.
+      A LocalOp representing the added DynamicUpdateSlice op.
     """
-    return _wrap_data_handle(
-        self._client.DynamicUpdateSlice(
-            _unwrap_data_handle(operand),
-            _unwrap_data_handle(update),
-            _unwrap_data_handle(start_indices)))
+    return self._client.DynamicUpdateSlice(operand, update, start_indices)
 
   def Tuple(self, *ops):
     """Enqueues a tuple operation onto the computation.
 
     Args:
-      ops: a sequence of tuple operands (each a ComputationDataHandle).
+      ops: a sequence of tuple operands (each a LocalOp).
 
     Returns:
-      A ComputationDataHandle representing the added Tuple op.
+      A LocalOp representing the added Tuple op.
     """
-    return _wrap_data_handle(self._client.Tuple(_unwrap_data_handles(ops)))
+    return self._client.Tuple(ops)
 
   def GetTupleElement(self, tup, index):
     """Enqueues a 'get tuple element' operation onto the computation.
 
     Args:
-      tup: the tuple operand (a ComputationDataHandle).
+      tup: the tuple operand (a LocalOp).
       index: numeric index to select from the tuple.
 
     Returns:
-      A ComputationDataHandle representing the added GetTupleElement op.
+      A LocalOp representing the added GetTupleElement op.
     """
-    return _wrap_data_handle(
-        self._client.GetTupleElement(_unwrap_data_handle(tup), index))
+    return self._client.GetTupleElement(tup, index)
 
   def Call(self, computation_to_apply, operands):
     """Enqueues a call operation onto the computation.
 
     Args:
       computation_to_apply: a Computation object.
-      operands: an iterable of ComputationDataHandle. The number and types of
+      operands: an iterable of LocalOp. The number and types of
         operands must match the arity of computation_to_apply.
 
     Returns:
-      A ComputationDataHandle representing the added call op.
+      A LocalOp representing the added call op.
     """
-    return _wrap_data_handle(
-        self._client.Call(computation_to_apply.c_local_computation,
-                          _unwrap_data_handles(operands)))
+    return self._client.Call(computation_to_apply.c_local_computation, operands)
 
   def Map(self, operands, computation_to_apply, dimensions, static_operands=()):
     """Enqueues a map operation onto the computation.
 
     Args:
-      operands: an iterable of ComputationDataHandle.
+      operands: an iterable of LocalOp.
       computation_to_apply: a Computation object.
       dimensions: dimensions over which to apply map the function.
       static_operands: auxiliary arguments passed to the applied computation.
 
     Returns:
-      A ComputationDataHandle representing the added Map op.
+      A LocalOp representing the added Map op.
     """
-    return _wrap_data_handle(
-        self._client.Map(
-            _unwrap_data_handles(operands),
-            computation_to_apply.c_local_computation,
-            dimensions,
-            _unwrap_data_handles(static_operands)))
+    return self._client.Map(operands, computation_to_apply.c_local_computation,
+                            dimensions, static_operands)
 
   def Reduce(self, operand, init_value, computation_to_apply, dimensions):
     """Enqueues a reduction operation onto the computation.
 
     Args:
-      operand: reduction operand (ComputationDataHandle).
-      init_value: reduction initial value (ComputationDataHandle).
+      operand: reduction operand (LocalOp).
+      init_value: reduction initial value (LocalOp).
       computation_to_apply: a Computation object - binary reduction function.
       dimensions: sequence of dimensions (integers) to reduce on.
 
     Returns:
-      A ComputationDataHandle representing the added Reduce op.
+      A LocalOp representing the added Reduce op.
     """
-    return _wrap_data_handle(
-        self._client.Reduce(
-            _unwrap_data_handle(operand),
-            _unwrap_data_handle(init_value),
-            computation_to_apply.c_local_computation,
-            dimensions))
+    return self._client.Reduce(operand, init_value,
+                               computation_to_apply.c_local_computation,
+                               dimensions)
 
   def ReduceWindow(self, operand, init_value, computation_to_apply,
                    window_dimensions, window_strides, padding):
     """Enqueues a windowed reduction operation onto the computation.
 
     Args:
-      operand: reduction operand (ComputationDataHandle).
-      init_value: reduction initial value (ComputationDataHandle).
+      operand: reduction operand (LocalOp).
+      init_value: reduction initial value (LocalOp).
       computation_to_apply: a binary reduction function (Computation).
       window_dimensions: dimensions of window (sequence of integers).
       window_strides: strides for window (sequence of integers).
       padding: PaddingType representing either 'SAME' or 'VALID' padding.
 
     Returns:
-      A ComputationDataHandle representing the added ReduceWindow op.
+      A LocalOp representing the added ReduceWindow op.
     """
     pads = _convert_padding_type_to_pad_values(
         padding, self.GetShape(operand).dimensions(), window_dimensions,
         window_strides)
-    return _wrap_data_handle(
-        self._client.ReduceWindowWithGeneralPadding(
-            _unwrap_data_handle(operand),
-            _unwrap_data_handle(init_value),
-            computation_to_apply.c_local_computation,
-            window_dimensions, window_strides, pads))
+    return self._client.ReduceWindowWithGeneralPadding(
+        operand, init_value, computation_to_apply.c_local_computation,
+        window_dimensions, window_strides, pads)
 
   def RngNormal(self, mu, sigma, dims):
     """Enqueues an RngNormal operation onto the computation.
 
     Args:
-      mu: A ComputationDataHandle to an F32 scalar specifying the mean.
-      sigma: A ComputationDataHandle to an F32 scalar specifying the standard
+      mu: A LocalOp to an F32 scalar specifying the mean.
+      sigma: A LocalOp to an F32 scalar specifying the standard
         deviation.
       dims: A 1D array-like of nonnegative integers specifying the dimensions.
 
-    Returns: a ComputationDataHandle to the generated array of F32 values.
+    Returns: a LocalOp to the generated array of F32 values.
     """
     shape = Shape.array_shape(self.GetShape(mu).element_type(), dims)
-    return _wrap_data_handle(
-        self._client.RngNormal(
-            _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape))
+    return self._client.RngNormal(mu, sigma, shape)
 
   def RngUniform(self, a, b, dims):
     """Enqueues an RngUniform operation onto the computation.
 
     Args:
-      a: a ComputationDataHandle to an F32, S32, or U32 scalar (consistent with
+      a: a LocalOp to an F32, S32, or U32 scalar (consistent with
         the type of b) specifying the low end of the interval [a, b) over which
         values are generated.
-      b: a ComputationDataHandle to an F32, S32, or U32 scalar (consistent with
+      b: a LocalOp to an F32, S32, or U32 scalar (consistent with
         the type of a) specifying the high end of the interval [a, b) over which
         values are generated.
       dims: A 1D array-like of nonnegative integers specifying the dimensions.
 
-    Returns: a ComputationDataHandle to the generated array of values with the
+    Returns: a LocalOp to the generated array of values with the
       same numeric type (F32, S32, or U32) as the arguments a and b.
     """
     shape = Shape.array_shape(self.GetShape(a).element_type(), dims)
-    return _wrap_data_handle(
-        self._client.RngUniform(
-            _unwrap_data_handle(a), _unwrap_data_handle(b), shape))
+    return self._client.RngUniform(a, b, shape)
 
   def While(self, cond, body, init):
     """Enqueues a While operation onto the computation.
@@ -1044,112 +973,105 @@ class ComputationBuilder(object):
     Args:
       cond: a Computation for the loop condition, which has type T -> PRED
       body: a Computation for the loop body, which has type T -> T
-      init: a ComputationDataHandle for the initial parameter, which has type T
+      init: a LocalOp for the initial parameter, which has type T
 
-    Returns: a ComputationDataHandle representing the While operation.
+    Returns: a LocalOp representing the While operation.
     """
-    return _wrap_data_handle(
-        self._client.While(cond.c_local_computation,
-                           body.c_local_computation,
-                           _unwrap_data_handle(init)))
+    return self._client.While(cond.c_local_computation,
+                              body.c_local_computation, init)
 
   def Conditional(self, pred, true_operand, true_computation, false_operand,
                   false_computation):
     """Enqueues a Conditional operation onto the computation.
 
     Args:
-      predicate: a ComputationDataHandle to test, which has scalar type PRED
-      true_operand: a ComputationDataHandle of type T_0
+      predicate: a LocalOp to test, which has scalar type PRED
+      true_operand: a LocalOp of type T_0
       true_computation: a Computation to apply to true_operand, type T_0 -> S
       false_operand: a ComputationDatahandle of type T_1
       false_computation: a Computation to apply to false_operand, type T_1 -> S
 
-    Returns: a ComputationDataHandle representing the Conditional operation.
+    Returns: a LocalOp representing the Conditional operation.
     """
-    return _wrap_data_handle(
-        self._client.Conditional(
-            _unwrap_data_handle(pred), _unwrap_data_handle(true_operand),
-            true_computation.c_local_computation,
-            _unwrap_data_handle(false_operand),
-            false_computation.c_local_computation))
+    return self._client.Conditional(
+        pred, true_operand, true_computation.c_local_computation, false_operand,
+        false_computation.c_local_computation)
 
-  def IsConstant(self, operand, num_parameters=0):
-    """Enqueues an IsConstant operation onto the computation.
+  def IsConstant(self, operand):
+    """Checks whether the given operand is a compile-time constant.
 
     Args:
       operand: a ComputationDataHandle to test.
-      num_parameters: optional int, number of computation parameters to treat as
-        constant (default 0).
 
     Returns: bool indicating whether `operand` is a compile-time constant,
-      meaning its value does not depend on parameters with index greater than or
-      equal to `num_parameters`.
+      meaning its value does not depend on any parametersor, or on stateful
+      operators such as `RngNormal` or `Infeed`.
+    """
+    return self._client.IsConstant(operand)
+
+  def BuildConstantSubGraph(self, operand):
+    """Builds a constant sub graph.
+
+    Args:
+      operand: a LocalOp to test.
+    Returns: a LocalComputation that is rooted on the given `operand` which is a
+      compile-time constant.
     """
-    return self._client.IsConstant(_unwrap_data_handle(operand), num_parameters)
+    return self._client.BuildConstantSubGraph(operand)
 
   def Dot(self, lhs, rhs):
     """Enqueues a dot operation onto the computation.
 
     Args:
-      lhs: ComputationDataHandle for the rank 1 or rank 2 left-hand-side array.
-      rhs: ComputationDataHandle for the rank 1 or rank 2 right-hand-side array.
+      lhs: LocalOp for the rank 1 or rank 2 left-hand-side array.
+      rhs: LocalOp for the rank 1 or rank 2 right-hand-side array.
 
-    Returns: a ComputationDataHandle representing the Dot operation.
+    Returns: a LocalOp representing the Dot operation.
     """
-    return _wrap_data_handle(
-        self._client.Dot(_unwrap_data_handle(lhs), _unwrap_data_handle(rhs)))
+    return self._client.Dot(lhs, rhs)
 
   def DotGeneral(self, lhs, rhs, dimension_numbers):
     """Enqueues a general dot operation onto the computation.
 
     Args:
-      lhs: ComputationDataHandle for the left-hand-side array.
-      rhs: ComputationDataHandle for the right-hand-side array.
+      lhs: LocalOp for the left-hand-side array.
+      rhs: LocalOp for the right-hand-side array.
       dimension_numbers: either an xla_data_pb2.DotDimensionNumbers or a nested
         tuple ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)) of lists of
         integers representing the dimensions to treat as contracting dimensions
         and batch dimensions on each input operand.
 
-    Returns: a ComputationDataHandle representing the DotGeneral operation.
+    Returns: a LocalOp representing the DotGeneral operation.
     """
     if not isinstance(dimension_numbers, xla_data_pb2.DotDimensionNumbers):
       dimension_numbers = GetDotDimensionsFromLists(dimension_numbers)
-    return _wrap_data_handle(
-        self._client.DotGeneral(
-            _unwrap_data_handle(lhs), _unwrap_data_handle(rhs),
-            dimension_numbers))
+    return self._client.DotGeneral(lhs, rhs, dimension_numbers)
 
   def Conv(self, lhs, rhs, window_strides, padding):
     """Enqueues a Conv operation onto the computation.
 
     Args:
-      lhs: ComputationDataHandle for the rank N+2 array of inputs.
-      rhs: ComputationDataHandle for the rank N+2 array of kernel weights.
+      lhs: LocalOp for the rank N+2 array of inputs.
+      rhs: LocalOp for the rank N+2 array of kernel weights.
       window_strides: length-N array-like of integer kernel strides.
       padding: PaddingType representing either 'SAME' or 'VALID' padding.
 
-    Returns: a ComputationDataHandle representing the Conv operation.
+    Returns: a LocalOp representing the Conv operation.
     """
     pads = _convert_padding_type_to_pad_values(
         padding, self.GetShape(lhs).dimensions()[2:],
         self.GetShape(rhs).dimensions()[2:], window_strides)
     dimension_numbers = self._GetConvDimensionNumbers(len(window_strides))
-    return _wrap_data_handle(
-        self._client.ConvGeneralDilated(_unwrap_data_handle(lhs),
-                                        _unwrap_data_handle(rhs),
-                                        window_strides,
-                                        pads,
-                                        (),
-                                        (),
-                                        dimension_numbers))
+    return self._client.ConvGeneralDilated(lhs, rhs, window_strides, pads, (),
+                                           (), dimension_numbers)
 
   def ConvWithGeneralPadding(self, lhs, rhs, window_strides, padding,
                              lhs_dilation, rhs_dilation):
     """Enqueues a ConvWithGeneralPadding operation onto the computation.
 
     Args:
-      lhs: ComputationDataHandle for the rank N+2 array of inputs.
-      rhs: ComputationDataHandle for the rank N+2 array of kernel weights.
+      lhs: LocalOp for the rank N+2 array of inputs.
+      rhs: LocalOp for the rank N+2 array of kernel weights.
       window_strides: length-N array-like of kernel strides.
       padding: length-N array-like of pairs of integers of (low, high) padding.
       lhs_dilation: length-N array-like of dilation factors.
@@ -1159,14 +1081,9 @@ class ComputationBuilder(object):
       A ComputationdataHandle representing the added ConvWithGeneralPadding op.
     """
     dimension_numbers = self._GetConvDimensionNumbers(len(window_strides))
-    return _wrap_data_handle(
-        self._client.ConvGeneralDilated(_unwrap_data_handle(lhs),
-                                        _unwrap_data_handle(rhs),
-                                        window_strides,
-                                        padding,
-                                        lhs_dilation,
-                                        rhs_dilation,
-                                        dimension_numbers))
+    return self._client.ConvGeneralDilated(lhs, rhs, window_strides, padding,
+                                           lhs_dilation, rhs_dilation,
+                                           dimension_numbers)
 
   def _GetConvDimensionNumbers(self, num_spatial_dims):
     """Create ConvolutionDimensionNumbers proto for convolutions."""
@@ -1196,15 +1113,14 @@ def _forward_methods_to_local_builder():
     """Generate a forwarding method that wraps/unwraps data handles."""
 
     def forward(self, *args, **kwargs):
-      unwrapped_args = [_unwrap_data_handle(arg) for arg in args]
+      arg_list = list(args)
 
-      if is_binop and len(unwrapped_args) < 3:
-        unwrapped_args.append(kwargs.get('broadcast_dimensions', ()))
+      if is_binop and len(arg_list) < 3:
+        arg_list.append(kwargs.get('broadcast_dimensions', ()))
 
-      return _wrap_data_handle(
-          target_method(
-              self._client,  # pylint: disable=protected-access
-              *unwrapped_args))
+      return target_method(
+          self._client,  # pylint: disable=protected-access
+          *arg_list)
 
     return forward
 
diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h
index 28d6a8c3fe85fa4179bf2f41c82ad4eb93a045fe..2698ba7d79e246530b6b486d3e3bc8bf101c891e 100644
--- a/tensorflow/compiler/xla/reference_util.h
+++ b/tensorflow/compiler/xla/reference_util.h
@@ -330,13 +330,14 @@ class ReferenceUtil {
     return result;
   }
 
-  // Slices with modulo-wrapping.
+  // Slices with index clamping
   template <typename T>
-  static std::vector<T> ModSlice1D(const tensorflow::gtl::ArraySlice<T>& input,
-                                   int64 start, int64 size) {
+  static std::vector<T> ClampSlice1D(
+      const tensorflow::gtl::ArraySlice<T>& input, int64 start, int64 size) {
+    start = std::min<int64>(std::max<int64>(0, start), input.size() - size);
     std::vector<T> result;
     for (int64 i = 0; i < size; ++i) {
-      result.push_back(input[(start + i) % input.size()]);
+      result.push_back(input[(start + i)]);
     }
     return result;
   }
diff --git a/tensorflow/compiler/xla/rpc/grpc_client_test.cc b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
index 10997c0719dfb80efc7b855c7888500caeb1591b..313f11a9a957155eb277dc02ba5d2565c87e0235 100644
--- a/tensorflow/compiler/xla/rpc/grpc_client_test.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
@@ -101,8 +101,8 @@ TEST_F(GRPCClientTestBase, AxpyTenValues) {
   TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(auto result_literal, client_->ExecuteAndTransfer(
                                                    computation, {}, nullptr));
-  LiteralTestUtil::ExpectNear(*expected_literal, *result_literal,
-                              ErrorSpec(0.0001));
+  EXPECT_TRUE(LiteralTestUtil::Near(*expected_literal, *result_literal,
+                                    ErrorSpec(0.0001)));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
index ffb72fc73c5bc1ad6e648fb3d772eb5749700dc0..5f4dc6bd08f18b50e60b173432d3d305759bccea 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -27,8 +27,8 @@ namespace xla {
   return std::move(grpc_service);
 }
 
-::grpc::Status DelegateRPC(std::function<tensorflow::Status()> op) {
-  tensorflow::Status s = op();
+::grpc::Status DelegateRPC(std::function<Status()> op) {
+  Status s = op();
   return tensorflow::ToGrpcStatus(s);
 }
 
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.cc b/tensorflow/compiler/xla/rpc/grpc_stub.cc
index e1f2b0abe39b10dd82b700941748bc4f4e8cb2f8..620ac6cec4f76d938e57e87849066df59514938a 100644
--- a/tensorflow/compiler/xla/rpc/grpc_stub.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.cc
@@ -20,53 +20,49 @@ namespace xla {
 
 GRPCStub::~GRPCStub() = default;
 
-tensorflow::Status MakeRPC(
+Status MakeRPC(
     const std::function<::grpc::Status(::grpc::ClientContext*)>& rpc_method) {
   ::grpc::ClientContext context;
   ::grpc::Status s = rpc_method(&context);
   return tensorflow::FromGrpcStatus(s);
 }
 
-tensorflow::Status GRPCStub::TransferToClient(
-    const TransferToClientRequest* request,
-    TransferToClientResponse* response) {
+Status GRPCStub::TransferToClient(const TransferToClientRequest* request,
+                                  TransferToClientResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->TransferToClient(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::TransferToServer(
-    const TransferToServerRequest* request,
-    TransferToServerResponse* response) {
+Status GRPCStub::TransferToServer(const TransferToServerRequest* request,
+                                  TransferToServerResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->TransferToServer(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::TransferToInfeed(
-    const TransferToInfeedRequest* request,
-    TransferToInfeedResponse* response) {
+Status GRPCStub::TransferToInfeed(const TransferToInfeedRequest* request,
+                                  TransferToInfeedResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->TransferToInfeed(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::TransferFromOutfeed(
-    const TransferFromOutfeedRequest* request,
-    TransferFromOutfeedResponse* response) {
+Status GRPCStub::TransferFromOutfeed(const TransferFromOutfeedRequest* request,
+                                     TransferFromOutfeedResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->TransferFromOutfeed(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::ResetDevice(const ResetDeviceRequest* request,
-                                         ResetDeviceResponse* response) {
+Status GRPCStub::ResetDevice(const ResetDeviceRequest* request,
+                             ResetDeviceResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->ResetDevice(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::LoadComputationSnapshot(
+Status GRPCStub::LoadComputationSnapshot(
     const LoadComputationSnapshotRequest* request,
     LoadComputationSnapshotResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
@@ -74,28 +70,28 @@ tensorflow::Status GRPCStub::LoadComputationSnapshot(
   });
 }
 
-tensorflow::Status GRPCStub::Execute(const ExecuteRequest* request,
-                                     ExecuteResponse* response) {
+Status GRPCStub::Execute(const ExecuteRequest* request,
+                         ExecuteResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->Execute(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request,
-                                          ExecuteResponse* response) {
+Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request,
+                              ExecuteResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->ExecuteGraph(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::ExecuteParallel(
-    const ExecuteParallelRequest* request, ExecuteParallelResponse* response) {
+Status GRPCStub::ExecuteParallel(const ExecuteParallelRequest* request,
+                                 ExecuteParallelResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->ExecuteParallel(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::ExecuteGraphParallel(
+Status GRPCStub::ExecuteGraphParallel(
     const ExecuteGraphParallelRequest* request,
     ExecuteParallelResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
@@ -103,38 +99,35 @@ tensorflow::Status GRPCStub::ExecuteGraphParallel(
   });
 }
 
-tensorflow::Status GRPCStub::ExecuteAsync(const ExecuteAsyncRequest* request,
-                                          ExecuteAsyncResponse* response) {
+Status GRPCStub::ExecuteAsync(const ExecuteAsyncRequest* request,
+                              ExecuteAsyncResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->ExecuteAsync(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::WaitForExecution(
-    const WaitForExecutionRequest* request,
-    WaitForExecutionResponse* response) {
+Status GRPCStub::WaitForExecution(const WaitForExecutionRequest* request,
+                                  WaitForExecutionResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->WaitForExecution(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::DeconstructTuple(
-    const DeconstructTupleRequest* request,
-    DeconstructTupleResponse* response) {
+Status GRPCStub::DeconstructTuple(const DeconstructTupleRequest* request,
+                                  DeconstructTupleResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->DeconstructTuple(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::GetComputationStats(
-    const ComputationStatsRequest* request,
-    ComputationStatsResponse* response) {
+Status GRPCStub::GetComputationStats(const ComputationStatsRequest* request,
+                                     ComputationStatsResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->GetComputationStats(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::GetComputationGraphStats(
+Status GRPCStub::GetComputationGraphStats(
     const ComputationGraphStatsRequest* request,
     ComputationStatsResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
@@ -142,81 +135,77 @@ tensorflow::Status GRPCStub::GetComputationGraphStats(
   });
 }
 
-tensorflow::Status GRPCStub::GetComputationShape(
-    const GetComputationShapeRequest* request,
-    GetComputationShapeResponse* response) {
+Status GRPCStub::GetComputationShape(const GetComputationShapeRequest* request,
+                                     GetComputationShapeResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->GetComputationShape(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::GetShape(const GetShapeRequest* request,
-                                      GetShapeResponse* response) {
+Status GRPCStub::GetShape(const GetShapeRequest* request,
+                          GetShapeResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->GetShape(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::GetDeviceHandles(
-    const GetDeviceHandlesRequest* request,
-    GetDeviceHandlesResponse* response) {
+Status GRPCStub::GetDeviceHandles(const GetDeviceHandlesRequest* request,
+                                  GetDeviceHandlesResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->GetDeviceHandles(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::CreateChannelHandle(
-    const CreateChannelHandleRequest* request,
-    CreateChannelHandleResponse* response) {
+Status GRPCStub::CreateChannelHandle(const CreateChannelHandleRequest* request,
+                                     CreateChannelHandleResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->CreateChannelHandle(context, *request, response);
   });
 }
 
 // Methods used by ComputationBuilder.
-tensorflow::Status GRPCStub::Computation(const ComputationRequest* request,
-                                         ComputationResponse* response) {
+Status GRPCStub::Computation(const ComputationRequest* request,
+                             ComputationResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->Computation(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::Op(const OpRequest* request,
-                                OpResponse* response) {
+Status GRPCStub::Op(const OpRequest* request, OpResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->CreateOp(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::GetLocalShape(const GetLocalShapeRequest* request,
-                                           GetLocalShapeResponse* response) {
+Status GRPCStub::GetLocalShape(const GetLocalShapeRequest* request,
+                               GetLocalShapeResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->GetLocalShape(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::SetReturnValue(
-    const SetReturnValueRequest* request, SetReturnValueResponse* responses) {
+Status GRPCStub::SetReturnValue(const SetReturnValueRequest* request,
+                                SetReturnValueResponse* responses) {
   return MakeRPC([this, request, responses](::grpc::ClientContext* context) {
     return grpc_stub_->SetReturnValue(context, *request, responses);
   });
 }
 
-tensorflow::Status GRPCStub::IsConstant(const IsConstantRequest* request,
-                                        IsConstantResponse* response) {
+Status GRPCStub::IsConstant(const IsConstantRequest* request,
+                            IsConstantResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->IsConstant(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::ComputeConstant(
-    const ComputeConstantRequest* request, ComputeConstantResponse* response) {
+Status GRPCStub::ComputeConstant(const ComputeConstantRequest* request,
+                                 ComputeConstantResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->ComputeConstant(context, *request, response);
   });
 }
 
-tensorflow::Status GRPCStub::ComputeConstantGraph(
+Status GRPCStub::ComputeConstantGraph(
     const ComputeConstantGraphRequest* request,
     ComputeConstantResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
@@ -225,17 +214,16 @@ tensorflow::Status GRPCStub::ComputeConstantGraph(
 }
 
 // Methods used by Computation.
-tensorflow::Status GRPCStub::SnapshotComputation(
-    const SnapshotComputationRequest* request,
-    SnapshotComputationResponse* response) {
+Status GRPCStub::SnapshotComputation(const SnapshotComputationRequest* request,
+                                     SnapshotComputationResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->SnapshotComputation(context, *request, response);
   });
 }
 
 // Methods used by GlobalData.
-tensorflow::Status GRPCStub::Unregister(const UnregisterRequest* request,
-                                        UnregisterResponse* response) {
+Status GRPCStub::Unregister(const UnregisterRequest* request,
+                            UnregisterResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
     return grpc_stub_->Unregister(context, *request, response);
   });
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.h b/tensorflow/compiler/xla/rpc/grpc_stub.h
index fd9810d4f1a5e084b73e83007ea7f9f8b0462c72..5906d45769b5749b0c590dbc0e1972077dc3e7ba 100644
--- a/tensorflow/compiler/xla/rpc/grpc_stub.h
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.h
@@ -28,105 +28,90 @@ class GRPCStub : public ServiceInterface {
   explicit GRPCStub(grpc::XlaService::Stub* stub) : grpc_stub_(stub) {}
   ~GRPCStub() override;
 
-  tensorflow::Status TransferToClient(
-      const TransferToClientRequest* arg,
-      TransferToClientResponse* result) override;
+  Status TransferToClient(const TransferToClientRequest* arg,
+                          TransferToClientResponse* result) override;
 
-  tensorflow::Status TransferToServer(
-      const TransferToServerRequest* arg,
-      TransferToServerResponse* result) override;
+  Status TransferToServer(const TransferToServerRequest* arg,
+                          TransferToServerResponse* result) override;
 
-  tensorflow::Status TransferToInfeed(
-      const TransferToInfeedRequest* arg,
-      TransferToInfeedResponse* result) override;
+  Status TransferToInfeed(const TransferToInfeedRequest* arg,
+                          TransferToInfeedResponse* result) override;
 
-  tensorflow::Status TransferFromOutfeed(
-      const TransferFromOutfeedRequest* arg,
-      TransferFromOutfeedResponse* result) override;
+  Status TransferFromOutfeed(const TransferFromOutfeedRequest* arg,
+                             TransferFromOutfeedResponse* result) override;
 
-  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
-                                 ResetDeviceResponse* result) override;
+  Status ResetDevice(const ResetDeviceRequest* arg,
+                     ResetDeviceResponse* result) override;
 
-  tensorflow::Status LoadComputationSnapshot(
+  Status LoadComputationSnapshot(
       const LoadComputationSnapshotRequest* request,
       LoadComputationSnapshotResponse* result) override;
 
-  tensorflow::Status Execute(const ExecuteRequest* arg,
-                             ExecuteResponse* result) override;
+  Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override;
 
-  tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* request,
-                                  ExecuteResponse* response) override;
+  Status ExecuteGraph(const ExecuteGraphRequest* request,
+                      ExecuteResponse* response) override;
 
-  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
-                                     ExecuteParallelResponse* result) override;
+  Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                         ExecuteParallelResponse* result) override;
 
-  tensorflow::Status ExecuteGraphParallel(
-      const ExecuteGraphParallelRequest* request,
-      ExecuteParallelResponse* response) override;
+  Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* request,
+                              ExecuteParallelResponse* response) override;
 
-  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
-                                  ExecuteAsyncResponse* result) override;
+  Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                      ExecuteAsyncResponse* result) override;
 
-  tensorflow::Status WaitForExecution(
-      const WaitForExecutionRequest* arg,
-      WaitForExecutionResponse* result) override;
+  Status WaitForExecution(const WaitForExecutionRequest* arg,
+                          WaitForExecutionResponse* result) override;
 
-  tensorflow::Status DeconstructTuple(
-      const DeconstructTupleRequest* arg,
-      DeconstructTupleResponse* result) override;
+  Status DeconstructTuple(const DeconstructTupleRequest* arg,
+                          DeconstructTupleResponse* result) override;
 
-  tensorflow::Status GetComputationStats(
-      const ComputationStatsRequest* arg,
-      ComputationStatsResponse* result) override;
+  Status GetComputationStats(const ComputationStatsRequest* arg,
+                             ComputationStatsResponse* result) override;
 
-  tensorflow::Status GetComputationGraphStats(
-      const ComputationGraphStatsRequest* request,
-      ComputationStatsResponse* response) override;
+  Status GetComputationGraphStats(const ComputationGraphStatsRequest* request,
+                                  ComputationStatsResponse* response) override;
 
-  tensorflow::Status GetComputationShape(
-      const GetComputationShapeRequest* arg,
-      GetComputationShapeResponse* result) override;
+  Status GetComputationShape(const GetComputationShapeRequest* arg,
+                             GetComputationShapeResponse* result) override;
 
-  tensorflow::Status GetShape(const GetShapeRequest* arg,
-                              GetShapeResponse* result) override;
+  Status GetShape(const GetShapeRequest* arg,
+                  GetShapeResponse* result) override;
 
-  tensorflow::Status GetDeviceHandles(
-      const GetDeviceHandlesRequest* arg,
-      GetDeviceHandlesResponse* result) override;
+  Status GetDeviceHandles(const GetDeviceHandlesRequest* arg,
+                          GetDeviceHandlesResponse* result) override;
 
-  tensorflow::Status CreateChannelHandle(
-      const CreateChannelHandleRequest* arg,
-      CreateChannelHandleResponse* result) override;
+  Status CreateChannelHandle(const CreateChannelHandleRequest* arg,
+                             CreateChannelHandleResponse* result) override;
 
   // Methods used by ComputationBuilder.
-  tensorflow::Status Computation(const ComputationRequest* arg,
-                                 ComputationResponse* result) override;
+  Status Computation(const ComputationRequest* arg,
+                     ComputationResponse* result) override;
 
-  tensorflow::Status Op(const OpRequest* arg, OpResponse* result) override;
-  tensorflow::Status GetLocalShape(const GetLocalShapeRequest* arg,
-                                   GetLocalShapeResponse* result) override;
+  Status Op(const OpRequest* arg, OpResponse* result) override;
+  Status GetLocalShape(const GetLocalShapeRequest* arg,
+                       GetLocalShapeResponse* result) override;
 
-  tensorflow::Status SetReturnValue(const SetReturnValueRequest* arg,
-                                    SetReturnValueResponse* results) override;
+  Status SetReturnValue(const SetReturnValueRequest* arg,
+                        SetReturnValueResponse* results) override;
 
-  tensorflow::Status IsConstant(const IsConstantRequest* arg,
-                                IsConstantResponse* result) override;
+  Status IsConstant(const IsConstantRequest* arg,
+                    IsConstantResponse* result) override;
 
-  tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg,
-                                     ComputeConstantResponse* result) override;
+  Status ComputeConstant(const ComputeConstantRequest* arg,
+                         ComputeConstantResponse* result) override;
 
-  tensorflow::Status ComputeConstantGraph(
-      const ComputeConstantGraphRequest* arg,
-      ComputeConstantResponse* result) override;
+  Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
+                              ComputeConstantResponse* result) override;
 
   // Methods used by Computation.
-  tensorflow::Status SnapshotComputation(
-      const SnapshotComputationRequest* ag,
-      SnapshotComputationResponse* result) override;
+  Status SnapshotComputation(const SnapshotComputationRequest* ag,
+                             SnapshotComputationResponse* result) override;
 
   // Methods used by GlobalData.
-  tensorflow::Status Unregister(const UnregisterRequest* arg,
-                                UnregisterResponse* result) override;
+  Status Unregister(const UnregisterRequest* arg,
+                    UnregisterResponse* result) override;
 
   grpc::XlaService::Stub* service() { return grpc_stub_; }
 
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 9c362d8cad4642e534430becd9374351d51bf297..d1722644c72646538dab77899b79d25056f2f2bf 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -12,6 +12,7 @@ package_group(
     ],
 )
 
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
@@ -26,6 +27,7 @@ xla_proto_library(
 xla_proto_library(
     name = "hlo_proto",
     srcs = ["hlo.proto"],
+    visibility = ["//visibility:public"],
     deps = ["//tensorflow/compiler/xla:xla_data_proto"],
 )
 
@@ -200,7 +202,22 @@ tf_cc_test(
 
 cc_library(
     name = "hlo_evaluator",
-    srcs = ["hlo_evaluator.cc"],
+    srcs = [
+        "hlo_evaluator.cc",
+        "hlo_evaluator_typed_visitor.h",
+        "hlo_evaluator_typed_visitor_bfloat16.cc",
+        "hlo_evaluator_typed_visitor_bool.cc",
+        "hlo_evaluator_typed_visitor_complex64.cc",
+        "hlo_evaluator_typed_visitor_double.cc",
+        "hlo_evaluator_typed_visitor_float.cc",
+        "hlo_evaluator_typed_visitor_half.cc",
+        "hlo_evaluator_typed_visitor_int32.cc",
+        "hlo_evaluator_typed_visitor_int64.cc",
+        "hlo_evaluator_typed_visitor_int8.cc",
+        "hlo_evaluator_typed_visitor_uint32.cc",
+        "hlo_evaluator_typed_visitor_uint64.cc",
+        "hlo_evaluator_typed_visitor_uint8.cc",
+    ],
     hdrs = ["hlo_evaluator.h"],
     deps = [
         ":hlo",
@@ -370,6 +387,7 @@ tf_cc_test(
         ":hlo_matchers",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -743,6 +761,23 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "shaped_buffer_test",
+    srcs = ["shaped_buffer_test.cc"],
+    deps = [
+        ":cpu_plugin",
+        ":device_memory_allocator",
+        ":platform_util",
+        ":shaped_buffer",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:ptr_util",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "executable",
     srcs = ["executable.cc"],
@@ -837,7 +872,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor_no_cuda",
     ],
 )
 
@@ -918,33 +952,6 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "liveness_util",
-    srcs = ["liveness_util.cc"],
-    hdrs = ["liveness_util.h"],
-    deps = [
-        ":hlo",
-        ":hlo_dataflow_analysis",
-        ":logical_buffer",
-        ":tuple_points_to_analysis",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-    ],
-)
-
-tf_cc_test(
-    name = "liveness_util_test",
-    srcs = ["liveness_util_test.cc"],
-    deps = [
-        ":hlo",
-        ":liveness_util",
-        ":tuple_points_to_analysis",
-        "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-    ],
-)
-
 cc_library(
     name = "buffer_liveness",
     srcs = [
@@ -956,7 +963,6 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_ordering",
-        ":liveness_util",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:shape_util",
@@ -993,6 +999,7 @@ cc_library(
     ],
     deps = [
         ":buffer_liveness",
+        ":buffer_value_containers",
         ":heap_simulator",
         ":hlo",
         ":hlo_proto",
@@ -1048,7 +1055,6 @@ cc_library(
         ":hlo_dataflow_analysis",
         ":hlo_proto",
         ":hlo_value",
-        ":liveness_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -1081,11 +1087,11 @@ cc_library(
     srcs = ["heap_simulator.cc"],
     hdrs = ["heap_simulator.h"],
     deps = [
+        ":buffer_value",
+        ":buffer_value_containers",
         ":hlo",
         ":hlo_ordering",
         ":hlo_proto",
-        ":liveness_util",
-        ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
@@ -1101,7 +1107,7 @@ tf_cc_test(
         ":heap_simulator",
         ":hlo",
         ":hlo_ordering",
-        ":logical_buffer",
+        ":hlo_value",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -1251,13 +1257,11 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_pass",
-        ":hlo_query",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
     ],
@@ -1342,6 +1346,42 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "batch_dot_simplification",
+    srcs = ["batch_dot_simplification.cc"],
+    hdrs = ["batch_dot_simplification.h"],
+    deps = [
+        ":hlo",
+        ":hlo_creation_utils",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "batch_dot_simplification_test",
+    srcs = ["batch_dot_simplification_test.cc"],
+    deps = [
+        ":batch_dot_simplification",
+        ":hlo",
+        ":hlo_matchers",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:window_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "gather_expander_test",
     srcs = ["gather_expander_test.cc"],
@@ -1704,6 +1744,8 @@ tf_cc_test(
         ":hlo_execution_profile",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:lib",
     ],
 )
 
@@ -1768,6 +1810,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "buffer_value_containers",
+    hdrs = ["buffer_value_containers.h"],
+    deps = [
+        ":buffer_value",
+        ":logical_buffer",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 cc_library(
     name = "logical_buffer",
     srcs = ["logical_buffer.cc"],
@@ -1842,6 +1895,44 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "hlo_liveness_analysis",
+    srcs = ["hlo_liveness_analysis.cc"],
+    hdrs = ["hlo_liveness_analysis.h"],
+    deps = [
+        ":call_graph",
+        ":hlo",
+        ":hlo_value",
+        "//tensorflow/compiler/xla:shape_tree",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "hlo_liveness_analysis_test",
+    srcs = ["hlo_liveness_analysis_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_liveness_analysis",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "hlo_buffer",
     srcs = ["hlo_buffer.cc"],
@@ -1978,10 +2069,12 @@ cc_library(
     deps = [
         ":computation_layout",
         ":hlo",
+        ":hlo_dce",
         ":hlo_graph_dumper",
         ":hlo_pass",
         ":logical_buffer",
         ":tuple_points_to_analysis",
+        ":tuple_simplifier",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -2005,7 +2098,6 @@ cc_library(
         ":hlo_graph_dumper",
         ":hlo_ordering",
         ":hlo_pass",
-        ":liveness_util",
         ":logical_buffer",
         ":tuple_simplifier",
         "//tensorflow/compiler/xla:status_macros",
@@ -2052,6 +2144,24 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "hlo_module_dce",
+    srcs = ["hlo_module_dce.cc"],
+    hdrs = ["hlo_module_dce.h"],
+    deps = [
+        ":hlo",
+        ":hlo_dce",
+        ":hlo_liveness_analysis",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "hlo_verifier",
     srcs = ["hlo_verifier.cc"],
@@ -2094,7 +2204,6 @@ cc_library(
         ":hlo_dce",
         ":hlo_ordering",
         ":hlo_scheduling",
-        ":liveness_util",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:shape_util",
@@ -2142,6 +2251,27 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "hlo_module_dce_test",
+    srcs = ["hlo_module_dce_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_module_dce",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "layout_assignment_test",
     srcs = ["layout_assignment_test.cc"],
@@ -2233,6 +2363,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
     ],
 )
@@ -2299,8 +2430,14 @@ tf_cc_test(
 
 cc_library(
     name = "device_memory_allocator",
-    srcs = ["device_memory_allocator.cc"],
-    hdrs = ["device_memory_allocator.h"],
+    srcs = [
+        "device_memory_allocator.cc",
+        "owning_device_memory.cc",
+    ],
+    hdrs = [
+        "device_memory_allocator.h",
+        "owning_device_memory.h",
+    ],
     deps = [
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -2335,6 +2472,24 @@ cc_library(
     ],
 )
 
+xla_test(
+    name = "elemental_ir_emitter_test",
+    srcs = ["elemental_ir_emitter_test.cc"],
+    backends = [
+        "cpu",
+        "gpu",
+    ],
+    deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+    ],
+)
+
 cc_library(
     name = "hlo_module_config",
     srcs = ["hlo_module_config.cc"],
@@ -2458,6 +2613,7 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
     ],
 )
@@ -2467,6 +2623,7 @@ tf_cc_test(
     srcs = ["transpose_folding_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_matchers",
         ":shape_inference",
         ":transpose_folding",
         "//tensorflow/compiler/xla:literal_util",
@@ -2478,6 +2635,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
     ],
 )
@@ -2510,7 +2668,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -2620,7 +2777,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
-        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -2762,3 +2918,30 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
+
+cc_library(
+    name = "indexed_array_analysis",
+    srcs = ["indexed_array_analysis.cc"],
+    hdrs = ["indexed_array_analysis.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:ptr_util",
+    ],
+)
+
+tf_cc_test(
+    name = "indexed_array_analysis_test",
+    srcs = ["indexed_array_analysis_test.cc"],
+    deps = [
+        ":hlo_matchers",
+        ":indexed_array_analysis",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 8e785de68cb1fbe4ce9fd58a661bdc208725483b..f732ed8f398c4699bd5247dc7fa1e9677340dcae 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -92,26 +92,6 @@ bool ReshapeIsBitcast(
          valid_bitcast_callback(operand->shape(), reshape->shape());
 }
 
-// Adds a scalar computation to the module to enable optimizations with dot
-// converting into reduction.
-HloComputation* CreateScalarBinaryComputation(HloModule* module,
-                                              PrimitiveType primitive_type,
-                                              HloOpcode opcode) {
-  HloComputation::Builder b("scalar_computation");
-  auto scalar_lhs = b.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {}), "scalar_lhs"));
-  auto scalar_rhs = b.AddInstruction(HloInstruction::CreateParameter(
-      1, ShapeUtil::MakeShape(F32, {}), "scalar_rhs"));
-  auto scalar_op = b.AddInstruction(
-      HloInstruction::CreateBinary(ShapeUtil::MakeShape(primitive_type, {}),
-                                   opcode, scalar_lhs, scalar_rhs));
-  HloComputation* scalar_computation =
-      module->AddEmbeddedComputation(b.Build(scalar_op));
-  return scalar_computation;
-}
-
-}  // namespace
-
 // AlgebraicSimplifierVisitor traverses the HLO computation and reduces certain
 // algebraic expressions to simplified forms. Note: This only supports
 // simplifications that simply look at the operands of an instruction. For the
@@ -220,8 +200,7 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) {
     HloInstruction* zero = computation_->AddInstruction(
         HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    HloComputation* AddReduce_computation = CreateScalarBinaryComputation(
-        computation_->parent(), F32, HloOpcode::kAdd);
+    HloComputation* AddReduce_computation = GetOrCreateScalarAddComputation();
     Shape shape = ShapeUtil::DeleteDimension(dim, hlo->shape());
     return computation_->AddInstruction(HloInstruction::CreateReduce(
         shape, hlo, zero, {dim}, AddReduce_computation));
@@ -291,6 +270,26 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
       const Shape& dot_shape, HloInstruction* lhs, int64 lhs_contracting_dim,
       HloInstruction* rhs, int64 rhs_contracting_dim, bool swapped);
 
+  StatusOr<HloInstruction*> OptimizeDotOfGather(HloInstruction* dot);
+
+  HloComputation* GetOrCreateScalarAddComputation() {
+    if (scalar_add_computation_) {
+      return scalar_add_computation_;
+    }
+
+    HloComputation::Builder b("scalar_add_computation");
+    Shape shape = ShapeUtil::MakeShape(F32, {});
+    auto scalar_lhs = b.AddInstruction(
+        HloInstruction::CreateParameter(0, shape, "scalar_lhs"));
+    auto scalar_rhs = b.AddInstruction(
+        HloInstruction::CreateParameter(1, shape, "scalar_rhs"));
+    auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary(
+        shape, HloOpcode::kAdd, scalar_lhs, scalar_rhs));
+    scalar_add_computation_ =
+        computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
+    return scalar_add_computation_;
+  }
+
   // Current HloComputation instance the AlgebraicSimplifierVisitor is
   // traversing.
   HloComputation* computation_;
@@ -309,8 +308,13 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   // Disable convolution simplification on platforms where it causes a slowdown.
   bool enable_conv_simplification_;
+
+  // Cached computation for adding two scalar F32.
+  HloComputation* scalar_add_computation_ = nullptr;
 };
 
+}  // namespace
+
 bool AlgebraicSimplifierVisitor::Run(
     HloComputation* computation, bool is_layout_sensitive,
     AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
@@ -499,13 +503,13 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
 }
 
 static HloInstruction* BuildTupleConstant(HloComputation* computation,
-                                          const Literal& literal) {
+                                          const LiteralSlice& literal) {
   if (ShapeUtil::IsTuple(literal.shape())) {
     std::vector<HloInstruction*> elems;
     elems.reserve(ShapeUtil::TupleElementCount(literal.shape()));
     for (int i = 0; i < ShapeUtil::TupleElementCount(literal.shape()); ++i) {
       elems.push_back(
-          BuildTupleConstant(computation, LiteralView::Create(literal, {i})));
+          BuildTupleConstant(computation, LiteralSlice(literal, {i})));
     }
     return computation->AddInstruction(HloInstruction::CreateTuple(elems));
   } else {
@@ -912,6 +916,134 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcatHelper(
   return add_result;
 }
 
+StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
+    HloInstruction* dot) {
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  if (dnums.lhs_contracting_dimensions_size() != 1 ||
+      dnums.rhs_contracting_dimensions_size() != 1 ||
+      dnums.lhs_batch_dimensions_size() != 0 ||
+      dnums.rhs_batch_dimensions_size() != 0 ||
+      dot->shape().dimensions_size() != 2) {  // dot output 2D
+    VLOG(10) << "DotOfGather: Can only optimize 2D, non-batch dot operations.";
+    return nullptr;
+  }
+
+  // Optimize either dot(DS(ctA), ctB)) or dot(ctB, DS(ctA)).
+  // Currently a Gather is a DynamicSlice.
+  auto is_dynamic_slice_constant_combination =
+      [](HloInstruction* a, HloInstruction* b, int a_contracting_dimension) {
+        // First operand is a DynamicSlice(Constant).
+        if (a->opcode() != HloOpcode::kDynamicSlice) {
+          return false;
+        }
+        auto* dynamic_slice_op = a->operand(0);
+        if (dynamic_slice_op->opcode() != HloOpcode::kConstant) {
+          return false;
+        }
+        // Second operand is a Constant.
+        if (b->opcode() != HloOpcode::kConstant) {
+          return false;
+        }
+        // The DynamicSlice output is a vector.
+        const Shape& dynamic_slice_shape = a->shape();
+        if (dynamic_slice_shape.dimensions(1 - a_contracting_dimension) != 1) {
+          return false;
+        }
+        // Constant size is the same before and after slice in the contracting
+        // dimension, otherwise we either must precompute for all possible slice
+        // indices or dot is invalid.
+        const Shape& dynamic_slice_op_shape = dynamic_slice_op->shape();
+        if (dynamic_slice_op_shape.dimensions(a_contracting_dimension) !=
+            dynamic_slice_shape.dimensions(a_contracting_dimension)) {
+          return false;
+        }
+        return true;
+      };
+
+  HloInstruction* lhs = dot->mutable_operand(0);
+  HloInstruction* rhs = dot->mutable_operand(1);
+  int lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0);
+  int rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0);
+
+  if (!is_dynamic_slice_constant_combination(
+          lhs, rhs, /*a_contracting_dimension=*/lhs_contracting_dimension) &&
+      !is_dynamic_slice_constant_combination(
+          rhs, lhs, /*a_contracting_dimension=*/rhs_contracting_dimension)) {
+    VLOG(10) << "DotOfGather: Can only optimize dot(DS(ctA), ctB)) or "
+                "dot(ctB, DS(ctA)), where the two constants have equal "
+                "contracting dimensions.";
+    return nullptr;
+  }
+
+  // LHS is DynamicSlice:
+  // input: dot(DS(ctA), ctB))
+  // where DS(ctA) = DS({M x K}, {start, 0}, {1, K}) and ctB = {K x N}.
+  // => input dimensions: dot({1 x K}, {K x N}) => {1 x N}.
+  // output: DS(dot(ctA, ctB))
+  // => output dimensions: DS ({M x N}, {start, 0}, {1, N}) => {1 x N}.
+
+  // RHS is DynamicSlice:
+  // input: dot(ctA, DS(ctB))
+  // where ctA = {M x K} and DS(ctB) = DS({K x N}, {0, start}, {K, 1}).
+  // => input dimensions: dot({M x K}, {K x 1}) => {M x 1}.
+  // output: DS(dot(ctA, ctB))
+  // => output dimensions: DS ({M x N}, {0, start}, {M, 1}) => {M x 1}.
+
+  bool lhs_is_dynamic_slice = lhs->opcode() == HloOpcode::kDynamicSlice;
+
+  // ctA:
+  HloInstruction* left_operand =
+      lhs_is_dynamic_slice ? lhs->mutable_operand(0) : lhs;
+  // ctB:
+  HloInstruction* right_operand =
+      lhs_is_dynamic_slice ? rhs : rhs->mutable_operand(0);
+  // Build ctA x ctB.
+  const int m = left_operand->shape().dimensions(1 - lhs_contracting_dimension);
+  const int n =
+      right_operand->shape().dimensions(1 - rhs_contracting_dimension);
+  auto memoized_shape = ShapeUtil::MakeShape(F32, {m, n});
+  auto* memoized_inst = computation_->AddInstruction(HloInstruction::CreateDot(
+      memoized_shape, left_operand, right_operand, dnums));
+  // Get pair {start, 0} or {0, start}.
+  HloInstruction* original_start_indices =
+      lhs_is_dynamic_slice ? lhs->mutable_operand(1) : rhs->mutable_operand(1);
+  // Position of start:
+  int index_of_non_zero_start = lhs_is_dynamic_slice
+                                    ? 1 - lhs_contracting_dimension
+                                    : 1 - rhs_contracting_dimension;
+  // Position of zero:
+  int index_of_zero_start = 1 - index_of_non_zero_start;
+
+  // Slice out start and 0 components and reorder if necessary.
+  auto indices_type = original_start_indices->shape().element_type();
+  Shape s_shape = ShapeUtil::MakeShape(indices_type, {1});
+  Shape d_shape = ShapeUtil::MakeShape(indices_type, {2});
+  HloInstruction* non_zero_start =
+      computation_->AddInstruction(HloInstruction::CreateSlice(
+          s_shape, original_start_indices, {index_of_non_zero_start},
+          {index_of_non_zero_start + 1}, {1}));
+  HloInstruction* zero_start =
+      computation_->AddInstruction(HloInstruction::CreateSlice(
+          s_shape, original_start_indices, {index_of_zero_start},
+          {index_of_zero_start + 1}, {1}));
+  HloInstruction* new_start_indices =
+      lhs_is_dynamic_slice
+          ? computation_->AddInstruction(HloInstruction::CreateConcatenate(
+                d_shape, {non_zero_start, zero_start}, 0))
+          : computation_->AddInstruction(HloInstruction::CreateConcatenate(
+                d_shape, {zero_start, non_zero_start}, 0));
+
+  // Build DynamicSlice(ctA x ctB).
+  const int new_slice_m = lhs_is_dynamic_slice ? 1 : m;
+  const int new_slice_n = lhs_is_dynamic_slice ? n : 1;
+  auto* memoized_lookup =
+      computation_->AddInstruction(HloInstruction::CreateDynamicSlice(
+          dot->shape(), memoized_inst, new_start_indices,
+          {new_slice_m, new_slice_n}));
+
+  return memoized_lookup;
+}
+
 Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   HloInstruction *lhs, *rhs;
   CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
@@ -941,6 +1073,17 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
     return ReplaceInstruction(dot, dot_of_concat_optimized);
   }
 
+  // Simplify dot(ConstA, Gather(Index, ConstB)) to:
+  // Gather(Index, dot*(ConstA, ConstB)), where dot* is an appropriately
+  // batched version of dot.
+  TF_ASSIGN_OR_RETURN(HloInstruction * dot_of_gather_optimized,
+                      OptimizeDotOfGather(dot));
+  if (dot_of_gather_optimized) {
+    VLOG(10) << "Replaced dot(constA, gather(i, constB)) with "
+                "gather(i, dot*(constA, constB))";
+    return ReplaceInstruction(dot, dot_of_gather_optimized);
+  }
+
   if (enable_dot_strength_reduction_ && !is_layout_sensitive_) {
     TF_ASSIGN_OR_RETURN(bool did_strength_reduction,
                         HandleDotStrengthReduction(dot));
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index d0c99bf818cd54b897ae9da6f9c46862254d64e5..4e082877c776c35bab499c805fef7632765a3ee1 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -2963,5 +2963,208 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
 INSTANTIATE_TEST_CASE_P(DotOfConcatSimplificationTestInstantiation,
                         DotOfConcatSimplificationTest,
                         ::testing::ValuesIn(kDotOfConcatTestSpecs));
+
+struct DotOfGatherTestSpec {
+  int64 m;
+  int64 k;
+  int64 n;
+  int s;      // start index for dynamic slice on the non-contracting dimension
+  int64 lcd;  // left contracting dimension
+  int64 rcd;  // right contracting dimension
+  bool neg;   // is negative testcase
+};
+
+class DotOfGatherSimplificationTest
+    : public HloVerifiedTestBase,
+      public ::testing::WithParamInterface<DotOfGatherTestSpec> {};
+
+// input: dot(DS(ctA), ctB))
+// where DS(ctA) = DS({M x K}, {s, 0}, {1, K}) and ctB = {K x N}.
+// => input dimensions: dot({1 x K}, {K x N}) => {1 x N}.
+// output: DS(dot(ctA, ctB))
+// => output dimensions: DS ({M x N}, {s, 0}, {1, N}) => {1 x N}.
+TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
+  HloComputation::Builder builder(TestName());
+
+  DotOfGatherTestSpec spec = GetParam();
+
+  ASSERT_LE(spec.s, spec.m);
+
+  // For negative tests, increase k of the dynamic slice argument to prevent the
+  // optimization (constants ctA, ctB must have equal contracting dimensions).
+  int64 k_increase = spec.neg ? 5 : 0;
+  int64 lhs_rows = (spec.lcd == 0) ? (spec.k + k_increase) : spec.m;
+  int64 lhs_cols = (spec.lcd == 0) ? spec.m : (spec.k + k_increase);
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {lhs_rows, lhs_cols});
+  auto* lhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
+          /*from=*/10.0, /*to=*/10000.0, /*rows=*/lhs_rows,
+          /*cols=*/lhs_cols)));
+
+  int32 start_row = (spec.lcd == 0) ? 0 : spec.s;
+  int32 start_col = (spec.lcd == 0) ? spec.s : 0;
+  const auto start_indices =
+      builder.AddInstruction(HloInstruction::CreateConstant(
+          Literal::CreateR1<int32>({start_row, start_col})));
+  int64 slice_row_size = (spec.lcd == 0) ? spec.k : 1;
+  int64 slice_col_size = (spec.lcd == 0) ? 1 : spec.k;
+  Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size});
+  auto* ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
+      ds_shape, lhs, start_indices, {slice_row_size, slice_col_size}));
+
+  int64 rhs_rows = (spec.rcd == 0) ? spec.k : spec.n;
+  int64 rhs_cols = (spec.rcd == 0) ? spec.n : spec.k;
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {rhs_rows, rhs_cols});
+  auto* rhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
+          /*from=*/10.0, /*to=*/10000.0, /*rows=*/rhs_rows,
+          /*cols=*/rhs_cols)));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(spec.lcd);
+  dot_dnums.add_rhs_contracting_dimensions(spec.rcd);
+
+  int64 dot_row_size = 1;
+  int64 dot_col_size = spec.n;
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {dot_row_size, dot_col_size});
+  builder.AddInstruction(
+      HloInstruction::CreateDot(dot_shape, ds, rhs, dot_dnums));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  ASSERT_TRUE(run_successful);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
+
+  if (spec.neg) {
+    EXPECT_NE(computation->root_instruction()->opcode(),
+              HloOpcode::kDynamicSlice);
+  } else {
+    EXPECT_THAT(computation->root_instruction(),
+                op::DynamicSlice(op::Dot(op::Constant(), op::Constant()),
+                                 op::Concatenate()));
+  }
+}
+
+// input: dot(ctA, DS(ctB))
+// where ctA = {M x K} and DS(ctB) = DS({K x N}, {0, s}, {K, 1}).
+// => input dimensions: dot({M x K}, {K x 1}) => {M x 1}.
+// output: DS(dot(ctA, ctB))
+// => output dimensions: DS ({M x N}, {0, s}, {M, 1}) => {M x 1}.
+TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
+  HloComputation::Builder builder(TestName());
+
+  DotOfGatherTestSpec spec = GetParam();
+
+  ASSERT_LE(spec.s, spec.n);
+
+  int64 lhs_rows = (spec.lcd == 0) ? spec.k : spec.m;
+  int64 lhs_cols = (spec.lcd == 0) ? spec.m : spec.k;
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {lhs_rows, lhs_cols});
+  auto* lhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
+          /*from=*/10.0, /*to=*/10000.0, /*rows=*/lhs_rows,
+          /*cols=*/lhs_cols)));
+
+  // For negative tests increase k of the dynamic slice argument to prevent the
+  // optimization
+  int64 k_increase = spec.neg ? 5 : 0;
+  int64 rhs_rows = (spec.rcd == 0) ? (spec.k + k_increase) : spec.n;
+  int64 rhs_cols = (spec.rcd == 0) ? spec.n : (spec.k + k_increase);
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {rhs_rows, rhs_cols});
+  auto* rhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
+          /*from=*/10.0, /*to=*/10000.0, /*rows=*/rhs_rows,
+          /*cols=*/rhs_cols)));
+
+  int32 start_row = (spec.rcd == 0) ? 0 : spec.s;
+  int32 start_col = (spec.rcd == 0) ? spec.s : 0;
+  const auto start_indices =
+      builder.AddInstruction(HloInstruction::CreateConstant(
+          Literal::CreateR1<int32>({start_row, start_col})));
+  int64 slice_row_size = (spec.rcd == 0) ? spec.k : 1;
+  int64 slice_col_size = (spec.rcd == 0) ? 1 : spec.k;
+  Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size});
+  auto* ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
+      ds_shape, rhs, start_indices, {slice_row_size, slice_col_size}));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(spec.lcd);
+  dot_dnums.add_rhs_contracting_dimensions(spec.rcd);
+
+  int64 dot_row_size = spec.m;
+  int64 dot_col_size = 1;
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {dot_row_size, dot_col_size});
+  builder.AddInstruction(
+      HloInstruction::CreateDot(dot_shape, lhs, ds, dot_dnums));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  ASSERT_TRUE(run_successful);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
+
+  if (spec.neg) {
+    EXPECT_NE(computation->root_instruction()->opcode(),
+              HloOpcode::kDynamicSlice);
+  } else {
+    EXPECT_THAT(computation->root_instruction(),
+                op::DynamicSlice(op::Dot(op::Constant(), op::Constant()),
+                                 op::Concatenate()));
+  }
+}
+
+std::vector<DotOfGatherTestSpec> DotOfGatherPositiveNegativeTests() {
+  std::vector<DotOfGatherTestSpec> positives = {
+      // "Classical dot", i.e. matrix multiply:
+      {/*m=*/10, /*k=*/10, /*n=*/5, /*s=*/0, /*lcd=*/1, /*rcd=*/0,
+       /*neg=*/false},
+      {/*m=*/20, /*k=*/20, /*n=*/3, /*s=*/2, /*lcd=*/1, /*rcd=*/0,
+       /*neg=*/false},
+      {/*m=*/10, /*k=*/3, /*n=*/10, /*s=*/9, /*lcd=*/1, /*rcd=*/0,
+       /*neg=*/false},
+      // Note: testing for m=1 and n=1 is unnecessary, as this optimizes to
+      // dot(ct, ct) before DotOfGather optimization kicks in.
+      // Contract on rows:
+      {/*m=*/10, /*k=*/10, /*n=*/5, /*s=*/0, /*lcd=*/0, /*rcd=*/0,
+       /*neg=*/false},
+      {/*m=*/20, /*k=*/20, /*n=*/3, /*s=*/2, /*lcd=*/0, /*rcd=*/0,
+       /*neg=*/false},
+      {/*m=*/10, /*k=*/3, /*n=*/10, /*s=*/9, /*lcd=*/0, /*rcd=*/0,
+       /*neg=*/false},
+      // Reverse matrix multiply:
+      {/*m=*/10, /*k=*/10, /*n=*/5, /*s=*/0, /*lcd=*/0, /*rcd=*/1,
+       /*neg=*/false},
+      {/*m=*/20, /*k=*/20, /*n=*/3, /*s=*/2, /*lcd=*/0, /*rcd=*/1,
+       /*neg=*/false},
+      {/*m=*/10, /*k=*/3, /*n=*/10, /*s=*/9, /*lcd=*/0, /*rcd=*/1,
+       /*neg=*/false},
+      // Contract on columns:
+      {/*m=*/10, /*k=*/10, /*n=*/5, /*s=*/0, /*lcd=*/1, /*rcd=*/1,
+       /*neg=*/false},
+      {/*m=*/20, /*k=*/20, /*n=*/3, /*s=*/2, /*lcd=*/1, /*rcd=*/1,
+       /*neg=*/false},
+      {/*m=*/10, /*k=*/3, /*n=*/10, /*s=*/9, /*lcd=*/1, /*rcd=*/1,
+       /*neg=*/false},
+  };
+  std::vector<DotOfGatherTestSpec> all;
+  for (int i = 0; i < positives.size(); i++) {
+    DotOfGatherTestSpec positive_test = positives[i];
+    all.push_back(positive_test);
+    DotOfGatherTestSpec negative_test = positive_test;
+    negative_test.neg = true;
+    all.push_back(negative_test);
+  }
+  return all;
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DotOfGatherSimplificationTestInstantiation, DotOfGatherSimplificationTest,
+    ::testing::ValuesIn(DotOfGatherPositiveNegativeTests()));
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index cf1231bcce4d004284b71a49063e3e470a9eb93f..95b4cb6d2e694063b648b264bd2454ae0a5469ff 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -101,7 +101,7 @@ StatusOr<GlobalDataHandle> AllocationTracker::RegisterInternal(
   return result;
 }
 
-tensorflow::Status AllocationTracker::Unregister(const GlobalDataHandle& data) {
+Status AllocationTracker::Unregister(const GlobalDataHandle& data) {
   tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "Unregister("
           << "handle: " << data.handle() << ")";
@@ -130,7 +130,7 @@ tensorflow::Status AllocationTracker::Unregister(const GlobalDataHandle& data) {
   for (auto& shaped_buffer : it->second) {
     shaped_buffer.reset();
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
@@ -220,8 +220,10 @@ void AllocationTracker::AddAllocationOrIncrementRefCount(
   AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
   auto it = allocation_map.find(device_memory.opaque());
   if (it == allocation_map.end()) {
-    allocation_map[device_memory.opaque()] = {device_memory, device_ordinal,
-                                              /*ref_count=*/1};
+    allocation_map[device_memory.opaque()] = {
+        OwningDeviceMemory(device_memory, device_ordinal,
+                           backend_->memory_allocator()),
+        /*ref_count=*/1};
   } else {
     it->second.ref_count++;
   }
@@ -235,13 +237,12 @@ Status AllocationTracker::DecrementRefCount(se::DeviceMemoryBase device_memory,
   Allocation& allocation = it->second;
   TF_RET_CHECK(allocation.ref_count >= 1);
   if (allocation.ref_count == 1) {
-    TF_RETURN_IF_ERROR(backend_->memory_allocator()->Deallocate(
-        device_ordinal, &device_memory));
+    allocation.device_memory.Free();
     allocation_map.erase(it);
   } else {
     allocation.ref_count--;
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index 1174fa641c06ae053bcc652416bfbc30cabc777c..a7d8927cf7e90d764ff8046df16c71922b11478e 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -76,10 +76,7 @@ class AllocationTracker {
   // Data structure encapsulating single memory allocation on the device.
   struct Allocation {
     // The pointer to this allocation.
-    se::DeviceMemoryBase device_memory;
-
-    // The device that the memory is allocated on.
-    int device_ordinal;
+    OwningDeviceMemory device_memory;
 
     // This is the number of times this memory allocation is referred to by
     // registered data handles.
@@ -126,7 +123,10 @@ class AllocationTracker {
   int64 next_handle_ GUARDED_BY(mutex_);
 
   // A map from device ordinal to AllocationMap.
-  tensorflow::gtl::FlatMap<int, AllocationMap> opaque_to_allocation_map_
+  //
+  // This is not a TF FlatMap because (currently) FlatMap (and therefore
+  // AllocationMap) is not movable.
+  std::unordered_map<int, AllocationMap> opaque_to_allocation_map_
       GUARDED_BY(mutex_);
 
   // A map from data handle to a vector of shaped buffers that represent the
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification.cc b/tensorflow/compiler/xla/service/batch_dot_simplification.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2099916509acdbc2680cc2b5bd405e96f2f7bfb8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification.cc
@@ -0,0 +1,99 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/batch_dot_simplification.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+
+namespace xla {
+StatusOr<bool>
+BatchDotSimplification::ElideDegenerateBatchDimensionFromBatchDot(
+    HloInstruction* batch_dot) {
+  const DotDimensionNumbers& dim_numbers = batch_dot->dot_dimension_numbers();
+  HloInstruction *lhs = batch_dot->mutable_operand(0),
+                 *rhs = batch_dot->mutable_operand(1);
+  const Shape& lhs_shape = lhs->shape();
+
+  std::vector<int64> degenerate_dims;
+  for (int64 batch_dim : dim_numbers.lhs_batch_dimensions()) {
+    if (lhs_shape.dimensions(batch_dim) == 1) {
+      degenerate_dims.push_back(batch_dim);
+    }
+  }
+
+  if (degenerate_dims.empty()) {
+    return false;
+  }
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * new_lhs,
+                      ElideDegenerateDims(lhs, degenerate_dims));
+  TF_ASSIGN_OR_RETURN(HloInstruction * new_rhs,
+                      ElideDegenerateDims(rhs, degenerate_dims));
+
+  DotDimensionNumbers new_dim_numbers = dim_numbers;
+  new_dim_numbers.clear_lhs_batch_dimensions();
+  new_dim_numbers.clear_rhs_batch_dimensions();
+
+  for (int64 i = 0, e = dim_numbers.lhs_batch_dimensions_size() -
+                        degenerate_dims.size();
+       i < e; i++) {
+    new_dim_numbers.add_lhs_batch_dimensions(i);
+    new_dim_numbers.add_rhs_batch_dimensions(i);
+  }
+
+  new_dim_numbers.set_lhs_contracting_dimensions(
+      0,
+      new_dim_numbers.lhs_contracting_dimensions(0) - degenerate_dims.size());
+  new_dim_numbers.set_rhs_contracting_dimensions(
+      0,
+      new_dim_numbers.rhs_contracting_dimensions(0) - degenerate_dims.size());
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * new_dot,
+                      MakeDotHlo(new_lhs, new_rhs, new_dim_numbers));
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * new_dot_reshaped,
+                      MakeReshapeHlo(batch_dot->shape(), new_dot));
+
+  VLOG(2) << "Replaced " << batch_dot->ToString() << " with "
+          << new_dot->ToString();
+
+  TF_RETURN_IF_ERROR(
+      batch_dot->parent()->ReplaceInstruction(batch_dot, new_dot_reshaped));
+
+  return true;
+}
+
+tensorflow::StringPiece BatchDotSimplification::name() const {
+  return "batch-dot-simplification";
+}
+
+StatusOr<bool> BatchDotSimplification::Run(HloModule* module) {
+  bool changed = false;
+  std::vector<HloInstruction*> dot_instrs;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    c_copy_if(computation->instructions(), std::back_inserter(dot_instrs),
+              [](HloInstruction* instr) {
+                return instr->opcode() == HloOpcode::kDot;
+              });
+  }
+  for (HloInstruction* dot_instr : dot_instrs) {
+    TF_ASSIGN_OR_RETURN(bool elided_batch_dim_from_one,
+                        ElideDegenerateBatchDimensionFromBatchDot(dot_instr));
+    changed |= elided_batch_dim_from_one;
+  }
+  return changed;
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification.h b/tensorflow/compiler/xla/service/batch_dot_simplification.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0ca8d8ebac1a3b218e7bd4d6db02b69cfb6916f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BATCH_DOT_SIMPLIFICATION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_BATCH_DOT_SIMPLIFICATION_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+// Simplifies batch dot operations.
+//
+// Normally these would live in the algebraic simplifier, but we want to run
+// this to fixpoint (this pass reaches fixed point in one execution) before we
+// run the DotDecomposer.
+class BatchDotSimplification : public HloPassInterface {
+ public:
+  StatusOr<bool> Run(HloModule* module) override;
+  tensorflow::StringPiece name() const override;
+
+ private:
+  StatusOr<bool> ElideDegenerateBatchDimensionFromBatchDot(
+      HloInstruction* batch_dot);
+};
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BATCH_DOT_SIMPLIFICATION_H_
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38f1a5d3a645f98220ec445bb9bbdf2b9b842109
--- /dev/null
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
@@ -0,0 +1,168 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/batch_dot_simplification.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class BatchDotSimplificationTest : public HloVerifiedTestBase {};
+
+TEST_F(BatchDotSimplificationTest,
+       ElideSingleDegenerateBatchDotDim_VectorVector) {
+  const string hlo_text = R"(
+HloModule BatchDot
+
+main {
+  a = f32[1,3] parameter(0)
+  b = f32[1,3] parameter(1)
+  ROOT dot = f32[1] dot(a, b), lhs_batch_dims={0}, rhs_batch_dims={0}, lhs_contracting_dims={1}, rhs_contracting_dims={1}
+}
+)";
+
+  ParseAndVerifyModule(hlo_text);
+  BatchDotSimplification pass;
+  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = module().entry_computation()->root_instruction();
+  EXPECT_THAT(root,
+              op::Reshape(op::Dot(
+                  op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
+                  /*lhs_contracting_dim=*/0, /*rhs_contracting_dim=*/0)));
+}
+
+TEST_F(BatchDotSimplificationTest,
+       ElideSingleDegenerateBatchDotDim_MatrixVector) {
+  const string hlo_text = R"(
+HloModule BatchDot
+
+main {
+  a = f32[1,9,3] parameter(0)
+  b = f32[1,3] parameter(1)
+  ROOT dot = f32[1,9] dot(a, b), lhs_batch_dims={0}, rhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_contracting_dims={1}
+}
+)";
+
+  ParseAndVerifyModule(hlo_text);
+  BatchDotSimplification pass;
+  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = module().entry_computation()->root_instruction();
+  EXPECT_THAT(root,
+              op::Reshape(op::Dot(
+                  op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
+                  /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/0)));
+}
+
+TEST_F(BatchDotSimplificationTest,
+       ElideSingleDegenerateBatchDotDim_MatrixMatrix) {
+  const string hlo_text = R"(
+HloModule BatchDot
+
+main {
+  a = f32[1,9,3] parameter(0)
+  b = f32[1,3,7] parameter(1)
+  ROOT dot = f32[1,9,7] dot(a, b), lhs_batch_dims={0}, rhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_contracting_dims={1}
+}
+)";
+
+  ParseAndVerifyModule(hlo_text);
+  BatchDotSimplification pass;
+  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = module().entry_computation()->root_instruction();
+  EXPECT_THAT(root,
+              op::Reshape(op::Dot(
+                  op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
+                  /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/0)));
+}
+
+TEST_F(BatchDotSimplificationTest,
+       ElideMultipleDegenerateBatchDotDims_VectorVector) {
+  const string hlo_text = R"(
+HloModule BatchDot
+
+main {
+  a = f32[9,1,7,1,3] parameter(0)
+  b = f32[9,1,7,1,3] parameter(1)
+  ROOT dot = f32[9,1,7,1] dot(a, b), lhs_batch_dims={0,1,2,3}, rhs_batch_dims={0,1,2,3}, lhs_contracting_dims={4}, rhs_contracting_dims={4}
+}
+)";
+
+  ParseAndVerifyModule(hlo_text);
+  BatchDotSimplification pass;
+  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = module().entry_computation()->root_instruction();
+  EXPECT_THAT(root,
+              op::Reshape(op::Dot(
+                  op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
+                  /*lhs_contracting_dim=*/2, /*rhs_contracting_dim=*/2)));
+}
+
+TEST_F(BatchDotSimplificationTest,
+       ElideMultipleDegenerateBatchDotDims_VectorMatrix) {
+  const string hlo_text = R"(
+HloModule BatchDot
+
+main {
+  a = f32[9,1,7,1,3] parameter(0)
+  b = f32[9,1,7,1,20,3] parameter(1)
+  ROOT dot = f32[9,1,7,1,20] dot(a, b), lhs_batch_dims={0,1,2,3}, rhs_batch_dims={0,1,2,3}, lhs_contracting_dims={4}, rhs_contracting_dims={5}
+}
+)";
+
+  ParseAndVerifyModule(hlo_text);
+  BatchDotSimplification pass;
+  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = module().entry_computation()->root_instruction();
+  EXPECT_THAT(root,
+              op::Reshape(op::Dot(
+                  op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
+                  /*lhs_contracting_dim=*/2, /*rhs_contracting_dim=*/3)));
+}
+
+TEST_F(BatchDotSimplificationTest,
+       ElideMultipleDegenerateBatchDotDims_MatrixMatrix) {
+  const string hlo_text = R"(
+HloModule BatchDot
+
+main {
+  a = f32[9,1,7,1,19,3] parameter(0)
+  b = f32[9,1,7,1,3,20] parameter(1)
+  ROOT dot = f32[9,1,7,1,19,20] dot(a, b), lhs_batch_dims={0,1,2,3}, rhs_batch_dims={0,1,2,3}, lhs_contracting_dims={5}, rhs_contracting_dims={4}
+}
+)";
+
+  ParseAndVerifyModule(hlo_text);
+  BatchDotSimplification pass;
+  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = module().entry_computation()->root_instruction();
+  EXPECT_THAT(root,
+              op::Reshape(op::Dot(
+                  op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
+                  /*lhs_contracting_dim=*/3, /*rhs_contracting_dim=*/2)));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index 38086bd7e121847be6b6b69415cfe87814e7fc24..96e02b82b97ff2fd682638f4c6297cbc2019c481 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -15,35 +15,32 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/batchnorm_expander.h"
 
-#include <algorithm>
 #include <memory>
-#include <numeric>
-#include <set>
 #include <string>
 #include <utility>
 #include <vector>
 
-#include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/hlo_query.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
+namespace {
+
 // BatchNormExpanderVisitor traverses the HLO computation and rewrites BatchNorm
 // operations into smaller operations.
 class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
@@ -80,17 +77,25 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
         rewrite_grad_op_(rewrite_grad_op),
         use_fusion_(use_fusion) {}
 
-  HloComputation* GetScalarBinaryComputation(PrimitiveType primitive_type,
-                                             HloOpcode opcode) {
-    HloComputation::Builder b("scalar_computation");
-    auto scalar_lhs = b.AddInstruction(HloInstruction::CreateParameter(
-        0, ShapeUtil::MakeShape(primitive_type, {}), "scalar_lhs"));
-    auto scalar_rhs = b.AddInstruction(HloInstruction::CreateParameter(
-        1, ShapeUtil::MakeShape(primitive_type, {}), "scalar_rhs"));
-    auto scalar_op = b.AddInstruction(
-        HloInstruction::CreateBinary(ShapeUtil::MakeShape(primitive_type, {}),
-                                     opcode, scalar_lhs, scalar_rhs));
-    return computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
+  HloComputation* GetOrCreateScalarAddComputation(
+      PrimitiveType primitive_type) {
+    HloComputation** scalar_add_computation =
+        &scalar_add_computations_[primitive_type];
+    if (*scalar_add_computation) {
+      return *scalar_add_computation;
+    }
+
+    HloComputation::Builder b("scalar_add_computation");
+    Shape shape = ShapeUtil::MakeShape(primitive_type, {});
+    auto scalar_lhs = b.AddInstruction(
+        HloInstruction::CreateParameter(0, shape, "scalar_lhs"));
+    auto scalar_rhs = b.AddInstruction(
+        HloInstruction::CreateParameter(1, shape, "scalar_rhs"));
+    auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary(
+        shape, HloOpcode::kAdd, scalar_lhs, scalar_rhs));
+    *scalar_add_computation =
+        computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
+    return *scalar_add_computation;
   }
 
   // Current HloComputation instance the BatchNormExpander is
@@ -105,6 +110,10 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
   // Whether rewrite has occurred.
   bool changed_ = false;
 
+  // Cached computations for adding two scalars.
+  tensorflow::gtl::FlatMap<PrimitiveType, HloComputation*>
+      scalar_add_computations_;
+
   // Replaces the existing HLO instruction old_instruction, with
   // new_instruction, and marks the optimizer status as changed.
   // Returns the Status representing the result of the replace operation.
@@ -129,6 +138,8 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
   }
 };
 
+}  // namespace
+
 bool BatchNormExpanderVisitor::Run(HloComputation* computation,
                                    bool rewrite_training_op,
                                    bool rewrite_inference_op,
@@ -199,7 +210,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
       HloInstruction::CreateBroadcast(operand_shape, offset, {feature_index}));
 
   HloComputation* add_reduce_computation =
-      GetScalarBinaryComputation(ptype, HloOpcode::kAdd);
+      GetOrCreateScalarAddComputation(ptype);
 
   // X^2.
   auto operand_squared = add(HloInstruction::CreateBinary(
@@ -500,7 +511,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
                                        grad_output, activation_minus_mean));
 
   HloComputation* add_reduce_computation =
-      GetScalarBinaryComputation(ptype, HloOpcode::kAdd);
+      GetOrCreateScalarAddComputation(ptype);
 
   // sum(Grad[Y] * (X - E[X])).
   auto sum_grad_output_times_activiation_minus_mean =
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index 313910a861f7f4c0d1d60b738caef40e76cc4260..5e1499ee6b6ef397f95f7ed29e808d530777bd07 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -149,12 +149,12 @@ TEST_F(BFloat16PropagationTest, ConvertConstantLiteral) {
   EXPECT_TRUE(OutputsBF16(dot->operand(1)));
   EXPECT_EQ(dot->operand(0)->opcode(), HloOpcode::kConstant);
   EXPECT_EQ(dot->operand(1)->opcode(), HloOpcode::kConstant);
-  LiteralTestUtil::ExpectEqual(
+  EXPECT_TRUE(LiteralTestUtil::Equal(
       dot->operand(0)->literal(),
-      *LiteralTestUtil::ConvertF32ToBF16(*Literal::CreateFromArray(array_a)));
-  LiteralTestUtil::ExpectEqual(
+      *Literal::ConvertF32ToBF16(*Literal::CreateFromArray(array_a))));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
       dot->operand(1)->literal(),
-      *LiteralTestUtil::ConvertF32ToBF16(*Literal::CreateFromArray(array_b)));
+      *Literal::ConvertF32ToBF16(*Literal::CreateFromArray(array_b))));
 }
 
 // Tests that BF16 can be propagated through nested tuples.
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 94ccfedf6289b4af1accebd358671c3e2bc10ba7..c0b8bf903923a327fb1378eafb51a7d493d5e62d 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -24,6 +24,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/buffer_value_containers.h"
 #include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -699,7 +700,7 @@ BufferAssignmentProto BufferAssignment::ToProto() const {
         BufferAssignmentProto::BufferAlias* proto_alias =
             proto.add_buffer_aliases();
         LogicalBufferProto::Location proto_alias_location =
-            LogicalBuffer::ToLocationProto(*alias.instruction(), alias.index());
+            BufferValue::ToLocationProto(*alias.instruction(), alias.index());
         proto_alias->set_source_buffer_id(buffer.id());
         proto_alias->mutable_location()->Swap(&proto_alias_location);
       }
@@ -1083,7 +1084,9 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       VLOG(2) << "Simulating heap for color " << color;
       int64 alignment = assignment->color_alignment_(color);
       HeapSimulator::Options options;
-      options.buffers_to_assign = &single_colored_set.second;
+      BufferValueFlatSet buffer_value_set =
+          ToBufferValueFlatSet(single_colored_set.second);
+      options.buffers_to_assign = &buffer_value_set;
       TF_ASSIGN_OR_RETURN(
           const HeapSimulator::Result result,
           HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
@@ -1111,7 +1114,9 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
         VLOG(2) << "Simulating heap for color " << color;
         int64 alignment = assignment->color_alignment_(color);
         HeapSimulator::Options options;
-        options.buffers_to_assign = &single_colored_set.second;
+        BufferValueFlatSet buffer_value_set =
+            ToBufferValueFlatSet(single_colored_set.second);
+        options.buffers_to_assign = &buffer_value_set;
         TF_ASSIGN_OR_RETURN(
             const HeapSimulator::Result result,
             HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
@@ -1224,7 +1229,10 @@ void BufferAssigner::AssignBuffersFromHeapSimulator(
   BufferAllocation* allocation = assignment->NewEmptyAllocation(
       result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true, color);
   for (const auto& buffer_chunk : result.chunk_map) {
-    const LogicalBuffer& buffer = *buffer_chunk.first;
+    // TODO(lauj) Remove this down_cast after downstream users of
+    // BufferAllocation::assigned_buffers() are updated to use BufferValue.
+    const LogicalBuffer& buffer =
+        *CHECK_NOTNULL(dynamic_cast<const LogicalBuffer*>(buffer_chunk.first));
     const HeapSimulator::Chunk& chunk = buffer_chunk.second;
     assignment->AddAssignment(allocation, buffer, chunk.offset, chunk.size);
   }
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 15fd905e8d593994c1cd5ec77cef6db7c2dbefdb..ad0b0bf7c25d7194a06801e4ef1c9ee961f6b915 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -415,10 +415,10 @@ class BufferAssignment {
   // Only BufferAssigner can build or modify BufferAssignments.
   friend class BufferAssigner;
 
-  explicit BufferAssignment(const HloModule* module,
-                            std::unique_ptr<BufferLiveness> liveness,
-                            LogicalBuffer::SizeFunction buffer_size,
-                            LogicalBuffer::AlignmentFunction color_alignment)
+  BufferAssignment(const HloModule* module,
+                   std::unique_ptr<BufferLiveness> liveness,
+                   LogicalBuffer::SizeFunction buffer_size,
+                   LogicalBuffer::AlignmentFunction color_alignment)
       : module_(module),
         liveness_(std::move(liveness)),
         buffer_size_(std::move(buffer_size)),
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc
index 37982aaef9eddd64ef6b57ad5a9cf8dd6a565097..810d597e730c1823668c81598df6138655e58b55 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -44,7 +43,7 @@ StatusOr<std::unique_ptr<BufferLiveness>> BufferLiveness::Run(
   return std::move(liveness);
 }
 
-tensorflow::Status BufferLiveness::Analyze() {
+Status BufferLiveness::Analyze() {
   TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module_));
   for (auto* computation : module_->computations()) {
     if (computation->IsFusionComputation()) {
@@ -71,7 +70,7 @@ tensorflow::Status BufferLiveness::Analyze() {
   }
 
   XLA_VLOG_LINES(3, ToString());
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 string BufferLiveness::ToString() const {
@@ -105,8 +104,8 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
   for (const BufferAlias& alias : points_to_analysis_->GetBufferAliases(a)) {
     // Every user of 'a' must be a predecessor of 'b' or 'b' itself.
     for (auto user : alias.instruction()->users()) {
-      if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(), user,
-                                  points_to_analysis())) {
+      if (points_to_analysis().DoesNotUseOperandBuffer(alias.instruction(),
+                                                       alias.index(), user)) {
         continue;
       }
       if (user != b.instruction() &&
@@ -132,9 +131,8 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
   // the qualifications specified in CanShareOperandBufferWithUser.
   for (const BufferAlias& alias : points_to_analysis_->GetBufferAliases(a)) {
     if (b.instruction()->IsUserOf(alias.instruction()) &&
-        !CanShareOperandBufferWithUser(alias.instruction(), alias.index(),
-                                       b.instruction(), b.index(),
-                                       points_to_analysis())) {
+        !points_to_analysis().CanShareOperandBufferWithUser(
+            alias.instruction(), alias.index(), b.instruction(), b.index())) {
       return false;
     }
   }
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.h b/tensorflow/compiler/xla/service/buffer_liveness.h
index 11834a5127e383cc2ec2ab3fe1bb82ba86e4abed..cdd3cf4032ef6916086e1c2d148b575192503000 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.h
+++ b/tensorflow/compiler/xla/service/buffer_liveness.h
@@ -89,7 +89,7 @@ class BufferLiveness {
 
   // Perform buffer liveness analysis. This method must be called prior to
   // MayInterfere or MaybeLiveOut.
-  tensorflow::Status Analyze();
+  Status Analyze();
 
   // Returns true if the live range of the buffer of 'a' is strictly before the
   // live range of the buffer of 'b' (they do not overlap).
diff --git a/tensorflow/compiler/xla/service/buffer_value_containers.h b/tensorflow/compiler/xla/service/buffer_value_containers.h
new file mode 100644
index 0000000000000000000000000000000000000000..305914fca828f110bf54239bddb1590172562b16
--- /dev/null
+++ b/tensorflow/compiler/xla/service/buffer_value_containers.h
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_VALUE_CONTAINERS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_VALUE_CONTAINERS_H_
+
+#include "tensorflow/compiler/xla/service/buffer_value.h"
+#include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/core/lib/gtl/compactptrset.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+
+namespace xla {
+
+// Define various containers of BufferValues, and utilities to convert from
+// containers of LogicalBuffers to containers of BufferValues.
+
+using BufferValueCompactPointerSet =
+    tensorflow::gtl::CompactPointerSet<const BufferValue*>;
+template <class LogicalBufferContainerT>
+BufferValueCompactPointerSet ToBufferValueCompactPointerSet(
+    const LogicalBufferContainerT& logical_buffer_container) {
+  BufferValueCompactPointerSet output;
+  for (const LogicalBuffer* buffer : logical_buffer_container) {
+    output.insert(buffer);
+  }
+  return output;
+}
+
+using BufferValueFlatSet = tensorflow::gtl::FlatSet<const BufferValue*>;
+template <class LogicalBufferContainerT>
+BufferValueFlatSet ToBufferValueFlatSet(
+    const LogicalBufferContainerT& logical_buffer_container) {
+  BufferValueFlatSet output;
+  output.reserve(logical_buffer_container.size());
+  for (const LogicalBuffer* buffer : logical_buffer_container) {
+    output.insert(buffer);
+  }
+  return output;
+}
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_VALUE_CONTAINERS_H_
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
index c10609e67fcdec459baf25a95173bbf700994be9..7f2ce0e8974c01b09664235d7b9d19555b2705a3 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.h
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -75,48 +75,42 @@ class CompileOnlyService : public Service {
   // Override Service methods that require or imply the existence of an
   // execute backend.  Note that this does not include TransferToClient, as
   // computing constants produces global data that we may wish to transfer.
-  tensorflow::Status Execute(const ExecuteRequest* arg,
-                             ExecuteResponse* result) override {
+  Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override {
     return Unimplemented("CompileOnlyService does not support execution.");
   }
-  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
-                                     ExecuteParallelResponse* result) override {
+  Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                         ExecuteParallelResponse* result) override {
     return Unimplemented("CompileOnlyService does not support execution.");
   }
-  tensorflow::Status GetDeviceHandles(
-      const GetDeviceHandlesRequest* arg,
-      GetDeviceHandlesResponse* result) override {
+  Status GetDeviceHandles(const GetDeviceHandlesRequest* arg,
+                          GetDeviceHandlesResponse* result) override {
     return Unimplemented("CompileOnlyService does not support devices.");
   }
-  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
-                                  ExecuteAsyncResponse* result) override {
+  Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                      ExecuteAsyncResponse* result) override {
     return Unimplemented("CompileOnlyService does not support execution.");
   }
-  tensorflow::Status WaitForExecution(
-      const WaitForExecutionRequest* arg,
-      WaitForExecutionResponse* result) override {
+  Status WaitForExecution(const WaitForExecutionRequest* arg,
+                          WaitForExecutionResponse* result) override {
     return Unimplemented("CompileOnlyService does not support execution.");
   }
-  tensorflow::Status TransferToServer(
-      const TransferToServerRequest* arg,
-      TransferToServerResponse* result) override {
+  Status TransferToServer(const TransferToServerRequest* arg,
+                          TransferToServerResponse* result) override {
     return Unimplemented(
         "CompileOnlyService does not support device data transfers.");
   }
-  tensorflow::Status TransferToInfeed(
-      const TransferToInfeedRequest* arg,
-      TransferToInfeedResponse* result) override {
+  Status TransferToInfeed(const TransferToInfeedRequest* arg,
+                          TransferToInfeedResponse* result) override {
     return Unimplemented(
         "CompileOnlyService does not support device data transfers.");
   }
-  tensorflow::Status TransferFromOutfeed(
-      const TransferFromOutfeedRequest* arg,
-      TransferFromOutfeedResponse* result) override {
+  Status TransferFromOutfeed(const TransferFromOutfeedRequest* arg,
+                             TransferFromOutfeedResponse* result) override {
     return Unimplemented(
         "CompileOnlyService does not support device data transfers.");
   }
-  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
-                                 ResetDeviceResponse* result) override {
+  Status ResetDevice(const ResetDeviceRequest* arg,
+                     ResetDeviceResponse* result) override {
     return Unimplemented("CompileOnlyService does not support devices.");
   }
 
diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc
index d2d4f14fcec35f5b51a2670a646154ce8bb9bfc1..cb61f3da39fb8eef69fd81066d87a1da91a62935 100644
--- a/tensorflow/compiler/xla/service/computation_layout.cc
+++ b/tensorflow/compiler/xla/service/computation_layout.cc
@@ -23,12 +23,15 @@ limitations under the License.
 
 namespace xla {
 
-ComputationLayout::ComputationLayout(const ProgramShape& program_shape)
+ComputationLayout::ComputationLayout(const ProgramShape& program_shape,
+                                     bool ignore_layouts)
     : result_layout_(program_shape.result()) {
   for (auto& shape : program_shape.parameters()) {
     parameter_layouts_.emplace_back(shape);
   }
-  SetToDefaultLayout();
+  if (ignore_layouts) {
+    SetToDefaultLayout();
+  }
 }
 
 void ComputationLayout::SetToDefaultLayout() {
diff --git a/tensorflow/compiler/xla/service/computation_layout.h b/tensorflow/compiler/xla/service/computation_layout.h
index 80e102411c7885669947d89f378b1ec61e3e4e96..53c3a3f7b738687db3098acfaef1ae87860d0440 100644
--- a/tensorflow/compiler/xla/service/computation_layout.h
+++ b/tensorflow/compiler/xla/service/computation_layout.h
@@ -34,8 +34,9 @@ class ComputationLayout {
  public:
   // Constructs a ComputationLayout from a ProgramShape. The layouts of the
   // parameters and results are set to the default layout. Layouts in the
-  // ProgramShape are ignored.
-  explicit ComputationLayout(const ProgramShape& program_shape);
+  // ProgramShape are ignored if ignore_layouts is true.
+  explicit ComputationLayout(const ProgramShape& program_shape,
+                             bool ignore_layouts = true);
 
   // Returns the layout of a particular parameter.
   const ShapeLayout& parameter_layout(int64 param_no) const {
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index cbe2ba2e50ab213133196987cf486152edc9d785..33d8338809d4e8c7c4774f062c3dda5494543ca6 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
-#include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 7e6d58c7fa5ccaf3e0a6f21d43a54906a3fbe408..bfd85f257fb9550a6babb2459a7227ca9003a14f 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -103,6 +103,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
+        "//tensorflow/compiler/xla/service:batch_dot_simplification",
         "//tensorflow/compiler/xla/service:batchnorm_expander",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
@@ -125,6 +126,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_scheduling",
         "//tensorflow/compiler/xla/service:hlo_subcomputation_unification",
         "//tensorflow/compiler/xla/service:hlo_verifier",
+        "//tensorflow/compiler/xla/service:indexed_array_analysis",
         "//tensorflow/compiler/xla/service:inliner",
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
@@ -176,6 +178,7 @@ cc_library(
         ":runtime_matmul",
         ":runtime_matmul_mkl",
         ":runtime_single_threaded_conv2d",
+        ":runtime_single_threaded_fft",
         ":runtime_single_threaded_matmul",
         "@llvm//:execution_engine",
         "@llvm//:core",
@@ -295,6 +298,15 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "target_machine_features_fake",
+    testonly = 1,
+    hdrs = ["target_machine_features_fake.h"],
+    deps = [
+        ":target_machine_features",
+    ],
+)
+
 cc_library(
     name = "ir_function",
     srcs = ["ir_function.cc"],
@@ -336,6 +348,7 @@ cc_library(
     deps = [
         ":cpu_options",
         ":cpu_runtime",
+        ":ir_emission_utils",
         ":target_machine_features",
         ":vector_support_library",
         "//tensorflow/compiler/xla:shape_util",
@@ -408,7 +421,6 @@ cc_library(
         "//tensorflow/core:lib",
         "@llvm//:analysis",
         "@llvm//:core",
-        "@llvm//:execution_engine",
         "@llvm//:ipo",
         "@llvm//:mc",
         "@llvm//:object",
@@ -505,7 +517,6 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:framework",
         "//tensorflow/core:framework_lite",
         "//third_party/eigen3",
     ],
@@ -567,6 +578,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "runtime_single_threaded_fft",
+    srcs = [
+        "runtime_fft_impl.h",
+        "runtime_single_threaded_fft.cc",
+    ],
+    hdrs = ["runtime_single_threaded_fft.h"],
+    copts = runtime_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:framework_lite",
+        "//third_party/eigen3",
+    ],
+)
+
 cc_library(
     name = "runtime_single_threaded_matmul",
     srcs = ["runtime_single_threaded_matmul.cc"],
@@ -660,6 +687,7 @@ cc_library(
     hdrs = ["ir_emission_utils.h"],
     deps = [
         ":cpu_runtime",
+        ":target_machine_features",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/service:hlo",
@@ -672,6 +700,7 @@ tf_cc_test(
     srcs = ["ir_emission_utils_test.cc"],
     deps = [
         ":ir_emission_utils",
+        ":target_machine_features_fake",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
@@ -690,6 +719,7 @@ cc_library(
     deps = [
         ":dot_op_emitter",
         ":ir_emission_utils",
+        ":target_machine_features",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:computation_layout",
         "//tensorflow/compiler/xla/service:layout_assignment",
@@ -703,6 +733,7 @@ tf_cc_test(
     srcs = ["cpu_layout_assignment_test.cc"],
     deps = [
         ":cpu_layout_assignment",
+        ":target_machine_features_fake",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
@@ -727,6 +758,7 @@ cc_library(
     deps = [
         ":cpu_runtime",
         ":ir_emission_utils",
+        ":target_machine_features",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -741,6 +773,7 @@ tf_cc_test(
     srcs = ["conv_canonicalization_test.cc"],
     deps = [
         ":conv_canonicalization",
+        ":target_machine_features_fake",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
@@ -779,6 +812,7 @@ cc_library(
         ":dot_op_emitter",
         ":ir_emission_utils",
         ":shape_partition",
+        ":target_machine_features",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_cost_analysis",
         "//tensorflow/compiler/xla/service:hlo_pass",
@@ -791,6 +825,7 @@ tf_cc_test(
     deps = [
         ":cpu_executable",
         ":parallel_task_assignment",
+        ":target_machine_features_fake",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
@@ -913,3 +948,17 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
+
+tf_cc_test(
+    name = "cpu_eigen_tensor_alignment_test",
+    size = "small",
+    srcs = ["cpu_eigen_tensor_alignment_test.cc"],
+    deps = [
+        ":dot_op_emitter",
+        ":ir_emission_utils",
+        ":target_machine_features_fake",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
index 2136aeb3877685373efaf5bf702a42b39a63f082..0985b9297fe487f3523826cb0978c17775549735 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
@@ -33,7 +33,8 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
   for (HloInstruction* hlo :
        module->entry_computation()->MakeInstructionPostOrder()) {
     if (hlo->opcode() == HloOpcode::kConvolution &&
-        !PotentiallyImplementedAsEigenConvolution(*hlo)) {
+        !PotentiallyImplementedAsEigenConvolution(*hlo,
+                                                  target_machine_features_)) {
       const ConvolutionDimensionNumbers& dnums =
           hlo->convolution_dimension_numbers();
       auto input_batch_dim = dnums.input_batch_dimension();
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h
index 9b2c3d82eb673ce542cc03ec706015967dc975b6..e6fd1499edd0095395194200a5b444ad61e7e39d 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CONV_CANONICALIZATION_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CONV_CANONICALIZATION_H_
 
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
@@ -32,12 +33,19 @@ namespace cpu {
 // convolutions can run faster.
 class ConvCanonicalization : public HloPassInterface {
  public:
+  explicit ConvCanonicalization(
+      const TargetMachineFeatures* target_machine_features)
+      : target_machine_features_(*target_machine_features) {}
+
   ~ConvCanonicalization() override {}
   tensorflow::StringPiece name() const override {
     return "convolution-canonicalization";
   }
 
   StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  const TargetMachineFeatures& target_machine_features_;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
index 968f53d5c706651d2a470a853e0e9b601c0ed2df..375b017b09263c20c1b1ef8329f7e2f6a573dda4 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -89,7 +90,11 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) {
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
 
-  ConvCanonicalization conv_canonicalization;
+  cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features(
+      [](int64 shape_size) {
+        return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
+      });
+  ConvCanonicalization conv_canonicalization(&target_machine_features);
   EXPECT_TRUE(conv_canonicalization.Run(module.get()).ValueOrDie());
 
   const HloInstruction* output_reshape = entry_computation->root_instruction();
@@ -146,7 +151,11 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  ConvCanonicalization conv_canonicalization;
+  cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features(
+      [](int64 shape_size) {
+        return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
+      });
+  ConvCanonicalization conv_canonicalization(&target_machine_features);
   EXPECT_FALSE(conv_canonicalization.Run(module.get()).ValueOrDie());
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 91ed6e427ac7c20461cde91ef0cfdf13f4b55992..25b18eff20f901fc34343a12bfbd353ecec49cfb 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -43,6 +43,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
+#include "tensorflow/compiler/xla/service/batch_dot_simplification.h"
 #include "tensorflow/compiler/xla/service/batchnorm_expander.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
@@ -81,6 +82,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
+#include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
 #include "tensorflow/compiler/xla/service/inliner.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
@@ -231,7 +233,10 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
 };
 }  // namespace
 
-Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
+Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
+                                 llvm::TargetMachine* target_machine) {
+  LLVMTargetMachineFeatures target_machine_features(target_machine);
+
   // Optimization pipeline.
   HloPassPipeline pipeline("CPU");
   pipeline.AddInvariantChecker<HloVerifier>();
@@ -248,8 +253,9 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner
   // pass.
   pipeline.AddPass<CallInliner>();
+  pipeline.AddPass<BatchDotSimplification>();
   pipeline.AddPass<DotDecomposer>();
-  pipeline.AddPass<ConvCanonicalization>();
+  pipeline.AddPass<ConvCanonicalization>(&target_machine_features);
   {
     auto& pass =
         pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
@@ -278,10 +284,12 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
     pass.AddPass<HloConstantFolding>();
     pass.AddPass<ConditionalSimplifier>();
   }
+  pipeline.AddPass<IndexedArrayAnalysisPrinterPass>();
   pipeline.AddPass<TransposeFolding>(
-      [](const HloInstruction& dot,
-         const TransposeFolding::OperandIndices& candidate_operands) {
-        return PotentiallyImplementedAsEigenDot(dot)
+      [&target_machine_features](
+          const HloInstruction& dot,
+          const TransposeFolding::OperandIndices& candidate_operands) {
+        return PotentiallyImplementedAsEigenDot(dot, target_machine_features)
                    ? candidate_operands
                    : TransposeFolding::OperandIndices{};
       },
@@ -296,7 +304,8 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
       ReducePrecisionInsertion::PassTiming::AFTER_FUSION);
 
   pipeline.AddPass<CpuLayoutAssignment>(
-      module->device_entry_computation_layout());
+      module->mutable_device_entry_computation_layout(),
+      &target_machine_features);
   // The LayoutAssignment pass may leave behind kCopy instructions which are
   // duplicate or NOPs, so remove them with algebraic simplification and CSE.
   pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
@@ -316,8 +325,8 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
     // and thread synchronization dependencies which would likely increase
     // binary size (and most AOT applications are single-threaded).
     // TODO(b/29630486) Support multi-threaded AOT.
-    pipeline.AddPass<ParallelTaskAssigner>(max_parallelism,
-                                           ShapeSizeBytesFunction());
+    pipeline.AddPass<ParallelTaskAssigner>(
+        max_parallelism, ShapeSizeBytesFunction(), &target_machine_features);
   }
   // Copy insertion should be performed immediately before IR emission to avoid
   // inserting unnecessary copies (later pass adds an instruction which
@@ -470,7 +479,13 @@ StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
   VLOG(2) << "Before optimization:";
   XLA_VLOG_LINES(2, module->ToString());
 
-  TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false));
+  std::unique_ptr<llvm::TargetMachine> jit_target_machine =
+      SimpleOrcJIT::InferTargetMachineForJIT(
+          CompilerTargetOptions(module->config()),
+          CodeGenOptLevel(module->config()));
+
+  TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false,
+                                  jit_target_machine.get()));
 
   VLOG(2) << "After optimization:";
   XLA_VLOG_LINES(2, module->ToString());
@@ -535,7 +550,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   // and reduced memory usage (as compared to using DependencyHloOrdering).
   TF_ASSIGN_OR_RETURN(
       SequentialHloOrdering::HloModuleSequence module_sequence,
-      CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction()));
+      CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction(),
+                                     DFSMemoryScheduler));
 
   // Run buffer analysis on the HLO graph. This analysis figures out which
   // temporary buffers are required to run the computation.
@@ -560,10 +576,11 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   // GetEmbeddedComputations guarantees that a called computation occurs
   // before a caller computation.
 
+  LLVMTargetMachineFeatures target_machine_features(jit->target_machine());
   IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
                        std::move(instruction_to_profile_idx),
                        std::move(computation_to_profile_idx),
-                       jit->target_machine(), jit->external_constant_pool());
+                       &target_machine_features, jit->external_constant_pool());
 
   for (auto embedded_computation :
        entry_computation->MakeEmbeddedComputationsList()) {
@@ -705,7 +722,8 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     VLOG(2) << "Before optimization:";
     XLA_VLOG_LINES(2, module->ToString());
 
-    TF_RETURN_IF_ERROR(RunHloPasses(module, /*is_aot_compile=*/true));
+    TF_RETURN_IF_ERROR(
+        RunHloPasses(module, /*is_aot_compile=*/true, target_machine.get()));
 
     VLOG(2) << "After optimization:";
     XLA_VLOG_LINES(2, module->ToString());
@@ -745,10 +763,11 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
           &hlo_profile_index_map, &hlo_profile_printer_data));
     }
 
+    LLVMTargetMachineFeatures target_machine_features(target_machine.get());
     IrEmitter ir_emitter(*module, *assignment, &llvm_module,
                          std::move(instruction_to_profile_idx),
                          std::move(computation_to_profile_idx),
-                         target_machine.get(),
+                         &target_machine_features,
                          /*external_constant_pool=*/nullptr);
     HloComputation* computation = module->entry_computation();
     for (auto embedded_computation :
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index 65b05f04fa8d9c72e7bfb6978f6a6384dfbcf976..e56f9f01134f84b4698c078b750b0c1fdca7748e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "llvm/Target/TargetMachine.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/llvm_compiler.h"
@@ -148,7 +149,8 @@ class CpuCompiler : public LLVMCompiler {
 
   // Runs the HLO passes which are necessary for both optimizations and
   // correctness.
-  Status RunHloPasses(HloModule* module, bool is_aot_compile);
+  Status RunHloPasses(HloModule* module, bool is_aot_compile,
+                      llvm::TargetMachine* target_machine);
 
   TF_DISALLOW_COPY_AND_ASSIGN(CpuCompiler);
 };
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d12fa6bb9ad2054bdc052c9d7b3729cc28e11f6d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+
+namespace xla {
+namespace cpu {
+namespace {
+
+// Test that we don't call into Eigen with tensors too small to be aligned
+// reliably.
+
+class CpuEigenTensorAlignmentTest : public ::testing::Test {};
+
+TEST_F(CpuEigenTensorAlignmentTest, EigenDotAlignment) {
+  string hlo_string = R"(
+HloModule DotOperation
+
+ENTRY DotOperation {
+  arg0 = f32[5,256] parameter(0)
+  arg1 = f32[256,1024] parameter(1)
+  ROOT dot = f32[5,1024] dot(arg0, arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+
+  HloInstruction* dot = module->entry_computation()->root_instruction();
+
+  TargetMachineFeaturesWithFakeAlignmentLogic target_machine_with_no_alignment(
+      [](int64 size) { return 1; });
+
+  EXPECT_FALSE(
+      PotentiallyImplementedAsEigenDot(*dot, target_machine_with_no_alignment));
+
+  TargetMachineFeaturesWithFakeAlignmentLogic
+      target_machine_with_full_alignment([](int64 size) {
+        return TargetMachineFeatures::kEigenExpectedTensorAlignment;
+      });
+
+  EXPECT_TRUE(PotentiallyImplementedAsEigenDot(
+      *dot, target_machine_with_full_alignment));
+}
+
+TEST_F(CpuEigenTensorAlignmentTest, EigenConvAlignment) {
+  string hlo_string = R"(
+HloModule ConvOperation
+
+ENTRY ConvOperation {
+  arg0 = f32[1,2,1] parameter(0)
+  arg1 = f32[1,1,1] parameter(1)
+  ROOT conv = f32[1,2,1] convolution(arg0, arg1), window={size=1}, dim_labels=b0f_0io->b0f
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+
+  HloInstruction* conv = module->entry_computation()->root_instruction();
+
+  TargetMachineFeaturesWithFakeAlignmentLogic target_machine_with_no_alignment(
+      [](int64 size) { return 1; });
+
+  EXPECT_FALSE(PotentiallyImplementedAsEigenConvolution(
+      *conv, target_machine_with_no_alignment));
+
+  TargetMachineFeaturesWithFakeAlignmentLogic
+      target_machine_with_full_alignment([](int64 size) {
+        return TargetMachineFeatures::kEigenExpectedTensorAlignment;
+      });
+
+  EXPECT_TRUE(PotentiallyImplementedAsEigenConvolution(
+      *conv, target_machine_with_full_alignment));
+}
+}  // namespace
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 32613b869078305edda97c11ac250f67de32b805..cf43b74c699ca8cbbef11a0abbaf4d69476f5d77 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -73,7 +73,7 @@ CpuExecutable::CpuExecutable(
 
 Status CpuExecutable::AllocateBuffers(
     DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-    std::vector<se::DeviceMemoryBase>* buffers) {
+    std::vector<OwningDeviceMemory>* buffers) {
   CHECK_EQ(buffers->size(), assignment_->Allocations().size());
   VLOG(3) << "Allocating " << assignment_->Allocations().size()
           << " allocations for module " << module().name();
@@ -201,60 +201,18 @@ Status CpuExecutable::ExecuteComputeFunction(
   return Status::OK();
 }
 
-static void LogLiveAddresses(
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
-    const std::vector<bool>& buffers_in_result) {
-  if (!VLOG_IS_ON(3)) {
-    return;
-  }
-
-  CHECK_EQ(buffers.size(), buffers_in_result.size());
-  std::vector<const void*> live_out_buffers;
-  for (int i = 0; i < buffers.size(); ++i) {
-    if (buffers_in_result[i]) {
-      live_out_buffers.push_back(buffers[i].opaque());
-    }
-  }
-  VLOG(3) << "Live addresses in output marking found "
-          << live_out_buffers.size() << " addresses:\n"
-          << tensorflow::str_util::Join(
-                 live_out_buffers, ", ", [](string* out, const void* address) {
-                   tensorflow::strings::StrAppend(
-                       out, tensorflow::strings::Printf("%p", address));
-                 });
-}
-
-static Status DeallocateTempBuffers(
-    DeviceMemoryAllocator* allocator, se::Stream* stream,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
-    const std::vector<bool>& buffers_in_result) {
-  // Keep those buffers in the output of the marked live because they are needed
-  // by the service. They will be deallocated by the service.
-  for (size_t i = 0; i < buffers.size(); ++i) {
-    se::DeviceMemoryBase alloc = buffers[i];
-    if (!buffers_in_result[i] && !alloc.is_null()) {
-      VLOG(3) << "CpuExecutable deallocating buffer #" << i << " ["
-              << alloc.opaque() << "]";
-      TF_RETURN_IF_ERROR(
-          allocator->Deallocate(stream->parent()->device_ordinal(), &alloc));
-    }
-  }
-
-  return Status::OK();
-}
-
 StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
-    std::vector<bool>* buffers_in_result) {
+    tensorflow::gtl::MutableArraySlice<OwningDeviceMemory> buffers) {
   se::Stream* stream = run_options->stream();
   ScopedShapedBuffer result_buffer(
       /*on_host_shape=*/host_result_shape(),
       /*on_device_shape=*/host_result_shape(), run_options->allocator(),
       stream->parent()->device_ordinal());
 
-  // Copy DeviceMemoryBase values which contain the array(s) of the result into
-  // the respective location in ShapedBuffer which is returned to the caller.
+  // Move OwningDeviceMemory values which contain the array(s) of the result
+  // into the respective location in ScopedShapedBuffer which is returned to the
+  // caller.
   TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
       [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
         const auto& sources = this->GetRootPointsToSet().element(index);
@@ -273,10 +231,9 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
         CHECK(!slice.allocation()->is_entry_computation_parameter());
 
         const BufferAllocation::Index buffer_index = slice.index();
-        const se::DeviceMemoryBase& buffer = allocated_buffers[buffer_index];
+        OwningDeviceMemory& buffer = buffers[buffer_index];
         CHECK(!buffer.is_null() || buffer.size() == 0);
-        *device_memory = buffer;
-        (*buffers_in_result)[buffer_index] = true;
+        *device_memory = buffer.Forget();
         return Status::OK();
       }));
   return std::move(result_buffer);
@@ -292,23 +249,21 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteOnStream(
 
   se::Stream* stream = run_options->stream();
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-  std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
+  std::vector<OwningDeviceMemory> buffers(assignment_->Allocations().size());
 
   TF_RETURN_IF_ERROR(AllocateBuffers(
       memory_allocator, stream->parent()->device_ordinal(), &buffers));
-  TF_RETURN_IF_ERROR(ExecuteComputeFunction(
-      &run_options->run_options(), arguments, buffers, hlo_execution_profile));
 
-  std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
-  TF_ASSIGN_OR_RETURN(
-      ScopedShapedBuffer result_buffer,
-      CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
-
-  // Free all buffers not in the result.
-  TF_RETURN_IF_ERROR(DeallocateTempBuffers(memory_allocator, stream, buffers,
-                                           buffers_in_result));
+  std::vector<se::DeviceMemoryBase> unowning_buffers;
+  unowning_buffers.reserve(buffers.size());
+  for (auto& buffer : buffers) {
+    unowning_buffers.push_back(buffer.AsDeviceMemoryBase());
+  }
+  TF_RETURN_IF_ERROR(ExecuteComputeFunction(&run_options->run_options(),
+                                            arguments, unowning_buffers,
+                                            hlo_execution_profile));
 
-  return std::move(result_buffer);
+  return CreateResultShapedBuffer(run_options, &buffers);
 }
 
 StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
@@ -324,30 +279,53 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
       run_options->stream()->implementation());
   se::Stream* stream = run_options->stream();
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-  std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
-
+  std::vector<OwningDeviceMemory> buffers(assignment_->Allocations().size());
   TF_RETURN_IF_ERROR(AllocateBuffers(
       memory_allocator, stream->parent()->device_ordinal(), &buffers));
 
-  std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
-  TF_ASSIGN_OR_RETURN(
-      ScopedShapedBuffer result_buffer,
-      CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
-
-  LogLiveAddresses(buffers, buffers_in_result);
-
-  host_stream->EnqueueTask([this, run_options, arguments, buffers,
-                            buffers_in_result, memory_allocator, stream]() {
-    // Failing a CHECK here is not great, but I don't see an obvious way to
-    // return a failed Status asynchronously.
-    TF_CHECK_OK(ExecuteComputeFunction(&run_options->run_options(), arguments,
-                                       buffers,
-                                       /*hlo_execution_profile=*/nullptr));
-    TF_CHECK_OK(DeallocateTempBuffers(memory_allocator, stream, buffers,
-                                      buffers_in_result));
-  });
+  std::vector<se::DeviceMemoryBase> unowning_buffers;
+  unowning_buffers.reserve(buffers.size());
+  for (auto& buffer : buffers) {
+    unowning_buffers.push_back(buffer.AsDeviceMemoryBase());
+  }
+  TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
+                      CreateResultShapedBuffer(run_options, &buffers));
 
-  return std::move(result_buffer);
+  // At this point, `unowning_buffers` contains unowning pointers to all of our
+  // buffers, and `buffers` contains owning pointers to the non-live-out
+  // buffers.  Enqueue a task which keeps alive the non-live-out buffers.
+  //
+  // Logically we want this lambda to capture `buffers` by move, ultimately our
+  // functor needs to be wrapped in an std::function, and that requires its
+  // functor to be copyable.  Thus we perpitrate the hack of capturing buffers
+  // "by shared pointer".
+  //
+  // We also need to change the types of some of the variables we capture:
+  // run_options needs to change from a pointer to a value type, and arguments
+  // needs to change from an ArraySlice into a vector.  We use a struct instead
+  // of a lambda to make this explicit.
+  struct AsyncRunTask {
+    CpuExecutable* executable;
+    ServiceExecutableRunOptions run_options;
+    std::vector<const ShapedBuffer*> arguments;
+    std::vector<se::DeviceMemoryBase> unowning_buffers;
+    std::shared_ptr<std::vector<OwningDeviceMemory>> buffers;
+
+    void operator()() {
+      // Failing a CHECK here is not great, but I don't see an obvious way to
+      // return a failed Status asynchronously.
+      TF_CHECK_OK(executable->ExecuteComputeFunction(
+          &run_options.run_options(), arguments, unowning_buffers,
+          /*hlo_execution_profile=*/nullptr));
+    }
+  };
+  host_stream->EnqueueTask(AsyncRunTask{
+      this, *run_options,
+      std::vector<const ShapedBuffer*>(arguments.begin(), arguments.end()),
+      unowning_buffers,
+      std::make_shared<std::vector<OwningDeviceMemory>>(std::move(buffers))});
+
+  return std::move(result);
 }
 
 /*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 68ad38cba88720a04519fc2473fe6f9decbaaf93..8dd47bfb865e8a0552542f510d3365cff0d111e0 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -92,7 +92,7 @@ class CpuExecutable : public Executable {
   // buffer is assigned for this element.
   Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
                          int device_ordinal,
-                         std::vector<se::DeviceMemoryBase>* buffers);
+                         std::vector<OwningDeviceMemory>* buffers);
 
   // Calls the generated function performing the computation with the given
   // arguments using the supplied buffers.
@@ -102,16 +102,12 @@ class CpuExecutable : public Executable {
       tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
       HloExecutionProfile* hlo_execution_profile);
 
-  // Creates a ScopedShapedBuffer for holding the result of the computation. The
-  // addresses (DeviceMemoryBases) are set according to buffer assignment.
-  // 'buffers_in_result' should point to a vector of the same size as
-  // 'allocated_buffers'. An element in buffers_in_result is set to true if the
-  // corresponding buffer is live out of the computation (and thus contained in
-  // the returned ShapedBuffer).
+  // Creates a ScopedShapedBuffer for holding the result of the computation,
+  // moving buffers out of allocated_buffers and into the result as appropriate.
+  // The addresses are set according to buffer assignment.
   StatusOr<ScopedShapedBuffer> CreateResultShapedBuffer(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
-      std::vector<bool>* buffers_in_result);
+      tensorflow::gtl::MutableArraySlice<OwningDeviceMemory> buffers);
 
   // Returns the points-to set of the root instruction of the entry
   // computation. Uses points-to analysis from buffer assignment.
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index a98e85a151ffb77e6682b82164603481265283c4..46fe060817b0264d90574b45a94cf1f6e5964593 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -158,37 +158,95 @@ TEST_F(InstructionFusionTest, DotOperationFusion_ElementReuse) {
   EXPECT_EQ(dot, computation->root_instruction());
 }
 
-TEST_F(InstructionFusionTest, DotOperationFusion_TransposeFusion) {
-  HloComputation::Builder builder(TestName());
-  HloInstruction* arg0 = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {1, 256}), "arg0"));
-  HloInstruction* arg1 = builder.AddInstruction(HloInstruction::CreateParameter(
-      1, ShapeUtil::MakeShape(F32, {1024, 256}), "arg1"));
+TEST_F(InstructionFusionTest, DotOperationFusion_TransposeFusion_RHS) {
+  string hlo_string = R"(
+HloModule DotOperationFusion_TransposeFusion
 
-  HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
-      ShapeUtil::MakeShape(S32, {1024, 256}), HloOpcode::kExp, arg1));
-  HloInstruction* transpose1 =
-      builder.AddInstruction(HloInstruction::CreateTranspose(
-          ShapeUtil::MakeShape(S32, {256, 1024}), exp1, {1, 0}));
-  builder.AddInstruction(
-      MakeDot(ShapeUtil::MakeShape(F32, {1, 1024}), arg0, transpose1));
+ENTRY DotOperationFusion_TransposeFusion {
+  arg0 = f32[1,256] parameter(0)
+  arg1 = f32[1024,256] parameter(1)
+  exponential = s32[1024,256] exponential(arg1)
+  transpose = s32[256,1024] transpose(exponential), dimensions={1,0}
+  ROOT dot = f32[1,1024] dot(arg0, transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+  HloComputation* computation = module->entry_computation();
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
   TransposeFolding transpose_folding(
       [](const HloInstruction& dot,
          const TransposeFolding::OperandIndices& candidate_operands) {
         return candidate_operands;
       },
       TransposeFolding::NeverFoldTranspose);
-  EXPECT_TRUE(transpose_folding.Run(module.get()).ValueOrDie());
-  EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kFusion);
-  EXPECT_EQ(computation->root_instruction()->fusion_kind(),
-            HloInstruction::FusionKind::kTransposeDot);
-  EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
-  EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kFusion);
-  EXPECT_EQ(computation->root_instruction()->fusion_kind(),
-            HloInstruction::FusionKind::kTransposeDot);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, transpose_folding.Run(module.get()));
+  ASSERT_TRUE(changed);
+  ASSERT_THAT(computation->root_instruction(),
+              op::Dot(op::Parameter(0), op::Exp(op::Parameter(1)),
+                      /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/1));
+}
+
+TEST_F(InstructionFusionTest, DotOperationFusion_TransposeFusion_LHS) {
+  string hlo_string = R"(
+HloModule DotOperationFusion_TransposeFusion
+
+ENTRY DotOperationFusion_TransposeFusion {
+  arg0 = f32[256,1] parameter(0)
+  arg1 = f32[256,1024] parameter(1)
+  transpose = s32[1,256] transpose(arg0), dimensions={1,0}
+  exponential = s32[256,1024] exponential(arg1)
+  ROOT dot = f32[1,1024] dot(transpose, exponential), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+  HloComputation* computation = module->entry_computation();
+
+  TransposeFolding transpose_folding(
+      [](const HloInstruction& dot,
+         const TransposeFolding::OperandIndices& candidate_operands) {
+        return candidate_operands;
+      },
+      TransposeFolding::NeverFoldTranspose);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, transpose_folding.Run(module.get()));
+  ASSERT_TRUE(changed);
+  ASSERT_THAT(computation->root_instruction(),
+              op::Dot(op::Parameter(0), op::Exp(op::Parameter(1)),
+                      /*lhs_contracting_dim=*/0, /*rhs_contracting_dim=*/0));
+}
+
+TEST_F(InstructionFusionTest,
+       DotOperationFusion_TransposeFusion_LHS_NonDefault) {
+  string hlo_string = R"(
+HloModule DotOperationFusion_TransposeFusion
+
+ENTRY DotOperationFusion_TransposeFusion {
+  arg0 = f32[1,256] parameter(0)
+  arg1 = f32[256,1024] parameter(1)
+  transpose = s32[256,1] transpose(arg0), dimensions={1,0}
+  exponential = s32[256,1024] exponential(arg1)
+  ROOT dot = f32[1,1024] dot(transpose, exponential), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+  HloComputation* computation = module->entry_computation();
+
+  TransposeFolding transpose_folding(
+      [](const HloInstruction& dot,
+         const TransposeFolding::OperandIndices& candidate_operands) {
+        return candidate_operands;
+      },
+      TransposeFolding::NeverFoldTranspose);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, transpose_folding.Run(module.get()));
+  ASSERT_TRUE(changed);
+  ASSERT_THAT(computation->root_instruction(),
+              op::Dot(op::Parameter(0), op::Exp(op::Parameter(1)),
+                      /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/0));
 }
 
 class OpcodeFusionTest : public InstructionFusionTest {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
index e8117377e61a4e21b8c45b929c518a18878fcb60..aa872d5ec9e7593b8d2f731421c17af590729529 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
@@ -100,7 +100,8 @@ Status CpuLayoutAssignment::AddBackendConstraints(
   const HloComputation* computation = constraints->computation();
   for (auto* instruction : computation->instructions()) {
     if (instruction->opcode() == HloOpcode::kConvolution &&
-        PotentiallyImplementedAsEigenConvolution(*instruction)) {
+        PotentiallyImplementedAsEigenConvolution(*instruction,
+                                                 target_machine_features_)) {
       const HloInstruction* convolution = instruction;
       const HloInstruction* lhs_instruction = convolution->operand(0);
       const HloInstruction* rhs_instruction = convolution->operand(1);
@@ -126,7 +127,8 @@ Status CpuLayoutAssignment::AddBackendConstraints(
       const HloInstruction* op = instruction->operand(*op_idx);
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
           ColMajorShape(op->shape()), instruction, *op_idx));
-    } else if (PotentiallyImplementedAsEigenDot(*instruction)) {
+    } else if (PotentiallyImplementedAsEigenDot(*instruction,
+                                                target_machine_features_)) {
       const HloInstruction* dot = instruction;
       // In order to implement `dot` with Eigen dot, the layouts of the lhs,
       // rhs, and output need to be row-major.
@@ -139,13 +141,9 @@ Status CpuLayoutAssignment::AddBackendConstraints(
       Shape lhs_shape(RowMajorShape(lhs_instruction->shape()));
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(lhs_shape, dot, 0));
 
-      // dot is a kDot or a kTransposeDot fusion node.  In the latter case, if
-      // it represents X @ X, it may have just one operand.
-      if (dot->operand_count() > 1) {
-        const HloInstruction* rhs_instruction = dot->operand(1);
-        Shape rhs_shape(RowMajorShape(rhs_instruction->shape()));
-        TF_RETURN_IF_ERROR(constraints->SetOperandLayout(rhs_shape, dot, 1));
-      }
+      const HloInstruction* rhs_instruction = dot->operand(1);
+      Shape rhs_shape(RowMajorShape(rhs_instruction->shape()));
+      TF_RETURN_IF_ERROR(constraints->SetOperandLayout(rhs_shape, dot, 1));
 
       // Set layouts of the instructions' shapes.
       TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(output_shape, dot));
@@ -181,7 +179,7 @@ Status CpuLayoutAssignment::AddBackendConstraints(
       }
     }
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
index 09adb5cb02abba5844a1740bdb50a578e1bdf8b5..3c4fe68b830d9602f009b318d4e51e9a04a27e09 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_LAYOUT_ASSIGNMENT_H_
 
 #include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -28,12 +29,16 @@ namespace cpu {
 class CpuLayoutAssignment : public LayoutAssignment {
  public:
   explicit CpuLayoutAssignment(
-      const ComputationLayout& entry_computation_layout)
-      : LayoutAssignment(entry_computation_layout) {}
+      ComputationLayout* entry_computation_layout,
+      const TargetMachineFeatures* target_machine_features)
+      : LayoutAssignment(entry_computation_layout),
+        target_machine_features_(*target_machine_features) {}
   ~CpuLayoutAssignment() override {}
 
  protected:
   Status AddBackendConstraints(LayoutConstraints* constraints) override;
+
+  const TargetMachineFeatures& target_machine_features_;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
index ba4c5a23d3e043fd6680c2f9abc2275696737ee7..429fc7b78608da0e9cd794ac294851b326f5be24 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
@@ -49,7 +50,12 @@ class CpuLayoutAssignmentTest : public HloTestBase {
  protected:
   void AssignLayouts(HloModule* module,
                      ComputationLayout* entry_computation_layout) {
-    cpu::CpuLayoutAssignment layout_assignment(*entry_computation_layout);
+    cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features(
+        [](int64 shape_size) {
+          return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
+        });
+    cpu::CpuLayoutAssignment layout_assignment(entry_computation_layout,
+                                               &target_machine_features);
     EXPECT_IS_OK(layout_assignment.Run(module).status());
   }
 };
@@ -311,7 +317,12 @@ static StatusOr<DotOutputFusionLayoutAssignmentResult> RunDotOutputFusion(
   result.addend_fusion_param = fusion_instruction->operand(
       fused_add->operand(1 - dot_operand_idx_in_add)->parameter_number());
 
-  cpu::CpuLayoutAssignment layout_assignment(computation_layout);
+  cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features(
+      [](int64 shape_size) {
+        return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
+      });
+  cpu::CpuLayoutAssignment layout_assignment(&computation_layout,
+                                             &target_machine_features);
   TF_ASSIGN_OR_RETURN(result.layout_assignment_changed_something,
                       layout_assignment.Run(module));
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index f9c51f243c47b8069500eca3c9c2929b17f04e62..e75fcb6bc9719f7453d5f0cb52d1673cef1fd3df 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -22,6 +22,8 @@ namespace {
 const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
 const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce";
 const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
+const char* const kXlaEnableExperimentalLlvmIrGemm =
+    "xla_enable_experimental_llvm_ir_gemm";
 
 }  // namespace
 
@@ -54,6 +56,12 @@ tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
   return tensorflow::gtl::nullopt;
 }
 
+bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config) {
+  const auto& extra_options_map =
+      config.debug_options().xla_backend_extra_options();
+  return extra_options_map.count(kXlaEnableExperimentalLlvmIrGemm) > 0;
+}
+
 }  // namespace options
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h
index be62ff3cc1af23408ca8a00f1372e7a998f160c6..106dfbbc62dfba8d3de74e0a2ae3bb247bd91d67 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h
@@ -26,6 +26,7 @@ namespace options {
 
 bool OptimizeForSizeRequested(const HloModuleConfig& config);
 bool VectorizedReduceDisabled(const HloModuleConfig& config);
+bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config);
 tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
     const HloModuleConfig& config);
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 215405f6802cf1956ebec011da2fcd11b95c0c64..54c52bc08f9c53b8c6898689b18c4cb7f4bdcfd0 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -51,6 +51,8 @@ extern const char* const kEigenConvF16SymbolName =
 extern const char* const kEigenConvF32SymbolName =
     "__xla_cpu_runtime_EigenConvF32";
 extern const char* const kEigenFftSymbolName = "__xla_cpu_runtime_EigenFft";
+extern const char* const kEigenSingleThreadedFftSymbolName =
+    "__xla_cpu_runtime_EigenSingleThreadedFft";
 extern const char* const kEigenSingleThreadedMatMulF16SymbolName =
     "__xla_cpu_runtime_EigenSingleThreadedMatMulF16";
 extern const char* const kEigenSingleThreadedMatMulF32SymbolName =
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index 1dce6efa5cd65e67ae73a2e2affe2d2d3c537508..aa0e96712302e806a389c6ad05a2c1b6634ef901 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -52,6 +52,7 @@ extern const char* const kMKLSingleThreadedMatMulF64SymbolName;
 extern const char* const kEigenConvF16SymbolName;
 extern const char* const kEigenConvF32SymbolName;
 extern const char* const kEigenFftSymbolName;
+extern const char* const kEigenSingleThreadedFftSymbolName;
 extern const char* const kEigenSingleThreadedMatMulF16SymbolName;
 extern const char* const kEigenSingleThreadedMatMulF32SymbolName;
 extern const char* const kEigenSingleThreadedMatMulF64SymbolName;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index 9b39e7f5765ae5eb6a25c06eef4d74b1c00e5c91..d97802ee45d6add3c466577d7624d9ca74e2f380 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -88,8 +88,8 @@ CpuTransferManager::CpuTransferManager()
     : GenericTransferManager(se::host::kHostPlatformId,
                              /*pointer_size=*/sizeof(void*)) {}
 
-Status CpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
-                                                   const Literal& literal) {
+Status CpuTransferManager::TransferLiteralToInfeed(
+    se::StreamExecutor* executor, const LiteralSlice& literal) {
   const Shape& shape = literal.shape();
   VLOG(2) << "Transferring literal to infeed with shape: "
           << ShapeUtil::HumanString(shape);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
index 3ecb0d236498371f48caf63249f9cd4e8777752b..6dfc666f09dfa6df740cd54bea0957e3144181bc 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
@@ -38,7 +38,7 @@ class CpuTransferManager : public GenericTransferManager {
   ~CpuTransferManager() override {}
 
   Status TransferLiteralToInfeed(se::StreamExecutor* executor,
-                                 const Literal& literal) override;
+                                 const LiteralSlice& literal) override;
   Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
                                 const void* source) override;
   Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 801c5239081d174ba6278d54009e790863f3bcb9..af69fc3da9869aa2df958ecc5c064ee37dd9ea21 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
 #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -520,18 +521,259 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
   }
 }
 
+// This class implements a tiled matrix multiplication algorithm, intended for
+// use as the innermost GEBP loop in a GEMM kernel (GEBP is described in "Goto,
+// Kazushige, and Robert Van De Geijn. "High-performance implementation of the
+// level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1 (2008):
+// 4).
+//
+// This only supports canonical dot operations (i.e. where the lhs contraction
+// dimension is 1 and the rhs contraction dimension is 0) over row major
+// matrices.
+class MatrixMatrixBlockPanelEmitter {
+ public:
+  // Describe the dimensions of the GEBP kernel.  These will usually not be the
+  // dimensions of the GEMM itself, the GEMM will usually be broken up into GEBP
+  // kernels with smaller dimensions.
+  class Dimensions {
+   public:
+    explicit Dimensions(int64 m, int64 k, int64 n) : m_(m), k_(k), n_(n) {}
+
+    int64 m() const { return m_; }
+    int64 k() const { return k_; }
+    int64 n() const { return n_; }
+
+   private:
+    const int64 m_;
+    const int64 k_;
+    const int64 n_;
+  };
+
+  // Creates an instance of MatrixMatrixBlockPanelEmitter that matrix-multiplies
+  // `lhs` with `rhs` and stores the result in `result`.
+  //
+  // `m`, `k` and `n` are the matrix multiplication dimensions.
+  //
+  // `max_vectorization_width` is the maximum vector width (i.e. the width of
+  // the largest vector register we will use).  This can be larger than the
+  // largest vector register supported by the machine -- LLVM will legalize
+  // these large vector widths into legally sized vectors.
+  // `min_vectorization_width` is the smallest vector width the emitter will use
+  // -- below that it will devolve to using a scalar loop.
+  //
+  // `k_tiling_factor` is the number of elements along the reduction dimensions
+  // that we will attempt to process at once.
+  explicit MatrixMatrixBlockPanelEmitter(
+      llvm::Value* lhs, llvm::Value* rhs, llvm::Value* result, Dimensions dims,
+      int max_vectorization_width, int min_vectorization_width,
+      int k_tiling_factor, const TargetMachineFeatures& target_machine_features,
+      llvm::IRBuilder<>* ir_builder, PrimitiveType primitive_type)
+      : lhs_(lhs),
+        rhs_(rhs),
+        result_(result),
+        dims_(dims),
+        max_vectorization_width_(max_vectorization_width),
+        min_vectorization_width_(min_vectorization_width),
+        k_tiling_factor_(k_tiling_factor),
+        target_machine_features_(target_machine_features),
+        ir_builder_(ir_builder),
+        primitive_type_(primitive_type),
+        ksl_(ir_builder_) {
+    CHECK(max_vectorization_width > 0 &&
+          IsPowerOfTwo(static_cast<uint64>(max_vectorization_width)));
+    CHECK(min_vectorization_width > 0 &&
+          IsPowerOfTwo(static_cast<uint64>(min_vectorization_width)));
+    CHECK_GT(k_tiling_factor, 0);
+  }
+
+  void Emit();
+
+ private:
+  // We can only iterate the `n` dimension for an extent that is divisible by
+  // the vectorization width.  So we emit an outer loop that first processes the
+  // largest extent in `n` that is divisible by max_vectorization_width, then
+  // the largest remaining extent that is divisible by max_vectorization_width /
+  // 2 etc.  This function emits that outermost loop.
+  void EmitChunkedLoopOverN();
+
+  // This emits a loop that loops over the `k` dimension in multiples of
+  // `k_tiling_factor` as much as possible and then emits a remainder epilogue.
+  void EmitLoopOverK(VectorSupportLibrary* vsl, llvm::Value* n_start,
+                     llvm::Value* n_end);
+
+  // This emits the inner reduction loop.  This inner reduction loop processes
+  // all indices in the `m` dimension, [`k_start`, `k_end`) in the k dimension
+  // and [`n_start`, `n_end`) in the `n` dimension.
+  void EmitInnerLoop(int64 k_tiling_factor, llvm::Value* k_start,
+                     llvm::Value* k_end, llvm::Value* n_start,
+                     llvm::Value* n_end, VectorSupportLibrary* vsl);
+
+  llvm::Value* getInt64(int64 value) { return ir_builder_->getInt64(value); }
+
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* result_;
+  Dimensions dims_;
+
+  int64 max_vectorization_width_;
+  int64 min_vectorization_width_;
+  int64 k_tiling_factor_;
+
+  const TargetMachineFeatures& target_machine_features_;
+  llvm::IRBuilder<>* ir_builder_;
+  PrimitiveType primitive_type_;
+  KernelSupportLibrary ksl_;
+};
+
+void MatrixMatrixBlockPanelEmitter::Emit() { EmitChunkedLoopOverN(); }
+
+void MatrixMatrixBlockPanelEmitter::EmitChunkedLoopOverN() {
+  int64 current_vectorization_width = max_vectorization_width_;
+  int64 n_start = 0;
+  while (n_start != dims_.n() &&
+         current_vectorization_width >= min_vectorization_width_) {
+    int64 n_end = dims_.n() - (dims_.n() % current_vectorization_width);
+    if (n_start != n_end) {
+      VectorSupportLibrary vsl(primitive_type_, current_vectorization_width,
+                               ir_builder_, "gebp");
+      EmitLoopOverK(&vsl, getInt64(n_start), getInt64(n_end));
+      n_start = n_end;
+    }
+    current_vectorization_width /= 2;
+  }
+
+  if (n_start != dims_.n()) {
+    VectorSupportLibrary vsl(primitive_type_, 1, ir_builder_, "gebp");
+    ksl_.For("epi.n", n_start, dims_.n(), 1, [&](llvm::Value* n_i) {
+      llvm::Value* n_i_next =
+          ir_builder_->CreateAdd(n_i, ir_builder_->getInt64(1));
+      EmitLoopOverK(&vsl, n_i, n_i_next);
+    });
+  }
+}
+
+void MatrixMatrixBlockPanelEmitter::EmitLoopOverK(VectorSupportLibrary* vsl,
+                                                  llvm::Value* n_start,
+                                                  llvm::Value* n_end) {
+  int64 k_start = 0;
+  int64 k_end = dims_.k() - (dims_.k() % k_tiling_factor_);
+  if (k_end != k_start) {
+    EmitInnerLoop(k_tiling_factor_, getInt64(k_start), getInt64(k_end), n_start,
+                  n_end, vsl);
+    k_start = k_end;
+  }
+
+  if (k_start != dims_.k()) {
+    EmitInnerLoop(dims_.k() - k_start, getInt64(k_start), getInt64(dims_.k()),
+                  n_start, n_end, vsl);
+  }
+}
+
+// The tiling scheme is as follows:
+//
+// Let the LHS be:
+//
+//   +---+---+---+
+//   | a | b | c | .
+//   +---+---+---+ .
+//   |   |   |   | .
+//   +---+---+---+
+//     ..     ..
+//
+// and the RHS be:
+//
+//   +----+----+----+----+
+//   | p0 | p1 | p2 | p3 | .
+//   +----+----+----+----+ .
+//   | q0 | q1 | q2 | q3 | .
+//   +----+----+----+----+
+//   | r0 | r1 | r2 | r3 | .
+//   +----+----+----+----+ .
+//     ......    ......
+//
+// and let k_tiling_factor be 3 and the vector width (implicitly denoted by
+// `vsl`) be 4.
+//
+// Then we
+//
+//  1. broadcast the first row in LHS to 3 vectors of width 4
+//  2. elementwise multiply the RHS rows with these broadcasted vectors
+//  3. elementwise add them:
+//
+//   +---+---+---+---+   +----+----+----+----+
+//   | a | a | a | a | * | p0 | p1 | p2 | p3 |   +
+//   +---+---+---+---+   +----+----+----+----+
+//
+//   +---+---+---+---+   +----+----+----+----+
+//   | b | b | b | b | * | q0 | q1 | q2 | q3 |   +
+//   +---+---+---+---+   +----+----+----+----+
+//
+//   +---+---+---+---+   +----+----+----+----+
+//   | c | c | c | c | * | r0 | r1 | r2 | r3 |
+//   +---+---+---+---+   +----+----+----+----+
+//
+// to get:
+//
+//   +----------------+----------------+----------------+----------------+
+//   | a*p0+b*q0+c*r0 | a*p1+b*q1+c*r1 | a*p2+b*q2+c*r2 | a*p3+b*q3+c*r3 |
+//   +----------------+----------------+----------------+----------------+
+//
+// which we increment into the appropriate region in the result.
+void MatrixMatrixBlockPanelEmitter::EmitInnerLoop(
+    int64 k_tiling_factor, llvm::Value* k_start, llvm::Value* k_end,
+    llvm::Value* n_start, llvm::Value* n_end, VectorSupportLibrary* vsl) {
+  ksl_.For("dot.m", 0, dims_.m(), 1, [&](llvm::Value* m_i) {
+    // This outer loop iterates over all of the M dimension
+    llvm::Value* result_row_begin = vsl->ComputeOffsetPointer(
+        result_, /*offset_elements=*/m_i, /*scale=*/dims_.n());
+    llvm::Value* lhs_row_begin = vsl->ComputeOffsetPointer(
+        lhs_, /*offset_elements=*/m_i, /*scale=*/dims_.k());
+
+    ksl_.For("dot.k", k_start, k_end, k_tiling_factor, [&](llvm::Value* k_i) {
+      // broadcasted_a is the broadcasted set of vectors denoted as <a,a,a,a>,
+      // <b,b,b,b> etc. in the diagram.
+      std::vector<llvm::Value*> broadcasted_a;
+      broadcasted_a.reserve(k_tiling_factor);
+      for (int i = 0; i < k_tiling_factor; i++) {
+        broadcasted_a.push_back(vsl->LoadBroadcast(
+            lhs_row_begin, ir_builder_->CreateAdd(getInt64(i), k_i)));
+      }
+
+      // rhs_loader will be used to load the tile off of the RHS, denoted as
+      // <<p0,p1,p2,p3>,<q0,q1,q2,q3> ...> in the diagram.
+      TileLoader rhs_loader(vsl, ir_builder_, rhs_, dims_.n(), k_i,
+                            k_tiling_factor);
+      ksl_.For(
+          "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
+            // This loop iterates over the N dimension.  It loads the tile from
+            // RHS, does the FMA resulting in the
+            // <a*p0+b*q0+c*r0,a*p1+b*q1+c*r1,...> in the diagram and increments
+            // the result.
+            std::vector<llvm::Value*> tile = rhs_loader.LoadTile(n_i);
+            llvm::Value* result_accumulator =
+                vsl->LoadVector(result_row_begin, n_i);
+            for (int i = 0; i < tile.size(); i++) {
+              result_accumulator =
+                  vsl->MulAdd(tile[i], broadcasted_a[i], result_accumulator);
+            }
+            vsl->StoreVector(result_accumulator, result_row_begin, n_i);
+          });
+    });
+  });
+}
+
 }  // namespace
 
-DotOpEmitter::DotOpEmitter(
-    const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs,
-    const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array,
-    const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array,
-    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
-    const HloModuleConfig& hlo_module_config,
-    const TargetMachineFeatures& target_machine_features)
+DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
+                           const llvm_ir::IrArray& target_array,
+                           const llvm_ir::IrArray& lhs_array,
+                           const llvm_ir::IrArray& rhs_array,
+                           const llvm_ir::IrArray* addend_array,
+                           llvm::Value* executable_run_options_value,
+                           llvm::IRBuilder<>* ir_builder,
+                           const HloModuleConfig& hlo_module_config,
+                           const TargetMachineFeatures& target_machine_features)
     : dot_(dot),
-      transpose_lhs_(transpose_lhs),
-      transpose_rhs_(transpose_rhs),
       target_array_(target_array),
       lhs_array_(lhs_array),
       rhs_array_(rhs_array),
@@ -541,22 +783,80 @@ DotOpEmitter::DotOpEmitter(
       hlo_module_config_(hlo_module_config),
       target_machine_features_(target_machine_features) {}
 
-/* static */ tensorflow::Status DotOpEmitter::EmitDotOperation(
-    const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs,
-    const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array,
-    const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array,
+/* static */ Status DotOpEmitter::EmitDotOperation(
+    const HloInstruction& dot, const llvm_ir::IrArray& target_array,
+    const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
+    const llvm_ir::IrArray* addend_array,
     llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
     const HloModuleConfig& hlo_module_config,
     const TargetMachineFeatures& target_machine_features) {
   PrimitiveType type = target_array.GetShape().element_type();
   TF_RET_CHECK(F16 == type || F32 == type || F64 == type || C64 == type);
-  DotOpEmitter dot_emitter(dot, transpose_lhs, transpose_rhs, target_array,
-                           lhs_array, rhs_array, addend_array,
-                           executable_run_options_value, ir_builder,
-                           hlo_module_config, target_machine_features);
+  DotOpEmitter dot_emitter(dot, target_array, lhs_array, rhs_array,
+                           addend_array, executable_run_options_value,
+                           ir_builder, hlo_module_config,
+                           target_machine_features);
   return dot_emitter.Emit();
 }
 
+bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled(
+    const DotOpEmitter::MatMultDims& mat_mult_dims) {
+  if (!EnableExperimentalLlvmIrGemm() || ShouldUseMultiThreadedEigen()) {
+    return false;
+  }
+
+  if (mat_mult_dims.lhs_non_canonical || mat_mult_dims.rhs_non_canonical) {
+    return false;
+  }
+
+  PrimitiveType primitive_type = dot_.shape().element_type();
+
+  switch (primitive_type) {
+    default:
+      return false;
+
+    case F32:
+    case F64:
+    case S32:
+    case S64:
+      break;
+  }
+
+  if (!(mat_mult_dims.lhs_column_major == mat_mult_dims.rhs_column_major &&
+        mat_mult_dims.rhs_column_major == mat_mult_dims.target_column_major)) {
+    return false;
+  }
+
+  VLOG(2) << "Emitting GEBP kernel in LLVM IR";
+
+  llvm::Value* lhs = lhs_array_.GetBasePointer();
+  llvm::Value* rhs = rhs_array_.GetBasePointer();
+  llvm::Value* target = target_array_.GetBasePointer();
+  int64 m = mat_mult_dims.m;
+  int64 k = mat_mult_dims.k;
+  int64 n = mat_mult_dims.n;
+
+  if (mat_mult_dims.lhs_column_major) {
+    std::swap(lhs, rhs);
+    std::swap(m, n);
+  }
+
+  int64 size_bytes = m * n * ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
+  ir_builder_->CreateMemSet(
+      target, ir_builder_->getInt8(0), size_bytes,
+      target_machine_features_.minimum_alignment_for_allocation(size_bytes));
+
+  MatrixMatrixBlockPanelEmitter::Dimensions gebp_dims(/*m=*/m, /*k=*/k,
+                                                      /*n=*/n);
+  MatrixMatrixBlockPanelEmitter gebp_emitter(
+      /*lhs=*/lhs, /*rhs=*/rhs, /*result=*/target, gebp_dims,
+      /*max_vectorization_width=*/8, /*min_vectorization_width=*/4,
+      /*k_tiling_factor=*/8, target_machine_features_, ir_builder_,
+      primitive_type);
+  gebp_emitter.Emit();
+  return true;
+}
+
 bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
   if (dot_.shape().dimensions_size() != 2) {
     return false;
@@ -578,7 +878,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
 
   if (mat_mult_dims.m == 1) {
     bool rhs_effectively_row_major =
-        transpose_rhs_ ^ !mat_mult_dims.rhs_column_major;
+        mat_mult_dims.rhs_non_canonical ^ !mat_mult_dims.rhs_column_major;
     if (rhs_effectively_row_major) {
       k = mat_mult_dims.k;
       m = mat_mult_dims.n;
@@ -594,7 +894,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
 
   if (mat_mult_dims.n == 1) {
     bool lhs_effectively_column_major =
-        transpose_lhs_ ^ mat_mult_dims.lhs_column_major;
+        mat_mult_dims.lhs_non_canonical ^ mat_mult_dims.lhs_column_major;
     if (lhs_effectively_column_major) {
       m = mat_mult_dims.m;
       k = mat_mult_dims.k;
@@ -609,7 +909,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
   }
 
   if (!is_column_major_matrix_vector && !is_row_major_matrix_vector) {
-    return false;
+    return EmitExperimentalGebpDotIfEnabled(mat_mult_dims);
   }
 
   int64 tiling_factor = GetGemvTilingFactor();
@@ -690,7 +990,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
   return true;
 }
 
-tensorflow::Status DotOpEmitter::Emit() {
+Status DotOpEmitter::Emit() {
   // The dot operation performs a sum of products over dimension 0 of the left
   // hand side operand and dimension 1 of the right hand side operand.
   //
@@ -734,23 +1034,17 @@ tensorflow::Status DotOpEmitter::Emit() {
 
   CHECK_EQ(addend_array_, nullptr);
 
-  if (PotentiallyImplementedAsEigenDot(dot_)) {
+  if (PotentiallyImplementedAsEigenDot(dot_, target_machine_features_)) {
     return EmitCallToRuntime();
   }
 
   // Reduce along dimension 0 of the LHS and 1 of the RHS. Vectors are a special
   // case where the reduction dimension is 0 for both LHS and RHS. This results
   // in a vector dot product producing a scalar.
-  int64 lhs_reduction_dimension = 0;
-  if (ShapeUtil::Rank(lhs_shape) >= 2) {
-    lhs_reduction_dimension =
-        ShapeUtil::GetDimensionNumber(lhs_shape, transpose_lhs_ ? -2 : -1);
-  }
-  int64 rhs_reduction_dimension = 0;
-  if (ShapeUtil::Rank(rhs_shape) >= 2) {
-    rhs_reduction_dimension =
-        ShapeUtil::GetDimensionNumber(rhs_shape, transpose_rhs_ ? -1 : -2);
-  }
+  int64 lhs_reduction_dimension =
+      dot_.dot_dimension_numbers().lhs_contracting_dimensions(0);
+  int64 rhs_reduction_dimension =
+      dot_.dot_dimension_numbers().rhs_contracting_dimensions(0);
 
   // Verify the reduction dimension in the two operands are the same size.
   TF_RET_CHECK(lhs_shape.dimensions(lhs_reduction_dimension) ==
@@ -874,10 +1168,10 @@ tensorflow::Status DotOpEmitter::Emit() {
   // loop.
   ir_builder_->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status DotOpEmitter::EmitScalarDot() {
+Status DotOpEmitter::EmitScalarDot() {
   // A scalar dot is just a scalar multiply.
   llvm::Value* result;
   llvm::Value* lhs_value =
@@ -902,10 +1196,10 @@ tensorflow::Status DotOpEmitter::EmitScalarDot() {
     result = ir_builder_->CreateFMul(lhs_value, rhs_value);
   }
   target_array_.EmitWriteArrayElement(/*index=*/{}, result, ir_builder_);
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
+Status DotOpEmitter::EmitCallToRuntime() {
   // The signature of the Eigen runtime matmul function is:
   //
   //   (void)(void* run_options, float* out, float* lhs, float* rhs,
@@ -914,8 +1208,7 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  bool multi_threaded =
-      hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
+  bool multi_threaded = ShouldUseMultiThreadedEigen();
   bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
@@ -986,8 +1279,8 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
 
   const llvm_ir::IrArray* lhs = &lhs_array_;
   const llvm_ir::IrArray* rhs = &rhs_array_;
-  bool transpose_lhs = transpose_lhs_;
-  bool transpose_rhs = transpose_rhs_;
+  bool transpose_lhs = mat_mult_dims.lhs_non_canonical;
+  bool transpose_rhs = mat_mult_dims.rhs_non_canonical;
 
   if (!mat_mult_dims.lhs_column_major) {
     std::swap(mat_mult_dims.m, mat_mult_dims.n);
@@ -1007,7 +1300,7 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
        ir_builder_->getInt64(mat_mult_dims.k),
        ir_builder_->getInt32(transpose_lhs),
        ir_builder_->getInt32(transpose_rhs)});
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
@@ -1015,12 +1308,18 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
 
   const Shape& lhs_shape = lhs_array_.GetShape();
   const Shape& rhs_shape = rhs_array_.GetShape();
-
-  return {lhs_shape.dimensions(transpose_lhs_ ? 1 : 0),
-          lhs_shape.dimensions(transpose_lhs_ ? 0 : 1),
-          rhs_shape.dimensions(transpose_rhs_ ? 0 : 1),
-          LayoutUtil::Minor(lhs_shape.layout(), 0) == 0,
-          LayoutUtil::Minor(rhs_shape.layout(), 0) == 0};
+  const DotDimensionNumbers& dim_nums = dot_.dot_dimension_numbers();
+
+  return {
+      /*m=*/lhs_shape.dimensions(1 - dim_nums.lhs_contracting_dimensions(0)),
+      /*k=*/lhs_shape.dimensions(dim_nums.lhs_contracting_dimensions(0)),
+      /*n=*/rhs_shape.dimensions(1 - dim_nums.rhs_contracting_dimensions(0)),
+      /*lhs_column_major=*/LayoutUtil::Minor(lhs_shape.layout(), 0) == 0,
+      /*lhs_non_canonical=*/dim_nums.lhs_contracting_dimensions(0) == 0,
+      /*rhs_column_major=*/LayoutUtil::Minor(rhs_shape.layout(), 0) == 0,
+      /*rhs_non_canonical=*/dim_nums.rhs_contracting_dimensions(0) == 1,
+      /*target_column_major=*/
+      LayoutUtil::Minor(target_array_.GetShape().layout(), 0) == 0};
 }
 
 llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest(
@@ -1060,19 +1359,39 @@ static bool IsRank2WithNoPadding(const Shape& shape) {
 
 // In a gemm operation where output = lhs * rhs, check whether the given shapes
 // are valid for the operation.
-static bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
-                               const Shape& output_shape) {
+static bool AreValidGemmShapes(
+    const Shape& lhs_shape, const Shape& rhs_shape, const Shape& output_shape,
+    const TargetMachineFeatures& target_machine_features) {
   // The inputs and the output must
   // 1) be matrices with no padding, and
   // 2) have an allowed element type.
   PrimitiveType output_primitive_type = output_shape.element_type();
-  return (output_primitive_type == F64 || output_primitive_type == F32 ||
-          output_primitive_type == F16) &&
-         IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) &&
-         IsRank2WithNoPadding(output_shape);
+  if (!(output_primitive_type == F64 || output_primitive_type == F32 ||
+        output_primitive_type == F16)) {
+    return false;
+  }
+
+  if (!(IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) &&
+        IsRank2WithNoPadding(output_shape))) {
+    return false;
+  }
+
+  auto is_aligned = [&](const Shape& shape) {
+    return GetMinimumAlignmentForArray(shape, target_machine_features) >=
+           TargetMachineFeatures::kEigenExpectedTensorAlignment;
+  };
+
+  if (!is_aligned(lhs_shape) || !is_aligned(rhs_shape) ||
+      !is_aligned(output_shape)) {
+    return false;
+  }
+
+  return true;
 }
 
-bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) {
+bool PotentiallyImplementedAsEigenDot(
+    const HloInstruction& hlo,
+    const TargetMachineFeatures& target_machine_features) {
   // For certain types of Dot, we can call Eigen
   if (hlo.opcode() == HloOpcode::kDot) {
     const Shape& lhs_shape = hlo.operand(0)->shape();
@@ -1089,28 +1408,18 @@ bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) {
 
     // If gemm can accept the operand shapes, use it rather than a custom
     // kernel.
-    if (AreValidGemmShapes(lhs_shape, rhs_shape, hlo.shape())) {
+    if (AreValidGemmShapes(lhs_shape, rhs_shape, hlo.shape(),
+                           target_machine_features)) {
+      const DotDimensionNumbers& dim_numbers = hlo.dot_dimension_numbers();
       // The size of the reduction dimension should match. The shape inference
       // guarantees this invariant, so the check here is for programming
       // errors.
-      CHECK_EQ(lhs_shape.dimensions(1), rhs_shape.dimensions(0));
+      CHECK_EQ(lhs_shape.dimensions(dim_numbers.lhs_contracting_dimensions(0)),
+               rhs_shape.dimensions(dim_numbers.rhs_contracting_dimensions(0)));
       return true;
     }
   }
 
-  if (hlo.opcode() == HloOpcode::kFusion &&
-      hlo.fusion_kind() == HloInstruction::FusionKind::kTransposeDot &&
-      hlo.fused_expression_root()->opcode() == HloOpcode::kDot) {
-    auto* dot = hlo.fused_expression_root();
-    const Shape& lhs_shape = dot->operand(0)->shape();
-    const Shape& rhs_shape = dot->operand(1)->shape();
-    if (ShapeUtil::HasZeroElements(lhs_shape) ||
-        ShapeUtil::HasZeroElements(rhs_shape)) {
-      return false;
-    }
-    return true;
-  }
-
   return false;
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index 47e09243340840980ebe21be3a2b056985877235..d88ccea0dbc845c0d9a580a5b118c57c888fb557 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -31,7 +31,9 @@ limitations under the License.
 namespace xla {
 namespace cpu {
 
-bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo);
+bool PotentiallyImplementedAsEigenDot(
+    const HloInstruction& hlo,
+    const TargetMachineFeatures& target_machine_features);
 
 // Returns the index for an operand to `hlo` that should ideally be column
 // major.  Returns nullopt if there is no such operand or if `hlo` is not a dot
@@ -55,17 +57,16 @@ class DotOpEmitter {
   // dimensions as the result, and the result is computed as `addend_array` +
   // dot(`lhs_array`, `rhs_array`).  A non-null `addend_array` is only supported
   // for Matrix-vector products.
-  static tensorflow::Status EmitDotOperation(
-      const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs,
-      const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array,
-      const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array,
+  static Status EmitDotOperation(
+      const HloInstruction& dot, const llvm_ir::IrArray& target_array,
+      const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
+      const llvm_ir::IrArray* addend_array,
       llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
       const HloModuleConfig& hlo_module_config,
       const TargetMachineFeatures& target_machine_features);
 
  private:
-  DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
-               bool transpose_rhs, const llvm_ir::IrArray& target_array,
+  DotOpEmitter(const HloInstruction& dot, const llvm_ir::IrArray& target_array,
                const llvm_ir::IrArray& lhs_array,
                const llvm_ir::IrArray& rhs_array,
                const llvm_ir::IrArray* addend_array,
@@ -75,18 +76,18 @@ class DotOpEmitter {
                const TargetMachineFeatures& target_machine_features);
 
   // Emits the IR to perform the dot operation.
-  tensorflow::Status Emit();
+  Status Emit();
 
   // Emits instructions to perform a scalar dot product (a multiply of the
   // LHS and RHS) and store the results in the target.
-  tensorflow::Status EmitScalarDot();
+  Status EmitScalarDot();
 
   // Emit an LLVM IR implementation of the dot operation if we can.  Returns
   // true if an LLVM IR implementation was emitted.
   bool EmitLlvmIrDotIfProfitable();
 
   // Emits a call to the CPU runtime to perform the matrix multiply.
-  tensorflow::Status EmitCallToRuntime();
+  Status EmitCallToRuntime();
 
   // Emits a series of nested loops for iterating over an operand array in the
   // dot operation. Loops are constructed in major to minor dimension layout
@@ -111,11 +112,20 @@ class DotOpEmitter {
     // The number of columns on the RHS.
     int64 n;
 
-    // True if the LHS matrix column major.
+    // True if the LHS matrix is column major.
     bool lhs_column_major;
 
-    // True if the RHS matrix column major.
+    // True if the LHS contraction dimension is not 1.
+    bool lhs_non_canonical;
+
+    // True if the RHS matrix is column major.
     bool rhs_column_major;
+
+    // True if the RHS contraction dimension is not 0.
+    bool rhs_non_canonical;
+
+    // True if the result matrix is column major.
+    bool target_column_major;
   };
 
   // Get the MatMultDims instance for the dot product this DotOpEmitter
@@ -123,6 +133,8 @@ class DotOpEmitter {
   // of rank 2 as well).
   MatMultDims GetMatMultDims() const;
 
+  bool EmitExperimentalGebpDotIfEnabled(const MatMultDims& mat_mult_dims);
+
   // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector
   // registers.
   int64 GetGemvTilingFactor() const {
@@ -131,9 +143,18 @@ class DotOpEmitter {
         .value_or(kDefaultTilingFactor);
   }
 
+  // Returns true if we should use an experimental implementation of GEMM
+  // (general matrix matrix multiplication) if possible.
+  bool EnableExperimentalLlvmIrGemm() const {
+    return options::EnableExperimentalLlvmIrGemm(hlo_module_config_);
+  }
+
+  // Returns true if we should call into multi-threaded Eigen routines.
+  bool ShouldUseMultiThreadedEigen() {
+    return hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
+  }
+
   const HloInstruction& dot_;
-  const bool transpose_lhs_;
-  const bool transpose_rhs_;
   const llvm_ir::IrArray& target_array_;
   const llvm_ir::IrArray& lhs_array_;
   const llvm_ir::IrArray& rhs_array_;
diff --git a/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc b/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc
index 7dcc4ca7fa08b478f24065275ffa69725dc51682..c56286559158758ca6db5ae097729286bde346f0 100644
--- a/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc
+++ b/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc
@@ -26,13 +26,13 @@ limitations under the License.
 
 namespace xla {
 namespace cpu {
-void ExternalConstantPool::Insert(string name, const Literal& literal,
+void ExternalConstantPool::Insert(string name, const LiteralSlice& literal,
                                   int64 alignment) {
   CHECK(!ShapeUtil::IsTuple(literal.shape()));
   CHECK(alignment > 0 && IsPowerOfTwo(static_cast<uint64>(alignment)));
   CHECK(entries_.find(name) == entries_.end());
 
-  int64 literal_size = ShapeUtil::ByteSizeOf(literal.shape());
+  const int64 literal_size = ShapeUtil::ByteSizeOf(literal.shape());
   void* raw_pointer = tensorflow::port::AlignedMalloc(
       literal_size, std::max<size_t>(alignment, sizeof(void*)));
   CHECK(raw_pointer != nullptr) << "failed to allocate " << literal_size
diff --git a/tensorflow/compiler/xla/service/cpu/external_constant_pool.h b/tensorflow/compiler/xla/service/cpu/external_constant_pool.h
index 8008a56df4dbf16e7b57aee8a344058bb0d5883d..0677f5f0b58005079890052a426e5f48c5d09ed1 100644
--- a/tensorflow/compiler/xla/service/cpu/external_constant_pool.h
+++ b/tensorflow/compiler/xla/service/cpu/external_constant_pool.h
@@ -43,7 +43,7 @@ class ExternalConstantPool {
   // The constant pool copies out the contents of `literal` into a buffer it
   // owns -- it does not keep pointers to `literal`, or to memory owned by
   // `literal`.
-  void Insert(string name, const Literal& literal, int64 alignment);
+  void Insert(string name, const LiteralSlice& literal, int64 alignment);
 
   // Find the constant with name `name` in this constant pool.  If there isn't
   // such constant, return nullptr.
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index f209a69e3cd0f8d336d61bafd1e22be8bc88ca3f..b560b7531c0d24e6f670e61a15dce295d9fa2a49 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -24,8 +24,25 @@ limitations under the License.
 namespace xla {
 namespace cpu {
 
+int64 GetMinimumAlignmentForArray(
+    const Shape& shape, const TargetMachineFeatures& target_machine_features) {
+  CHECK(ShapeUtil::IsArray(shape));
+  CHECK(!LayoutUtil::HasLayout(shape) || LayoutUtil::IsDense(shape.layout()));
+
+  // We don't require a layout to be set on `shape`.  This only works on CPU
+  // because we don't pad our tensors or otherwise have complicated data tiling
+  // schemes.
+
+  int64 allocation_size_bytes =
+      ShapeUtil::ElementsIn(shape) *
+      ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type());
+  return target_machine_features.minimum_alignment_for_allocation(
+      allocation_size_bytes);
+}
+
 bool PotentiallyImplementedAsEigenConvolution(
-    const HloInstruction& convolution) {
+    const HloInstruction& convolution,
+    const TargetMachineFeatures& target_machine_features) {
   // The following conditions are necessary (but not sufficient) for
   // implementing `convolution` with Eigen convolution:
   // - the input and kernel have a non-zero number of elements.
@@ -35,6 +52,18 @@ bool PotentiallyImplementedAsEigenConvolution(
   // To be sufficient, certain layout constraints need to be satisfied as well.
   const Shape& input_shape = convolution.operand(0)->shape();
   const Shape& kernel_shape = convolution.operand(1)->shape();
+  const Shape& output_shape = convolution.shape();
+
+  auto is_aligned = [&](const Shape& shape) {
+    return GetMinimumAlignmentForArray(shape, target_machine_features) >=
+           TargetMachineFeatures::kEigenExpectedTensorAlignment;
+  };
+
+  if (!is_aligned(input_shape) || !is_aligned(kernel_shape) ||
+      !is_aligned(output_shape)) {
+    return false;
+  }
+
   if (ShapeUtil::HasZeroElements(input_shape) ||
       ShapeUtil::HasZeroElements(kernel_shape)) {
     return false;
@@ -71,7 +100,6 @@ bool PotentiallyImplementedAsEigenConvolution(
     }
   }
 
-  const Shape& output_shape = convolution.shape();
   return dnums.input_batch_dimension() == 0 &&
          dnums.input_feature_dimension() == input_shape.dimensions_size() - 1 &&
          dnums.output_batch_dimension() == 0 &&
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
index 34b2003916933f5ec0a15d9e219063c0a912fa40..68fbc7caaa9bfec0ecd7cc7f473c8ca8afce19db 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
@@ -17,13 +17,20 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMISSION_UTILS_H_
 
 #include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 
 namespace xla {
 namespace cpu {
 
 bool PotentiallyImplementedAsEigenConvolution(
-    const HloInstruction& convolution);
+    const HloInstruction& convolution,
+    const TargetMachineFeatures& target_machine_features);
+
+// Computes the minimum alignment guaranteed for a tensor of shape `shape` on
+// the target machine.
+int64 GetMinimumAlignmentForArray(
+    const Shape& shape, const TargetMachineFeatures& target_machine_features);
 
 // Dynamic loop bounds are specified as an array of dimension index
 // [start, limit) pairs of ir values (one for each partitioned outer dimension).
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc
index 215f48c4cc1a1a6b13d98dff76e0d1f0f773f5c1..abb2471e6ae6b2f2949ab2e91235e5047ae404f8 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
@@ -39,7 +40,12 @@ ENTRY Conv {
   HloComputation* entry_computation = module->entry_computation();
 
   HloInstruction* conv_instr = entry_computation->root_instruction();
-  EXPECT_FALSE(cpu::PotentiallyImplementedAsEigenConvolution(*conv_instr));
+  cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features(
+      [](int64 shape_size) {
+        return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
+      });
+  EXPECT_FALSE(cpu::PotentiallyImplementedAsEigenConvolution(
+      *conv_instr, target_machine_features));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 6347ee2a2a17502bb78ff3bdbba10ead35b72c40..13bd5e73db500e20b0e8c33bf921ee2457e126e5 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -83,7 +83,7 @@ IrEmitter::IrEmitter(
     llvm::Module* llvm_module,
     std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx,
     std::unordered_map<const HloComputation*, int64> computation_to_profile_idx,
-    llvm::TargetMachine* target_machine,
+    const TargetMachineFeatures* target_machine_features,
     ExternalConstantPool* external_constant_pool)
     : assignment_(assignment),
       module_(llvm_module),
@@ -94,7 +94,7 @@ IrEmitter::IrEmitter(
       alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
       hlo_module_config_(hlo_module.config()),
       is_top_level_computation_(false),
-      target_machine_features_(target_machine),
+      target_machine_features_(*target_machine_features),
       external_constant_pool_(external_constant_pool) {
   ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags(
       /*fast_math_enabled=*/hlo_module_config_.debug_options()
@@ -227,32 +227,6 @@ Status IrEmitter::HandleCopy(HloInstruction* copy) {
   }
 }
 
-// Calculate the alignment of a buffer with a particular size.
-int IrEmitter::MinimumAlignmentForBufferSize(int64 buffer_size) {
-  // GLibc returns a pointer with alignment 8 on 32-bit platforms and 16 on
-  // 64-bit platforms.  TCMalloc returns a pointer with alignment 8 for
-  // allocations smaller than kMallocAlignmentThreshold bytes and at least
-  // alignment 16 for allocations greater than or equal to
-  // kMallocAlignmentThreshold bytes.  N.B. We could improve on this lower bound
-  // by explicitly allocating the memory with posix_memalign.  This is
-  // complicated by our desire to allow parameter buffers created by clients to
-  // be consumed directly by the JIT.
-  if (buffer_size == 0) {
-    // No need to align empty buffers.
-    return 1;
-  }
-
-  const int64 kMallocAlignmentThreshold = 512;
-
-  int pointer_size = module_->getDataLayout().getPointerSize();
-  int buffer_alignment = buffer_size >= kMallocAlignmentThreshold
-                             ? 2 * pointer_size
-                             : pointer_size;
-  DCHECK_GT(buffer_alignment, 0);
-
-  return buffer_alignment;
-}
-
 // Calculate the alignment of a buffer allocated for a given primitive type.
 int IrEmitter::MinimumAlignmentForPrimitiveType(PrimitiveType primitive_type) {
   int64 byte_size = ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
@@ -277,7 +251,7 @@ int IrEmitter::MinimumAlignmentForShape(const Shape& shape) {
   DCHECK_GE(buffer_size, 0);
   DCHECK_LE(buffer_size, SIZE_MAX);
 
-  return MinimumAlignmentForBufferSize(buffer_size);
+  return target_machine_features_.minimum_alignment_for_allocation(buffer_size);
 }
 
 void IrEmitter::AttachAlignmentMetadataForLoad(llvm::LoadInst* load,
@@ -290,7 +264,8 @@ void IrEmitter::AttachAlignmentMetadataForLoad(llvm::LoadInst* load,
 
 void IrEmitter::AttachAlignmentMetadataForLoad(llvm::LoadInst* load,
                                                int64 buffer_size) {
-  int alignment = MinimumAlignmentForBufferSize(buffer_size);
+  int alignment =
+      target_machine_features_.minimum_alignment_for_allocation(buffer_size);
   if (alignment > 1) {
     llvm_ir::SetAlignmentMetadataForLoad(load, alignment);
   }
@@ -530,7 +505,7 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
   HloComputation* function = reduce_window->to_apply();
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*reduce_window, /*operands=*/{operand},
-      /*supported_types=*/{F32, BF16}));
+      /*supported_types=*/{F32, BF16, S32}));
 
   // TODO(b/31410564): Implement dilation for reduce-window.
   if (window_util::HasDilation(window)) {
@@ -827,13 +802,6 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
         "Dot with multiple contracting dimensions not implemented.");
   }
 
-  if (dnums.lhs_contracting_dimensions(0) !=
-          std::min(lhs->shape().dimensions_size() - 1, 1) ||
-      dnums.rhs_contracting_dimensions(0) != 0) {
-    return Unimplemented(
-        "Dot with non-standard contracting dimensions not implemented.");
-  }
-
   llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs));
   llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs));
 
@@ -850,8 +818,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
 
   // Dot operation is complicated so we delegate to a helper class.
   return DotOpEmitter::EmitDotOperation(
-      *dot, /*transpose_lhs=*/false, /*transpose_rhs=*/false, target_array,
-      lhs_array, rhs_array, /*addend_array=*/nullptr,
+      *dot, target_array, lhs_array, rhs_array, /*addend_array=*/nullptr,
       GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_,
       target_machine_features_);
 }
@@ -869,7 +836,8 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
 
   // TODO(tonywy): Add PotentiallyImplementedAsMKLCovolution to support
   // different data layouts.
-  if (PotentiallyImplementedAsEigenConvolution(*convolution)) {
+  if (PotentiallyImplementedAsEigenConvolution(*convolution,
+                                               target_machine_features_)) {
     const Shape& lhs_shape = lhs->shape();
     const Shape& rhs_shape = rhs->shape();
     const Shape& convolution_shape = convolution->shape();
@@ -1035,12 +1003,14 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
         // We will accumulate the products into this sum to calculate
         // the output entry at the given index.
         PrimitiveType lhs_element_type = lhs->shape().element_type();
+        llvm::Type* lhs_llvm_type =
+            llvm_ir::PrimitiveTypeToIrType(lhs_element_type, module_);
         llvm::Value* sum_address = llvm_ir::EmitAllocaAtFunctionEntry(
-            llvm_ir::PrimitiveTypeToIrType(lhs_element_type, module_),
-            "convolution_sum_address", &ir_builder_,
+            lhs_llvm_type, "convolution_sum_address", &ir_builder_,
             MinimumAlignmentForPrimitiveType(lhs_element_type));
-        ir_builder_.CreateStore(
-            llvm::ConstantFP::get(ir_builder_.getFloatTy(), 0.0), sum_address);
+        llvm::Value* constant_zero =
+            llvm::Constant::getNullValue(lhs_llvm_type);
+        ir_builder_.CreateStore(constant_zero, sum_address);
 
         llvm_ir::ForLoopNest loops(IrName(convolution, "inner"), &ir_builder_);
         std::vector<llvm::Value*> kernel_spatial(num_spatial_dims);
@@ -1194,7 +1164,13 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
       {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type,
        int64_type, int64_type, int64_type, int64_type},
       /*isVarArg=*/false);
-  const char* fn_name = runtime::kEigenFftSymbolName;
+
+  bool multi_threaded_eigen =
+      hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
+  const char* fn_name = multi_threaded_eigen
+                            ? runtime::kEigenFftSymbolName
+                            : runtime::kEigenSingleThreadedFftSymbolName;
+
   llvm::Function* fft_func = llvm::cast<llvm::Function>(
       module_->getOrInsertFunction(fn_name, fft_type));
   fft_func->setCallingConv(llvm::CallingConv::C);
@@ -1216,16 +1192,45 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
 }
 
 Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
-  if (hlo_module_config_.replica_count() == 1) {
-    // When there is a single replica, a cross replica sum is the identity
-    // function, and the buffer assignment expects a copy (we could eliminate
-    // these at the HLO level as an optimization).
-    TF_RETURN_IF_ERROR(EmitTargetAddressForOp(crs));
+  if (hlo_module_config_.replica_count() != 1) {
+    // TODO(b/33011107): Support nontrivial cross replica sum on CPU.
+    return Unimplemented(
+        "CrossReplicaSum with >1 replica is not implemented on CPU.");
+  }
+
+  // When there is a single replica, a cross replica sum is the identity
+  // function, and the buffer assignment expects a copy.
+  //
+  // TODO(b/80100934): We would like to eliminate one-replica CRS nodes entirely
+  // in algebraic-simplifier, but currently on some platforms
+  // HloModuleConfig::num_replicas changes between when the module is compiled
+  // and when it's run.
+  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(crs));
+
+  // CRS with one operand and one replica is simply the identity function.
+  if (crs->operand_count() == 1) {
     return EmitMemcpy(*crs->operand(0), *crs);
   }
 
-  // TODO(b/33011107): Support cross replica sum on CPU.
-  return Unimplemented("CrossReplicaSum is not implemented on CPU.");
+  // CRS with multiple operands and one replica produces a (one-deep) tuple.
+  std::vector<llvm::Value*> operand_ptrs;
+  for (int64 i = 0; i < crs->operand_count(); ++i) {
+    llvm::Value* in_ptr = GetEmittedValueFor(crs->operand(i));
+    TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_slice,
+                        assignment_.GetUniqueSlice(crs, {i}));
+
+    const Shape& operand_shape = crs->operand(i)->shape();
+    CHECK(ShapeUtil::IsArray(operand_shape))
+        << "Operands to cross-replica-sum must be arrays: " << crs->ToString();
+    operand_ptrs.push_back(EmitTempBufferPointer(out_slice, operand_shape));
+
+    // TODO(b/63762267): Be more aggressive about specifying alignment.
+    ir_builder_.CreateMemCpy(operand_ptrs.back(), /*DstAlign=*/1, in_ptr,
+                             /*SrcAlign=*/1,
+                             ShapeUtil::ByteSizeOf(operand_shape));
+  }
+  llvm_ir::EmitTuple(GetIrArrayFor(crs), operand_ptrs, &ir_builder_, module_);
+  return Status::OK();
 }
 
 // Fills up the free variables in 'index_with_free_var' with values from
@@ -2086,44 +2091,7 @@ static const HloInstruction* StripTranspose(const HloInstruction& hlo) {
 
 Status IrEmitter::HandleFusion(HloInstruction* fusion) {
   auto* root = fusion->fused_expression_root();
-  if (fusion->fusion_kind() == HloInstruction::FusionKind::kTransposeDot) {
-    DCHECK(root->opcode() == HloOpcode::kDot);
-    const HloInstruction* lhs_parameter = StripTranspose(*root->operand(0));
-    const HloInstruction* rhs_parameter = StripTranspose(*root->operand(1));
-    DCHECK(lhs_parameter->opcode() == HloOpcode::kParameter &&
-           rhs_parameter->opcode() == HloOpcode::kParameter);
-    const HloInstruction* lhs =
-        fusion->operand(lhs_parameter->parameter_number());
-    const HloInstruction* rhs =
-        fusion->operand(rhs_parameter->parameter_number());
-
-    TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
-        /*instruction=*/*root, /*operands=*/{lhs, rhs},
-        /*supported_types=*/{F16, F32, F64}));
-
-    llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs));
-    llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs));
-
-    Shape target_shape = fusion->shape();
-    TF_RETURN_IF_ERROR(EmitTargetAddressForOp(fusion));
-    llvm_ir::IrArray target_array = GetIrArrayFor(fusion);
-    VLOG(2) << "HandleFusion kTransposeDot: ";
-    VLOG(2) << "  lhs operand: "
-            << llvm_ir::DumpToString(*lhs_array.GetBasePointer());
-    VLOG(2) << "  rhs operand: "
-            << llvm_ir::DumpToString(*rhs_array.GetBasePointer());
-    VLOG(2) << "  target: "
-            << llvm_ir::DumpToString(*target_array.GetBasePointer());
-
-    // Dot operation is complicated so we delegate to a helper class.
-    TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
-        *root, root->operand(0)->IsRank2Transpose(),
-        root->operand(1)->IsRank2Transpose(), target_array, lhs_array,
-        rhs_array, /*addend_array=*/nullptr, GetExecutableRunOptionsArgument(),
-        &ir_builder_, hlo_module_config_, target_machine_features_));
-    return Status::OK();
-  } else if (llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion,
-                                                            assignment_)) {
+  if (llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion, assignment_)) {
     VLOG(3) << "HandleFusion FusedDynamicUpdateSliceInPlace";
     CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_);
     TF_RETURN_IF_ERROR(EmitTargetAddressForOp(fusion));
@@ -2166,9 +2134,9 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
         GetIrArrayFor(fusion->operand(addend_param_number)));
 
     TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
-        *dot, /*transpose_lhs=*/false, /*transpose_rhs=*/false, target_array,
-        lhs_array, rhs_array, &addend_array, GetExecutableRunOptionsArgument(),
-        &ir_builder_, hlo_module_config_, target_machine_features_));
+        *dot, target_array, lhs_array, rhs_array, &addend_array,
+        GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_,
+        target_machine_features_));
     return Status::OK();
   } else {
     return Unimplemented("Fusion kind not implemented on CPU");
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 5a040760804fa5609e1d68511d4b2abe8e2ec8f9..f49cfc1dc378bb80da3ddf995363acfa2081067b 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -76,7 +76,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
                 instruction_to_profile_idx,
             std::unordered_map<const HloComputation*, int64>
                 computation_to_profile_idx,
-            llvm::TargetMachine* target_machine,
+            const TargetMachineFeatures* target_machine,
             ExternalConstantPool* external_constant_pool);
   ~IrEmitter() override;
 
@@ -514,9 +514,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // Calculate the alignment of a buffer allocated for a given primitive type.
   int MinimumAlignmentForPrimitiveType(PrimitiveType primitive_type);
 
-  // Calculate the alignment of a buffer with a particular size.
-  int MinimumAlignmentForBufferSize(int64 buffer_size);
-
   // Returns the number of bytes within the shape.
   int64 ByteSizeOf(const Shape& shape) const;
 
@@ -536,7 +533,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
 
   bool is_top_level_computation_;
 
-  TargetMachineFeatures target_machine_features_;
+  const TargetMachineFeatures& target_machine_features_;
 
   int64 external_global_constant_counter_ = 0;
   ExternalConstantPool* external_constant_pool_;
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index fb28280fade307ac1f193e7dca481bd2afa855fc..63d0f7b95c7e45913c707471dbe2dc62e05251d6 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -104,7 +104,9 @@ class DefaultCostModel : public ParallelCostModel {
 
 ParallelTaskAssignment::ParallelTaskAssignment(
     const int64 max_parallelism,
-    const HloCostAnalysis::ShapeSizeFunction& shape_size, HloModule* module) {
+    const HloCostAnalysis::ShapeSizeFunction& shape_size, HloModule* module,
+    const TargetMachineFeatures* target_machine_features)
+    : target_machine_features_(*target_machine_features) {
   VLOG(1) << "ParallelTaskAssignment max_parallelism: " << max_parallelism;
   // Run cost analysis on 'module'.
   auto cost_analysis = MakeUnique<HloCostAnalysis>(shape_size);
@@ -127,7 +129,7 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount(
   // Currently, we do not assign parallel tasks to instructions with at least
   // one of the following properties:
   // *) Internal threading (library calls to kConv, kDot, kFft, kCustomCall).
-  // *) Emit custom loops (kSelectAndScatter, FusionKind::kTransposeDot).
+  // *) Emit custom loops (kSelectAndScatter).
   // *) Operations that are not thread safe (like infeed and rng).
   // *) Tuple-shaped.
   // TODO(b/27458679) Parallelize instructions which are skipped here.
@@ -139,8 +141,10 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount(
       opcode == HloOpcode::kFft || opcode == HloOpcode::kInfeed ||
       opcode == HloOpcode::kOutfeed || opcode == HloOpcode::kRng ||
       (opcode == HloOpcode::kConvolution &&
-       PotentiallyImplementedAsEigenConvolution(*instruction)) ||
-      PotentiallyImplementedAsEigenDot(*instruction) ||
+       PotentiallyImplementedAsEigenConvolution(*instruction,
+                                                target_machine_features_)) ||
+      PotentiallyImplementedAsEigenDot(*instruction,
+                                       target_machine_features_) ||
       (opcode == HloOpcode::kFusion &&
        instruction->fusion_kind() != HloInstruction::FusionKind::kLoop) ||
       ShapeUtil::IsTuple(instruction->shape())) {
@@ -231,7 +235,8 @@ bool ParallelTaskAssigner::AssignParallelTasksHelper(
 void ParallelTaskAssigner::ComputeTargetParallelTasks(
     HloModule* module, HloToParallelTasks* hlo_to_parallel_tasks) {
   ParallelTaskAssignment parallel_task_assignment(max_parallelism_,
-                                                  shape_size_function_, module);
+                                                  shape_size_function_, module,
+                                                  &target_machine_features_);
 
   // Compute parallel task counts for all instructions in 'module'.
   for (auto* computation : module->computations()) {
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
index 7140dabe516cd7ea9260456e994e8b63b68c60d6..8becc8fa23424d7454cc783eb9d853aecb5d053b 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
 
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
@@ -39,7 +40,8 @@ class ParallelTaskAssignment {
   // 'module': the containing HloModule.
   ParallelTaskAssignment(const int64 max_parallelism,
                          const HloCostAnalysis::ShapeSizeFunction& shape_size,
-                         HloModule* module);
+                         HloModule* module,
+                         const TargetMachineFeatures* target_machine_features);
   ~ParallelTaskAssignment() {}
 
   // Computes and returns the target parallel task count for 'instruction'.
@@ -47,6 +49,7 @@ class ParallelTaskAssignment {
 
  private:
   std::unique_ptr<ParallelCostModel> cost_model_;
+  const TargetMachineFeatures& target_machine_features_;
 };
 
 // ParallelTaskAssigner computes target parallel task counts for all HLOs
@@ -63,8 +66,11 @@ class ParallelTaskAssigner : public HloPassInterface {
   // 'shape_size': shape size function used by HloCostAnalysis during parallel
   //               task assignment.
   ParallelTaskAssigner(const int64 max_parallelism,
-                       const HloCostAnalysis::ShapeSizeFunction& shape_size)
-      : max_parallelism_(max_parallelism), shape_size_function_(shape_size) {}
+                       const HloCostAnalysis::ShapeSizeFunction& shape_size,
+                       const TargetMachineFeatures* target_machine_features)
+      : max_parallelism_(max_parallelism),
+        shape_size_function_(shape_size),
+        target_machine_features_(*target_machine_features) {}
   ~ParallelTaskAssigner() override {}
 
   tensorflow::StringPiece name() const override {
@@ -94,6 +100,7 @@ class ParallelTaskAssigner : public HloPassInterface {
 
   int64 max_parallelism_;
   HloCostAnalysis::ShapeSizeFunction shape_size_function_;
+  const TargetMachineFeatures& target_machine_features_;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
index 13eb75a57213b1a68a5732a4f6061efdf97fa4f4..fc2efbaf9a22b02cd729da2f367d53bc15506836 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -31,6 +32,19 @@ class ParallelTaskAssignmentTest : public HloVerifiedTestBase {
   // Use any value larger than 2 since we only test whether a module is
   // parallelized or not
   const int max_parallelism_ = 10;
+
+  cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features_;
+
+  ParallelTaskAssignmentTest()
+      : target_machine_features_([](int64 shape_size) {
+          return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
+        }) {}
+
+  StatusOr<bool> RunParallelTaskAssigner(HloModule* module) {
+    return cpu::ParallelTaskAssigner(max_parallelism_, shape_size_func_,
+                                     &target_machine_features_)
+        .Run(module);
+  }
 };
 
 TEST_F(ParallelTaskAssignmentTest, DotOperationNotParallelized) {
@@ -45,9 +59,7 @@ TEST_F(ParallelTaskAssignmentTest, DotOperationNotParallelized) {
   )";
 
   ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
-                                            max_parallelism_, shape_size_func_)
-                                            .Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
   EXPECT_FALSE(changed);
 }
 
@@ -74,9 +86,7 @@ TEST_F(ParallelTaskAssignmentTest,
   )";
 
   ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
-                                            max_parallelism_, shape_size_func_)
-                                            .Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
   EXPECT_FALSE(changed);
 }
 
@@ -92,9 +102,7 @@ TEST_F(ParallelTaskAssignmentTest, RngOperationNotParallelized) {
   )";
 
   ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
-                                            max_parallelism_, shape_size_func_)
-                                            .Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
   EXPECT_FALSE(changed);
 }
 
@@ -108,9 +116,7 @@ TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) {
   )";
 
   ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner(
-                                            max_parallelism_, shape_size_func_)
-                                            .Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
   EXPECT_FALSE(changed);
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h
index 984cb0616e02475babad7160d0f43bb23de0b50e..0bf693edd0b985a4e62c16414646cc6a17db26ee 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h
@@ -21,8 +21,6 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/numeric_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/platform/types.h"
 
 // 'tensorflow' namespace is used so that int64 and other types don't require
@@ -71,11 +69,9 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand,
   in_dims[0] = input_batch;
   Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> out_dims;
   out_dims[0] = input_batch;
-  TensorShape temp_shape{input_batch};
   for (int i = 0; i < FFTRank; i++) {
     in_dims[i + 1] = fft_shape[i];
     out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i];
-    temp_shape.AddDim(fft_shape[i]);
   }
   const Eigen::TensorMap<Eigen::Tensor<float, FFTRank + 1, Eigen::RowMajor>,
                          Eigen::Aligned>
@@ -88,8 +84,8 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand,
   const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank);
 
   // Compute the full FFT using a temporary tensor.
-  Tensor temp(DataTypeToEnum<complex64>::v(), temp_shape);
-  auto full_fft = temp.flat_inner_dims<complex64, FFTRank + 1>();
+  Eigen::Tensor<complex64, FFTRank + 1, Eigen::RowMajor> full_fft(in_dims);
+
   const Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> zero_start_indices;
   full_fft.device(device) =
       input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(axes);
@@ -112,11 +108,9 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand,
   in_dims[0] = input_batch;
   Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> out_dims;
   out_dims[0] = input_batch;
-  TensorShape temp_shape{input_batch};
   for (int i = 0; i < FFTRank; i++) {
     in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i];
     out_dims[i + 1] = fft_shape[i];
-    temp_shape.AddDim(fft_shape[i]);
   }
   const Eigen::TensorMap<Eigen::Tensor<complex64, FFTRank + 1, Eigen::RowMajor>,
                          Eigen::Aligned>
@@ -129,8 +123,7 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand,
   // region we will slice from input given fft_shape. We slice input to
   // fft_shape on its inner-most dimensions, except the last (which we
   // slice to fft_shape[-1] / 2 + 1).
-  Tensor temp(DataTypeToEnum<complex64>::v(), temp_shape);
-  auto full_fft = temp.flat_inner_dims<complex64, FFTRank + 1>();
+  Eigen::Tensor<complex64, FFTRank + 1, Eigen::RowMajor> full_fft(out_dims);
 
   // Calculate the starting point and range of the source of
   // negative frequency part.
@@ -179,7 +172,6 @@ template <int FFTRank, typename EigenDevice>
 void EigenFftWithRank(const EigenDevice& device, void* out, void* operand,
                       int32 fft_type, int64 input_batch, int64 fft_length0,
                       int64 fft_length1, int64 fft_length2) {
-  CHECK(::xla::FftType_IsValid(fft_type)) << fft_type;
   switch (fft_type) {
     case ::xla::FftType::FFT:
       EigenFftC2C<true, FFTRank, EigenDevice>(
@@ -204,7 +196,8 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand,
           input_batch, fft_length0, fft_length1, fft_length2);
       break;
     default:
-      LOG(FATAL) << "Unsupported FFT type: " << fft_type;
+      // Unsupported FFT type
+      abort();
   }
 }
 
@@ -230,7 +223,8 @@ void EigenFftImpl(const EigenDevice& device, void* out, void* operand,
                                                  fft_length1, fft_length2);
       break;
     default:
-      LOG(FATAL) << "Unsupported FFT rank " << fft_rank;
+      // Unsupported FFT rank
+      abort();
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2613ddb12704aea7d0884c6c8c062dc028383639
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h"
+
+#include "tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h"
+#include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+using tensorflow::int32;
+using tensorflow::int64;
+
+TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft(
+    const void* run_options_ptr, void* out, void* operand, int32 fft_type,
+    int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1,
+    int64 fft_length2) {
+  tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, fft_type,
+                                fft_rank, input_batch, fft_length0, fft_length1,
+                                fft_length2);
+}
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcd133d012cf074a4cd2f550585881388bea6156
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_
+
+#include "tensorflow/core/platform/types.h"
+
+extern "C" {
+
+extern void __xla_cpu_runtime_EigenSingleThreadedFft(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out,
+    void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank,
+    tensorflow::int64 input_batch, tensorflow::int64 fft_length0,
+    tensorflow::int64 fft_length1, tensorflow::int64 fft_length2);
+
+}  // extern "C"
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index ff6f0a9d4e443c2ed7d2dd6c58f4aaf28205b0cb..c4c90515ac7ec2721cb9ea48d42e3c5080e249af 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -73,23 +74,33 @@ llvm::StringRef GetHostCpuName() {
 }
 }  // namespace
 
+/*static*/ std::unique_ptr<llvm::TargetMachine>
+SimpleOrcJIT::InferTargetMachineForJIT(
+    const llvm::TargetOptions& target_options,
+    llvm::CodeGenOpt::Level opt_level) {
+  std::unique_ptr<llvm::TargetMachine> target_machine(
+      llvm::EngineBuilder()
+          .setTargetOptions(target_options)
+          .setOptLevel(opt_level)
+          .selectTarget(
+              /*TargetTriple=*/llvm::Triple(), /*MArch=*/"",
+              /*MCPU=*/GetHostCpuName(),
+              /*MAttrs=*/DetectMachineAttributes()));
+  CHECK(target_machine != nullptr);
+  return target_machine;
+}
+
 SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
                            llvm::CodeGenOpt::Level opt_level,
                            bool optimize_for_size, bool enable_fast_math,
                            bool disable_expensive_passes,
                            LLVMCompiler::ModuleHook pre_optimization_hook,
                            LLVMCompiler::ModuleHook post_optimization_hook)
-    : target_machine_(
-          CHECK_NOTNULL(llvm::EngineBuilder()
-                            .setTargetOptions(target_options)
-                            .setOptLevel(opt_level)
-                            .selectTarget(
-                                /*TargetTriple=*/llvm::Triple(), /*MArch=*/"",
-                                /*MCPU=*/GetHostCpuName(),
-                                /*MAttrs=*/DetectMachineAttributes()))),
+    : target_machine_(InferTargetMachineForJIT(target_options, opt_level)),
       disassembler_(*target_machine_),
       data_layout_(target_machine_->createDataLayout()),
       symbol_resolver_(llvm::orc::createLegacyLookupResolver(
+          execution_session_,
           [this](const std::string& name) -> llvm::JITSymbol {
             return this->ResolveRuntimeSymbol(name);
           },
@@ -192,6 +203,7 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedFft);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64);
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index f4260a95bc45557b6cd969f7d3fff01c8b392575..1851a3ee0bb97b4860605d7211a6ae70ac88686b 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -95,6 +95,12 @@ class SimpleOrcJIT {
     return &external_constant_pool_;
   }
 
+  // Creates an llvm::TargetMachine suitable for JITting code that will run on
+  // the current machine.
+  static std::unique_ptr<llvm::TargetMachine> InferTargetMachineForJIT(
+      const llvm::TargetOptions& target_options,
+      llvm::CodeGenOpt::Level opt_level);
+
  private:
   llvm::JITSymbol ResolveRuntimeSymbol(const std::string& name);
 
diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features.cc b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc
index eeb049737dddd11ef2ce229df772baec3ac03dd8..a0cd8ee2d2be10bcee9c2e216e24908d949e2d7b 100644
--- a/tensorflow/compiler/xla/service/cpu/target_machine_features.cc
+++ b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc
@@ -18,7 +18,7 @@ limitations under the License.
 namespace xla {
 namespace cpu {
 
-llvm::TargetTransformInfo* TargetMachineFeatures::GetTargetTransformInfoFor(
+llvm::TargetTransformInfo* LLVMTargetMachineFeatures::GetTargetTransformInfoFor(
     const llvm::Function& function) const {
   auto it = target_transform_info_cache_.find(&function);
   if (it == target_transform_info_cache_.end()) {
@@ -31,5 +31,30 @@ llvm::TargetTransformInfo* TargetMachineFeatures::GetTargetTransformInfoFor(
   return &it->second;
 }
 
+int64 LLVMTargetMachineFeatures::minimum_alignment_for_allocation(
+    int64 size_bytes) const {
+  // GLibc malloc returns a pointer with alignment 8 on 32-bit platforms and 16
+  // on 64-bit platforms.  TCMalloc returns a pointer with alignment 8 for
+  // allocations smaller than kMallocAlignmentThreshold bytes and at least
+  // alignment 16 for allocations greater than or equal to
+  // kMallocAlignmentThreshold bytes.  N.B. We could improve on this lower bound
+  // by explicitly allocating the memory with posix_memalign.  This is
+  // complicated by our desire to allow parameter buffers created by clients to
+  // be consumed directly by the JIT.
+  if (size_bytes == 0) {
+    // No need to align empty buffers.
+    return 1;
+  }
+
+  const int64 kMallocAlignmentThreshold = 512;
+
+  int pointer_size = target_machine_->getPointerSize(0);
+  int buffer_alignment =
+      size_bytes >= kMallocAlignmentThreshold ? 2 * pointer_size : pointer_size;
+  DCHECK_GT(buffer_alignment, 0);
+
+  return buffer_alignment;
+}
+
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features.h b/tensorflow/compiler/xla/service/cpu/target_machine_features.h
index 703942615e552dccde7ddec8c8b90e8a486652af..8b00ae9e47eeed26ffe80707b89593b267e8dbb8 100644
--- a/tensorflow/compiler/xla/service/cpu/target_machine_features.h
+++ b/tensorflow/compiler/xla/service/cpu/target_machine_features.h
@@ -24,43 +24,68 @@ limitations under the License.
 namespace xla {
 namespace cpu {
 
-// Wraps an llvm::TargetMachine and parses out some information that feeds into
-// LLVM IR code generation decisions.
+// Abstract interface for classes providing information about the target we're
+// compiling for.
 class TargetMachineFeatures {
  public:
   static constexpr int kX86AvxVectorByteSize = 32;
 
-  TargetMachineFeatures(llvm::TargetMachine* target_machine)
-      : target_machine_(target_machine) {}
+  // Input and output tensor buffers must be aligned to this many bytes if we
+  // want to call an Eigen backed GEMM or Convolution.
+  static constexpr int kEigenExpectedTensorAlignment = 16;
 
   // Return the vectorization factor, which is the number of bytes of data
   // explicitly vectorized routines will try to process at once.
-  int vectorization_factor_in_bytes() const {
-    // Ideally this should be a function of the cache line size (which we can
-    // get from llvm::TargetTransformInfo::getCacheLineSize) of the target
-    // machine.  Guess a value of 128 bytes for now.
-    return 128;
-  }
+  virtual int vectorization_factor_in_bytes() const = 0;
 
   // Return the size of the largest vector size in bytes.  We need to pass in
   // "function" since llvm functions can contain annotations for specializing
   // them to specific micro-architectures (though currently XLA does not use
   // this functionality).
-  int vector_register_byte_size(const llvm::Function& function) const {
-    llvm::TargetTransformInfo* tti = GetTargetTransformInfoFor(function);
-    return tti->getRegisterBitWidth(/*Vector=*/true) / 8;
-  }
+  virtual int vector_register_byte_size(
+      const llvm::Function& function) const = 0;
 
   // Return the number of elements of type `type` that can fit into the largest
   // vector register available.  We need to pass in "function" since llvm
   // functions can contain annotations for specializing them to specific
   // micro-architectures (though currently XLA does not use this functionality).
+  virtual int vector_register_num_elements(const llvm::Function& function,
+                                           PrimitiveType type) const = 0;
+
+  // Returns the minimum alignment for a buffer of size size_bytes.
+  virtual int64 minimum_alignment_for_allocation(int64 size_bytes) const = 0;
+
+  virtual ~TargetMachineFeatures() = default;
+};
+
+// Implements the TargetMachineFeatures interface using an llvm::TargetMachine.
+class LLVMTargetMachineFeatures : public TargetMachineFeatures {
+ public:
+  static constexpr int kX86AvxVectorByteSize = 32;
+
+  LLVMTargetMachineFeatures(llvm::TargetMachine* target_machine)
+      : target_machine_(target_machine) {}
+
+  int vectorization_factor_in_bytes() const override {
+    // Ideally this should be a function of the cache line size (which we can
+    // get from llvm::TargetTransformInfo::getCacheLineSize) of the target
+    // machine.  Guess a value of 128 bytes for now.
+    return 128;
+  }
+
+  int vector_register_byte_size(const llvm::Function& function) const override {
+    llvm::TargetTransformInfo* tti = GetTargetTransformInfoFor(function);
+    return tti->getRegisterBitWidth(/*Vector=*/true) / 8;
+  }
+
   int vector_register_num_elements(const llvm::Function& function,
-                                   PrimitiveType type) const {
+                                   PrimitiveType type) const override {
     return vector_register_byte_size(function) /
            (primitive_util::BitWidth(type) / 8);
   }
 
+  int64 minimum_alignment_for_allocation(int64 size_bytes) const override;
+
  private:
   llvm::TargetTransformInfo* GetTargetTransformInfoFor(
       const llvm::Function& function) const;
diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h b/tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffc6927cbe1a2b6fd1a1ca3aac9b6e047741c2af
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TARGET_MACHINE_FEATURES_FAKE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TARGET_MACHINE_FEATURES_FAKE_H_
+
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+
+namespace xla {
+namespace cpu {
+// Delegates calls to minimum_alignment_for_allocation to a user provided
+// std::function, crashes on all other methods.
+//
+// Primarily useful for testing.
+class TargetMachineFeaturesWithFakeAlignmentLogic
+    : public TargetMachineFeatures {
+ public:
+  explicit TargetMachineFeaturesWithFakeAlignmentLogic(
+      std::function<int64(int64)> fake_alignment_logic)
+      : fake_alignment_logic_(std::move(fake_alignment_logic)) {}
+
+  int vectorization_factor_in_bytes() const override {
+    LOG(FATAL) << "Unexpected call to " << __func__;
+  }
+
+  int vector_register_byte_size(const llvm::Function& function) const override {
+    LOG(FATAL) << "Unexpected call to " << __func__;
+  }
+
+  int vector_register_num_elements(const llvm::Function& function,
+                                   PrimitiveType type) const override {
+    LOG(FATAL) << "Unexpected call to " << __func__;
+  }
+
+  int64 minimum_alignment_for_allocation(int64 size_bytes) const override {
+    return fake_alignment_logic_(size_bytes);
+  }
+
+ private:
+  std::function<int64(int64)> fake_alignment_logic_;
+};
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TARGET_MACHINE_FEATURES_FAKE_H_
diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD
index 18a915e5339623c73fee0e339fe75ee405898a36..67f776e7b5883f425b41c05342b74bebe223e17f 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD
@@ -32,7 +32,6 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla/service:cpu_plugin",
         "//tensorflow/compiler/xla/tests:llvm_irgen_test_base",
-        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
index 6479bf76aab581ae3ec2923d98dab53720cab203..edcaec584997b17dce30b8c46fda4abc78441064 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -143,6 +143,12 @@ class VectorSupportLibrary {
 
   llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
                                     llvm::Value* offset_elements);
+  llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
+                                    llvm::Value* offset_elements, int64 scale) {
+    return ComputeOffsetPointer(
+        base_pointer,
+        ir_builder_->CreateMul(ir_builder_->getInt64(scale), offset_elements));
+  }
   llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
                                     int64 offset_elements) {
     return ComputeOffsetPointer(base_pointer,
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc
index 35db4fd2a22cc1615ade77a801cb28c504db09a6..e228bb56bce8febcca28ae171f6de90973d020ab 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.cc
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc
@@ -29,7 +29,7 @@ StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
     : DeviceMemoryAllocator(platform),
       stream_executors_(stream_executors.begin(), stream_executors.end()) {}
 
-StatusOr<se::DeviceMemoryBase> StreamExecutorMemoryAllocator::Allocate(
+StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
   TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
                       GetStreamExecutor(device_ordinal));
@@ -40,22 +40,17 @@ StatusOr<se::DeviceMemoryBase> StreamExecutorMemoryAllocator::Allocate(
         tensorflow::strings::HumanReadableNumBytes(size).c_str(), size,
         device_ordinal);
   }
-  return result;
+  return OwningDeviceMemory(result, device_ordinal, this);
 }
 
-tensorflow::Status StreamExecutorMemoryAllocator::Deallocate(
-    int device_ordinal, se::DeviceMemoryBase* mem) {
-  if (!mem->is_null()) {
+Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal,
+                                                 se::DeviceMemoryBase mem) {
+  if (!mem.is_null()) {
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
                         GetStreamExecutor(device_ordinal));
-    // We make a local copy of 'mem' so the original is not zeroed out by the
-    // Deallocate() call below. This gives us a better chance of
-    // catching double-free bugs, since Deallocate silently succeeds for null
-    // values.
-    se::DeviceMemoryBase mem_copy(*mem);
-    stream_executor->Deallocate(&mem_copy);
+    stream_executor->Deallocate(&mem);
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor(
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h
index da45c4d45a1c56fd39b1e3e17ff131de59ceeced..d87b86caf0d3acaa5bf9a455cff2315cedb2496d 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -37,28 +38,29 @@ class DeviceMemoryAllocator {
       : platform_(platform) {}
   virtual ~DeviceMemoryAllocator() {}
 
+  // Allocates memory on the device.
+  //
+  // If size > 0 and the returned StatusOr is OK, the wrapped OwningDeviceMemory
+  // must not be null.  If size == 0, must return a null OwningDeviceMemory.
+  //
   // 'retry_on_failure': If false, and the first attempt to allocate the memory
   // fails, the allocation should return immediately without retrying.  An
   // example use case is optional scratch spaces where a failure has only
   // performance impact.
-  //
-  // Allocate() should return a null pointer for a size-0 allocation.
-  // Deallocate() must be a no-op for null pointers.
-  virtual StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal,
-                                                  uint64 size,
-                                                  bool retry_on_failure) = 0;
+  virtual StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
+                                                bool retry_on_failure) = 0;
 
   // Two-arg version of Allocate(), which sets retry-on-failure to true.
   //
   // (We don't simply use a default argument on the virtual Allocate function
   // because default args on virtual functions are disallowed by the Google
   // style guide.)
-  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size) {
+  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size) {
     return Allocate(device_ordinal, size, /*retry_on_failure=*/true);
   }
 
-  virtual tensorflow::Status Deallocate(int device_ordinal,
-                                        se::DeviceMemoryBase* mem) = 0;
+  // Must be a nop for null pointers.
+  virtual Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) = 0;
 
   // Return the platform that the allocator allocates memory on.
   const se::Platform* platform() const { return platform_; }
@@ -68,6 +70,7 @@ class DeviceMemoryAllocator {
   virtual bool AllowsAsynchronousDeallocation() const = 0;
 
  protected:
+  friend class OwningDeviceMemory;
   const se::Platform* platform_;
 };
 
@@ -79,14 +82,13 @@ class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
       const se::Platform* platform,
       tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
 
-  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
-                                          bool retry_on_failure) override;
+  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
+                                        bool retry_on_failure) override;
 
   // Pull in two-arg overload that sets retry_on_failure to true.
   using DeviceMemoryAllocator::Allocate;
 
-  tensorflow::Status Deallocate(int device_ordinal,
-                                se::DeviceMemoryBase* mem) override;
+  Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
 
   bool AllowsAsynchronousDeallocation() const override;
 
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 0528b076027603796a445d8b0e9cbcebd1b513a7..b9d7ec9c2e17e560580fcea060bf552c42fe3b3c 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -138,6 +138,9 @@ class DfsHloVisitorBase {
   virtual Status HandleExp(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
+  virtual Status HandleExpm1(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
+  }
   virtual Status HandleFloor(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
@@ -150,6 +153,9 @@ class DfsHloVisitorBase {
   virtual Status HandleClz(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
+  virtual Status HandleLog1p(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
+  }
   virtual Status HandleCos(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index ae32d33766093cf4e610a0dc05f7d8c88cb37d31..9a8bab353ef6b1e0b05b250d35296bc3cef8bc37 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -418,8 +418,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
     }
     case HloOpcode::kExp:
       return EmitExp(op->shape().element_type(), operand_value);
+    case HloOpcode::kExpm1:
+      return EmitExpm1(op->shape().element_type(), operand_value);
     case HloOpcode::kLog:
       return EmitLog(op->shape().element_type(), operand_value);
+    case HloOpcode::kLog1p:
+      return EmitLog1p(op->shape().element_type(), operand_value);
     case HloOpcode::kCos:
       return EmitCos(op->shape().element_type(), operand_value);
     case HloOpcode::kSin:
@@ -493,6 +497,22 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       return EmitComposeComplex(
           op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle);
     }
+    case HloOpcode::kLog1p: {
+      // log1p(a+bi) = .5*log((a+1)^2+b^2) + i*atan2(b, a + 1)
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
+      llvm::Type* llvm_ty = a->getType();
+      auto one = llvm::ConstantFP::get(llvm_ty, 1.0);
+      auto a_plus_one = ir_builder_->CreateFAdd(a, one);
+      auto sum_sq = ir_builder_->CreateFAdd(
+          ir_builder_->CreateFMul(a_plus_one, a_plus_one),
+          ir_builder_->CreateFMul(b, b));
+      TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq));
+      TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a_plus_one));
+      auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5);
+      return EmitComposeComplex(
+          op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle);
+    }
     case HloOpcode::kConvert: {
       PrimitiveType from_type = op->operand(0)->shape().element_type();
       TF_RET_CHECK(primitive_util::IsComplexType(from_type));
@@ -523,6 +543,20 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       return EmitComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b),
                                 ir_builder_->CreateFMul(exp_a, sin_b));
     }
+    case HloOpcode::kExpm1: {
+      // e^(a+bi)-1 = (e^a*cos(b)-1)+e^a*sin(b)i
+      TF_ASSIGN_OR_RETURN(
+          auto exp_a, EmitExp(component_type, EmitExtractReal(operand_value)));
+      TF_ASSIGN_OR_RETURN(
+          auto cos_b, EmitCos(component_type, EmitExtractImag(operand_value)));
+      TF_ASSIGN_OR_RETURN(
+          auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value)));
+      auto one = llvm::ConstantFP::get(exp_a->getType(), 1.0);
+      auto real_result =
+          ir_builder_->CreateFSub(ir_builder_->CreateFMul(exp_a, cos_b), one);
+      auto imag_result = ir_builder_->CreateFMul(exp_a, sin_b);
+      return EmitComposeComplex(op, real_result, imag_result);
+    }
     case HloOpcode::kCos: {
       // cos(z) = .5(e^(iz) + e^(-iz))
       // cos(a+bi) = .5(e^(-b+ai) + e^(b-ai))
@@ -975,6 +1009,28 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog(PrimitiveType prim_type,
                                       {value->getType()}, ir_builder_);
 }
 
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type,
+                                                     llvm::Value* value) const {
+  auto x = value;
+  auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_);
+  auto one = llvm::ConstantFP::get(type, 1.0);
+  auto negative_half = llvm::ConstantFP::get(type, -0.5);
+  // When x is large, the naive evaluation of ln(x + 1) is more
+  // accurate than the Taylor series.
+  TF_ASSIGN_OR_RETURN(auto for_large_x,
+                      EmitLog(prim_type, ir_builder_->CreateFAdd(x, one)));
+  // The Taylor series for ln(x+1) is x - x^2/2 - x^3/3 + ….
+  auto for_small_x = ir_builder_->CreateFMul(
+      ir_builder_->CreateFAdd(ir_builder_->CreateFMul(negative_half, x), one),
+      x);
+  const auto kAntilogarithmIsSmallThreshold = 1e-4;
+  auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value},
+                                            {type}, ir_builder_);
+  auto x_is_small = ir_builder_->CreateFCmpOLT(
+      abs_x, llvm::ConstantFP::get(type, kAntilogarithmIsSmallThreshold));
+  return ir_builder_->CreateSelect(x_is_small, for_small_x, for_large_x);
+}
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitSin(PrimitiveType prim_type,
                                                    llvm::Value* value) const {
   return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {value},
@@ -993,6 +1049,29 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitExp(PrimitiveType prim_type,
                                       {value->getType()}, ir_builder_);
 }
 
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type,
+                                                     llvm::Value* value) const {
+  auto x = value;
+  auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_);
+  auto one = llvm::ConstantFP::get(type, 1.0);
+  auto half = llvm::ConstantFP::get(type, 0.5);
+  // When the exponent is large, the naive evaluation of e^(x) - 1 is more
+  // accurate than the Taylor series.
+  TF_ASSIGN_OR_RETURN(auto exp_x, EmitExp(prim_type, value));
+  auto for_large_x = ir_builder_->CreateFSub(exp_x, one);
+  // The Taylor series for exp(x) is 1 + x + x^2/2 + x^3/6 + ….
+  // We want exp(x)-1 which is x + x^2/2 + x^3/6 + ….
+  auto x_squared = ir_builder_->CreateFAdd(x, x);
+  auto x_squared_over_two = ir_builder_->CreateFMul(x_squared, half);
+  auto for_small_x = ir_builder_->CreateFAdd(x, x_squared_over_two);
+  const auto kExponentIsSmallThreshold = 1e-5;
+  auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value},
+                                            {type}, ir_builder_);
+  auto x_is_small = ir_builder_->CreateFCmpOLT(
+      abs_x, llvm::ConstantFP::get(type, kExponentIsSmallThreshold));
+  return ir_builder_->CreateSelect(x_is_small, for_small_x, for_large_x);
+}
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitPow(PrimitiveType prim_type,
                                                    llvm::Value* lhs,
                                                    llvm::Value* rhs) const {
@@ -1468,6 +1547,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
     llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i));
     TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value,
                         operand_to_generator.at(hlo->operand(1))(dim_index));
+
+    // Clamp the start index so that the sliced portion fits in the operand:
+    // start_index = clamp(start_index, 0, operand_dim_size - output_dim_size)
+
+    // TODO(b/74360564): This is implementation defined behavior, but is
+    // currently respected by all implementations. Change this if we ever decide
+    // to oficially document different behavior.
+    start_index_value = ir_builder_->CreateSExtOrBitCast(start_index_value,
+                                                         index[i]->getType());
+    llvm::Value* operand_dim_size = llvm::ConstantInt::get(
+        start_index_value->getType(), input_hlo->shape().dimensions(i));
+    llvm::Value* output_dim_size = llvm::ConstantInt::get(
+        start_index_value->getType(), hlo->shape().dimensions(i));
+
+    start_index_value = EmitIntegralMin(
+        ir_builder_->CreateSub(operand_dim_size, output_dim_size),
+        EmitIntegralMax(llvm::ConstantInt::get(start_index_value->getType(), 0),
+                        start_index_value, /*is_signed=*/true),
+        /*is_signed=*/true);
+
     start_index_value->setName(
         AsStringRef(IrName(hlo, StrCat("start_idx", i))));
     slice_start_index[i] = start_index_value;
@@ -1476,14 +1575,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
   llvm_ir::IrArray::Index input_index(rank);
   for (int64 i = 0; i < rank; ++i) {
     // Emit IR which computes:
-    //   input_index = (start_index + offset_index) % dim_size
-    // Security note: this is the code that keeps the indices in-bounds.
-    llvm::Value* dim_size = llvm::ConstantInt::get(
-        index[i]->getType(), input_hlo->shape().dimensions(i));
-    llvm::Value* start_index = ir_builder_->CreateZExtOrBitCast(
-        slice_start_index[i], index[i]->getType());
-    input_index[i] = ir_builder_->CreateURem(
-        ir_builder_->CreateAdd(start_index, index[i]), dim_size);
+    //   input_index = start_index + offset_index
+    input_index[i] = ir_builder_->CreateAdd(slice_start_index[i], index[i]);
   }
   return operand_to_generator.at(input_hlo)(input_index);
 }
@@ -1582,104 +1675,48 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
   const int64 rank = ShapeUtil::Rank(input_hlo->shape());
   llvm_ir::IrArray::Index slice_start_index(rank);
   llvm_ir::IrArray::Index slice_limit_index(rank);
-  // Slice starts at update[index - slice_start_index_adjusted],
-  // where adjusted value = slice_start_index when in bounds, and
-  // adjusted value = slice_start_index - input_dim, when wrapping.
-  llvm_ir::IrArray::Index slice_start_index_adjusted(rank);
-
   // Slice intersection gathers (ANDs) conditions on all ranks for which
   // 'input' is set to 'update'
   llvm::Value* slice_intersection = ir_builder_->getTrue();
 
   for (int64 i = 0; i < rank; ++i) {
-    // Emit IR to read dynamic start indices from 'start_hlo'.
     llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i));
     TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value,
                         operand_to_generator.at(start_hlo)(dim_index));
-    start_index_value->setName(
-        AsStringRef(IrName(hlo, StrCat("start_idx", i))));
-    slice_start_index[i] = ir_builder_->CreateZExtOrBitCast(
-        start_index_value, index[i]->getType());
 
+    // Clamp the start index so that the update region fits in the operand.
+    // start_index = clamp(start_index, 0, input_dim_size - update_dim_size)
+
+    // TODO(b/74360564): This is implementation defined behavior, but is
+    // currently respected by all implementations. Change this if we ever decide
+    // to oficially document different behavior.
+    start_index_value = ir_builder_->CreateSExtOrBitCast(start_index_value,
+                                                         index[i]->getType());
     llvm::Value* input_dim_size = llvm::ConstantInt::get(
         index[i]->getType(), input_hlo->shape().dimensions(i));
     llvm::Value* update_dim_size = llvm::ConstantInt::get(
         index[i]->getType(), update_hlo->shape().dimensions(i));
 
-    // Generate code to handle wrapping semantics:
-    // slice_start_index[i] = slice_start_index[i] % input_dim_size;
-    // slice_limit_index[i] = slice_start_index[i] + update_dim_size.
-    // slice_start_index[i] is updated in place and it will now be in
-    // range. slice_limit_index[i] may be out of range, and it's being
-    // URem-ed below if so.
-    slice_start_index[i] =
-        ir_builder_->CreateURem(slice_start_index[i], input_dim_size);
+    start_index_value = EmitIntegralMin(
+        ir_builder_->CreateSub(input_dim_size, update_dim_size),
+        EmitIntegralMax(llvm::ConstantInt::get(start_index_value->getType(), 0),
+                        start_index_value, /*is_signed=*/true),
+        /*is_signed=*/true);
+
+    start_index_value->setName(
+        AsStringRef(IrName(hlo, StrCat("start_idx", i))));
+    slice_start_index[i] = start_index_value;
     slice_limit_index[i] =
         ir_builder_->CreateAdd(slice_start_index[i], update_dim_size);
 
-    // Test if slice_limit_index[i] is in bounds
-    llvm::Value* in_bounds =
-        ir_builder_->CreateICmpULE(slice_limit_index[i], input_dim_size);
-    llvm_ir::LlvmIfData if_in_bounds =
-        llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
-
-    // Handle true BB (slice_limit_index[i] <= input_dim_size).
-    SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_);
-    // Check that index[i] >= slice_start_index[i] &&
-    //            index[i] < slice_limit_index[i]
-    llvm::Value* slice_intersection_in_bounds = ir_builder_->CreateAnd(
+    slice_intersection = ir_builder_->CreateAnd(
         slice_intersection,
         ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]),
-        "slice_intersection_in");
-    slice_intersection_in_bounds = ir_builder_->CreateAnd(
-        slice_intersection_in_bounds,
+        "slice_intersection");
+    slice_intersection = ir_builder_->CreateAnd(
+        slice_intersection,
         ir_builder_->CreateICmpSLT(index[i], slice_limit_index[i]),
-        "slice_intersection_in");
-
-    // Handle false BB (slice_limit_index[i] > input_dim_size).
-    SetToFirstInsertPoint(if_in_bounds.false_block, ir_builder_);
-    // Check that index[i] >= slice_start_index[i] ||
-    //            index[i] < slice_limit_index[i]%input_dim_size.
-    llvm::Value* index_wraps = ir_builder_->CreateICmpSLT(
-        index[i],
-        ir_builder_->CreateURem(slice_limit_index[i], input_dim_size));
-    llvm::Value* slice_intersection_or = ir_builder_->CreateOr(
-        ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]), index_wraps,
-        "slice_intersection_out");
-    llvm::Value* slice_intersection_out_of_bounds = ir_builder_->CreateAnd(
-        slice_intersection, slice_intersection_or, "slice_intersection_out");
-    // Create value for slice_start_index_adjusted[i] when out of bounds.
-    // If within out-of-bounds if.
-    llvm_ir::LlvmIfData if_start_needs_adjustment =
-        llvm_ir::EmitIfThenElse(index_wraps, "adjust_start", ir_builder_);
-    SetToFirstInsertPoint(if_start_needs_adjustment.true_block, ir_builder_);
-    llvm::Value* slice_start_index_adjusted_oob =
-        ir_builder_->CreateSub(slice_start_index[i], input_dim_size);
-    SetToFirstInsertPoint(if_start_needs_adjustment.after_block, ir_builder_);
-    llvm::PHINode* slice_start_index_adjusted_phi =
-        ir_builder_->CreatePHI(slice_start_index_adjusted_oob->getType(), 2);
-    slice_start_index_adjusted_phi->addIncoming(
-        slice_start_index_adjusted_oob, if_start_needs_adjustment.true_block);
-    slice_start_index_adjusted_phi->addIncoming(
-        slice_start_index[i], if_start_needs_adjustment.false_block);
-    // End of if within if.
-
-    // After checking in/out of bounds.
-    SetToFirstInsertPoint(if_in_bounds.after_block, ir_builder_);
-    llvm::PHINode* phi_slice_intersection =
-        ir_builder_->CreatePHI(slice_intersection->getType(), 2);
-    phi_slice_intersection->addIncoming(slice_intersection_in_bounds,
-                                        if_in_bounds.true_block);
-    phi_slice_intersection->addIncoming(slice_intersection_out_of_bounds,
-                                        if_start_needs_adjustment.after_block);
-    slice_intersection = phi_slice_intersection;
-
-    llvm::PHINode* phi_index =
-        ir_builder_->CreatePHI(slice_start_index[i]->getType(), 2);
-    phi_index->addIncoming(slice_start_index[i], if_in_bounds.true_block);
-    phi_index->addIncoming(slice_start_index_adjusted_phi,
-                           if_start_needs_adjustment.after_block);
-    slice_start_index_adjusted[i] = phi_index;
+        "slice_intersection");
   }
 
   // Emit:
@@ -1696,12 +1733,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
   // Compute update index for intersection case.
   llvm_ir::IrArray::Index update_index(rank);
   for (int64 i = 0; i < rank; ++i) {
-    llvm::Value* update_dim_size = llvm::ConstantInt::get(
-        index[i]->getType(), update_hlo->shape().dimensions(i));
-    // NOTE: Subtraction will be positive due to bounds checking above.
-    update_index[i] = ir_builder_->CreateURem(
-        ir_builder_->CreateSub(index[i], slice_start_index_adjusted[i]),
-        update_dim_size);
+    update_index[i] = ir_builder_->CreateSub(index[i], slice_start_index[i]);
   }
   TF_ASSIGN_OR_RETURN(llvm::Value * true_value,
                       operand_to_generator.at(update_hlo)(update_index));
@@ -1784,8 +1816,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDot(
     const llvm_ir::IrArray::Index& dot_result_index) const {
   auto lhs_generator = operand_to_generator.at(hlo->operand(0));
   auto rhs_generator = operand_to_generator.at(hlo->operand(1));
-  int64 contracted_dim_size = hlo->operand(0)->shape().dimensions(
-      hlo->operand(0)->shape().dimensions_size() - 1);
+
+  const DotDimensionNumbers& dim_numbers = hlo->dot_dimension_numbers();
+  int64 lhs_contracting_dim = dim_numbers.lhs_contracting_dimensions(0);
+  int64 rhs_contracting_dim = dim_numbers.rhs_contracting_dimensions(0);
+
+  int64 contracted_dim_size =
+      hlo->operand(0)->shape().dimensions(lhs_contracting_dim);
   int64 lhs_dims = hlo->operand(0)->shape().dimensions_size();
   int64 rhs_dims = hlo->operand(1)->shape().dimensions_size();
 
@@ -1816,13 +1853,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDot(
   for (int64 i = 0; i < lhs_dims - 1; i++) {
     lhs_index.push_back(dot_result_index[i]);
   }
-  lhs_index.push_back(inner_loop->GetIndVarValue());
+  lhs_index.InsertAt(lhs_contracting_dim, inner_loop->GetIndVarValue());
 
-  for (int64 i = 0; i < rhs_dims - 2; i++) {
+  for (int64 i = 0; i < rhs_dims - 1; i++) {
     rhs_index.push_back(dot_result_index[lhs_dims - 1 + i]);
   }
-  rhs_index.push_back(inner_loop->GetIndVarValue());
-  rhs_index.push_back(dot_result_index.back());
+  rhs_index.InsertAt(rhs_contracting_dim, inner_loop->GetIndVarValue());
 
   llvm::Value* current_accumulator =
       ir_builder_->CreateLoad(accumulator_alloca);
@@ -1877,10 +1913,12 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
+    case HloOpcode::kExpm1:
     case HloOpcode::kFloor:
     case HloOpcode::kImag:
     case HloOpcode::kIsFinite:
     case HloOpcode::kLog:
+    case HloOpcode::kLog1p:
     case HloOpcode::kNegate:
     case HloOpcode::kNot:
     case HloOpcode::kReal:
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index 26dff0d96f1d0f00fcdf12410a3769d18a186773..d199473374ad394913413a7d3fe805f8782936f7 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -105,6 +105,9 @@ class ElementalIrEmitter {
   virtual StatusOr<llvm::Value*> EmitLog(PrimitiveType prim_type,
                                          llvm::Value* value) const;
 
+  virtual StatusOr<llvm::Value*> EmitLog1p(PrimitiveType prim_type,
+                                           llvm::Value* value) const;
+
   virtual StatusOr<llvm::Value*> EmitSin(PrimitiveType prim_type,
                                          llvm::Value* value) const;
 
@@ -114,6 +117,9 @@ class ElementalIrEmitter {
   virtual StatusOr<llvm::Value*> EmitExp(PrimitiveType prim_type,
                                          llvm::Value* value) const;
 
+  virtual StatusOr<llvm::Value*> EmitExpm1(PrimitiveType prim_type,
+                                           llvm::Value* value) const;
+
   virtual StatusOr<llvm::Value*> EmitPow(PrimitiveType prim_type,
                                          llvm::Value* lhs,
                                          llvm::Value* rhs) const;
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b43dc0c65d9b6e7c05e06010ba2ff2eb27392295
--- /dev/null
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+
+namespace xla {
+namespace {
+
+using tensorflow::gtl::nullopt;
+
+class ElementalIrEmitterExecutionTest : public HloTestBase {
+ protected:
+  void RunTest(const string& hlo_text,
+               tensorflow::gtl::ArraySlice<Literal*> args) {
+    HloModuleConfig config;
+    config.set_debug_options(GetDebugOptionsForTest());
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                            tools::Parse(hlo_text, config));
+    EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), args, nullopt));
+  }
+};
+
+XLA_TEST_F(ElementalIrEmitterExecutionTest, DotFusion) {
+  const string hlo_text = R"(
+HloModule FusedDot
+
+fused_computation {
+  arg0 = s32[1,2,1]{2,1,0} parameter(0)
+  reshape.lhs = s32[2,1]{1,0} reshape(arg0)
+  arg1 = s32[1,2,1]{2,1,0} parameter(1)
+  reshape.rhs = s32[2,1]{1,0} reshape(arg1)
+  ROOT dot = s32[1,1]{1,0} dot(reshape.lhs, reshape.rhs), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+}
+
+ENTRY main {
+  entry_arg0 = s32[1,2,1]{2,1,0} parameter(0)
+  entry_arg1 = s32[1,2,1]{2,1,0} parameter(1)
+  ROOT fusion = s32[1,1]{1,0} fusion(entry_arg0, entry_arg1), kind=kLoop, calls=fused_computation
+}
+)";
+
+  std::unique_ptr<Literal> lhs = Literal::CreateR3<int32>({{{1}, {2}}});
+  std::unique_ptr<Literal> rhs = Literal::CreateR3<int32>({{{3}, {4}}});
+  RunTest(hlo_text, {lhs.get(), rhs.get()});
+}
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/execution_tracker.cc b/tensorflow/compiler/xla/service/execution_tracker.cc
index 2f0b9ed2bd98fbea4e67c0a30d5aa41ff6a06979..6794cfe297b0fb9a15eb9b7e6906d225f9597d07 100644
--- a/tensorflow/compiler/xla/service/execution_tracker.cc
+++ b/tensorflow/compiler/xla/service/execution_tracker.cc
@@ -37,11 +37,11 @@ AsyncExecution::AsyncExecution(Backend* backend,
   }
 }
 
-tensorflow::Status AsyncExecution::BlockUntilDone() const {
+Status AsyncExecution::BlockUntilDone() const {
   for (auto& stream : streams_) {
     TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 ExecutionTracker::ExecutionTracker() : next_handle_(1) {}
@@ -61,7 +61,7 @@ ExecutionHandle ExecutionTracker::Register(
   return execution_handle;
 }
 
-tensorflow::Status ExecutionTracker::Unregister(const ExecutionHandle& handle) {
+Status ExecutionTracker::Unregister(const ExecutionHandle& handle) {
   tensorflow::mutex_lock lock(execution_mutex_);
   auto it = handle_to_execution_.find(handle.handle());
   if (it == handle_to_execution_.end()) {
@@ -69,7 +69,7 @@ tensorflow::Status ExecutionTracker::Unregister(const ExecutionHandle& handle) {
                     handle.handle());
   }
   handle_to_execution_.erase(handle.handle());
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 StatusOr<const AsyncExecution*> ExecutionTracker::Resolve(
diff --git a/tensorflow/compiler/xla/service/execution_tracker.h b/tensorflow/compiler/xla/service/execution_tracker.h
index 5b6bddf9f16a85f7863f4d05c39c7d4c99209af1..4458152dd9a98890fc3a3e7f324245ec68821467 100644
--- a/tensorflow/compiler/xla/service/execution_tracker.h
+++ b/tensorflow/compiler/xla/service/execution_tracker.h
@@ -43,7 +43,7 @@ class AsyncExecution {
   AsyncExecution(Backend* backend, std::vector<Backend::StreamPtr> streams,
                  const ExecutionProfile& profile, GlobalDataHandle result);
 
-  tensorflow::Status BlockUntilDone() const;
+  Status BlockUntilDone() const;
 
   const GlobalDataHandle& result() const { return result_; }
 
@@ -77,7 +77,7 @@ class ExecutionTracker {
                            GlobalDataHandle data);
 
   // Unregisters the execution for the given handle.
-  tensorflow::Status Unregister(const ExecutionHandle& handle);
+  Status Unregister(const ExecutionHandle& handle);
 
   // Resolves the given ExecutionHandle to an AsyncExecution. Returns an
   // error status if the given handle is not found, which means that the
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index ddb687314ee8221ba9282f230db498b3a5d23499..5ee67ccb4ae147683c7b41941670c6fc413a0d09 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -89,7 +89,7 @@ GenericTransferManager::TransferLiteralFromDevice(
 }
 
 Status GenericTransferManager::TransferLiteralToDevice(
-    se::StreamExecutor* executor, const Literal& literal,
+    se::StreamExecutor* executor, const LiteralSlice& literal,
     const ShapedBuffer& device_buffer) {
   const Shape& shape = literal.shape();
   VLOG(2) << "transferring literal shape to device: "
@@ -115,7 +115,7 @@ Status GenericTransferManager::TransferLiteralToDevice(
           TF_RET_CHECK(GetByteSizeRequirement(device_subshape) ==
                        device_memory.size());
           // Element is array-shaped: transfer array data to device buffer.
-          const auto subliteral = LiteralView::Create(literal, index);
+          const auto subliteral = LiteralSlice(literal, index);
           std::unique_ptr<Literal> relayed_out_literal;
           const void* source;
           if (LayoutUtil::Equal(device_subshape.layout(),
@@ -137,7 +137,7 @@ Status GenericTransferManager::TransferLiteralToDevice(
 }
 
 Status GenericTransferManager::TransferLiteralToInfeed(
-    se::StreamExecutor* executor, const Literal& literal) {
+    se::StreamExecutor* executor, const LiteralSlice& literal) {
   return Unimplemented("Generic transfer to Infeed");
 }
 
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 0579099de40ba3e43f69e4e6474b56691064c692..3da9570ef7eebcdf618439f628fb4d5589993e4f 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -45,11 +45,11 @@ class GenericTransferManager : public TransferManager {
       se::StreamExecutor* executor, const ShapedBuffer& device_buffer) override;
 
   Status TransferLiteralToDevice(se::StreamExecutor* executor,
-                                 const Literal& literal,
+                                 const LiteralSlice& literal,
                                  const ShapedBuffer& device_buffer) override;
 
   Status TransferLiteralToInfeed(se::StreamExecutor* executor,
-                                 const Literal& literal) override;
+                                 const LiteralSlice& literal) override;
   Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
                                     const Shape& literal_shape,
                                     Literal* literal) override;
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 7cb7f550730eeb562a6163cf5499ffaaf02d3327..4012f87f2bf69d1ab056da5d6c750441c7404980 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -291,6 +291,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/compiler/xla/service:tuple_points_to_analysis",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/platform/default/build_config:cublas_plugin",
         "//tensorflow/core/platform/default/build_config:cudnn_plugin",
@@ -388,8 +389,10 @@ cc_library(
     deps = [
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:instruction_fusion",
+        "//tensorflow/compiler/xla/service:pattern_matcher",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index 837f05244f7a8c71483cc30eeac9e1c219e6bbd2..ab5149dcdb09290cd0c0b2233029d0988a95f036 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -37,11 +37,11 @@ void BufferAllocations::Builder::RegisterBuffer(BufferAllocation::Index index,
 }
 
 StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
-    const BufferAssignment& buffer_assignment, int device_ordinal,
+    const BufferAssignment* buffer_assignment, int device_ordinal,
     DeviceMemoryAllocator* memory_allocator) {
-  const int64 num_buffers = buffer_assignment.Allocations().size();
-  auto buffer_allocations = WrapUnique(
-      new BufferAllocations(num_buffers, device_ordinal, memory_allocator));
+  const int64 num_buffers = buffer_assignment->Allocations().size();
+  auto buffer_allocations = WrapUnique(new BufferAllocations(
+      num_buffers, device_ordinal, memory_allocator, buffer_assignment));
 
   for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
     // If buffer #i's address is already registered (e.g. external arguments or
@@ -62,28 +62,28 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
 
     // Allocate each allocation that might escape, or is the temp buffer.
     bool seen_temp_buffer = false;
-    const BufferAllocation& allocation = buffer_assignment.GetAllocation(i);
+    const BufferAllocation& allocation = buffer_assignment->GetAllocation(i);
     if (allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer()) {
       const int64 buffer_size = allocation.size();
       se::DeviceMemoryBase buffer_address;
       if (buffer_size > 0) {
-        TF_ASSIGN_OR_RETURN(buffer_address, memory_allocator->Allocate(
-                                                device_ordinal, buffer_size));
-        if (buffer_address == nullptr) {
-          return ResourceExhausted(
-              "Out of memory when allocating %s for buffer %lld.",
-              tensorflow::strings::HumanReadableNumBytes(buffer_size).c_str(),
-              i);
-        }
-        if (reinterpret_cast<uintptr_t>(buffer_address.opaque()) %
+        OwningDeviceMemory buffer;
+        TF_ASSIGN_OR_RETURN(
+            buffer, memory_allocator->Allocate(device_ordinal, buffer_size));
+        if (reinterpret_cast<uintptr_t>(buffer.opaque()) %
                 kCudaMallocAlignBytes !=
             0) {
           return InternalError(
               "Address returned by memory_allocator->Allocate must be a "
               "multiple of %llx, but was %p",
-              kCudaMallocAlignBytes, buffer_address.opaque());
+              kCudaMallocAlignBytes, buffer.opaque());
         }
+        // We do manual memory management within BufferAllocations.  Be sure not
+        // to do a TF_RETURN_IF_ERROR between this line and the
+        // buffer_allocations->SetBuffer(buffer_address) call below!
+        buffer_address = buffer.Forget();
       }
+
       buffer_allocations->SetBuffer(i, buffer_address);
       if (allocation.IsPreallocatedTempBuffer()) {
         if (seen_temp_buffer) {
@@ -103,28 +103,42 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
               << "B)";
     }
   }
-
   return std::move(buffer_allocations);
 }
 
-tensorflow::Status BufferAllocations::TearDown(
-    const std::set<se::DeviceMemoryBase>& live_addresses,
-    const BufferAssignment& buffer_assignment) {
-  // Deallocate temporary buffers.
-  const int64 num_buffers = buffer_assignment.Allocations().size();
+BufferAllocations::~BufferAllocations() {
+  if (!torn_down_) {
+    // Presumably if we're executing this branch, the caller is in an error
+    // state, otherwise it would have explicitly called TearDown so it could
+    // save some set of live addresses.  So ignoring any errors in TearDown is
+    // sensible.
+    TearDown(/*live_addresses=*/{}).IgnoreError();
+  }
+}
+
+Status BufferAllocations::TearDown(
+    const std::set<se::DeviceMemoryBase>& live_addresses) {
+  // Deallocate temporary buffers, taking care to try to deallocate all of them
+  // even if one of the deallocations fails.
+  Status status;
+  const int64 num_buffers = buffer_assignment_->Allocations().size();
   for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
-    const BufferAllocation& allocation = buffer_assignment.GetAllocation(i);
+    const BufferAllocation& allocation = buffer_assignment_->GetAllocation(i);
     se::DeviceMemoryBase buffer_address = GetDeviceAddress(allocation.index());
     // Deallocate buffers marked "maybe_live_out" but aren't actually live out,
     // and temp buffers.
     if ((allocation.maybe_live_out() &&
          !live_addresses.count(buffer_address)) ||
         allocation.IsPreallocatedTempBuffer()) {
-      TF_RETURN_IF_ERROR(
-          memory_allocator_->Deallocate(device_ordinal_, &buffer_address));
+      auto dealloc_result =
+          memory_allocator_->Deallocate(device_ordinal_, buffer_address);
+      if (!dealloc_result.ok() && status.ok()) {
+        status = dealloc_result;
+      }
     }
   }
-  return tensorflow::Status::OK();
+  torn_down_ = true;
+  return status;
 }
 
 se::DeviceMemoryBase BufferAllocations::GetDeviceAddress(
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index c2fc35be4ca4bc6df85ee21fb6b564bfd6de3ec0..636623502597b3a66523938ba430e9d5a82f796c 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -48,13 +48,15 @@ class BufferAllocations {
     // `device_ordinal` is the number of the device this function allocates
     // memory on.
     StatusOr<std::unique_ptr<BufferAllocations>> Build(
-        const BufferAssignment& buffer_assignment, int device_ordinal,
+        const BufferAssignment* buffer_assignment, int device_ordinal,
         DeviceMemoryAllocator* memory_allocator);
 
    private:
     std::map<BufferAllocation::Index, se::DeviceMemoryBase> registered_buffers_;
   };
 
+  ~BufferAllocations();
+
   BufferAllocations(const BufferAllocations&) = delete;
   BufferAllocations& operator=(const BufferAllocations&) = delete;
 
@@ -76,16 +78,16 @@ class BufferAllocations {
 
   // Tears down all buffers allocated by this object that are not in
   // `live_addresses`.
-  tensorflow::Status TearDown(
-      const std::set<se::DeviceMemoryBase>& live_addresses,
-      const BufferAssignment& buffer_assignment);
+  Status TearDown(const std::set<se::DeviceMemoryBase>& live_addresses);
 
  private:
   BufferAllocations(BufferAllocation::Index buffer_count, int device_ordinal,
-                    DeviceMemoryAllocator* memory_allocator)
+                    DeviceMemoryAllocator* memory_allocator,
+                    const BufferAssignment* buffer_assignment)
       : buffers_(buffer_count),
         device_ordinal_(device_ordinal),
-        memory_allocator_(memory_allocator) {}
+        memory_allocator_(memory_allocator),
+        buffer_assignment_(buffer_assignment) {}
 
   // Sets the device address of buffer `buffer_index`.
   void SetBuffer(BufferAllocation::Index buffer_index,
@@ -100,8 +102,9 @@ class BufferAllocations {
   se::DeviceMemoryBase temp_buffer_base_;
 
   int device_ordinal_;
-
   DeviceMemoryAllocator* memory_allocator_;
+  const BufferAssignment* buffer_assignment_;
+  bool torn_down_ = false;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
index dce8de2e301ecfaa4674b8be48b8c02bfabf3f4b..77a48965e031349b045a956fd3f28c58607328e5 100644
--- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
@@ -35,9 +35,10 @@ ConditionalThunk::ConditionalThunk(
       true_thunk_(std::move(true_thunk_sequence), hlo),
       false_thunk_(std::move(false_thunk_sequence), hlo) {}
 
-Status ConditionalThunk::Initialize(const GpuExecutable& executable) {
-  TF_RETURN_IF_ERROR(true_thunk_.Initialize(executable));
-  TF_RETURN_IF_ERROR(false_thunk_.Initialize(executable));
+Status ConditionalThunk::Initialize(const GpuExecutable& executable,
+                                    se::StreamExecutor* executor) {
+  TF_RETURN_IF_ERROR(true_thunk_.Initialize(executable, executor));
+  TF_RETURN_IF_ERROR(false_thunk_.Initialize(executable, executor));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
index e40872688fdad24d24db5f7cebb3206c77652dce..ee03865d174469285a9e98b8a30fea90d997df37 100644
--- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
@@ -47,7 +47,8 @@ class ConditionalThunk : public Thunk {
   ConditionalThunk(const ConditionalThunk&) = delete;
   ConditionalThunk& operator=(const ConditionalThunk&) = delete;
 
-  Status Initialize(const GpuExecutable& executable) override;
+  Status Initialize(const GpuExecutable& executable,
+                    se::StreamExecutor* executor) override;
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
                          se::Stream* stream) override;
 
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 64d3b84b8c73d82800270aebcebf7f7a8fec5fe4..f0881124128c9b043392ffc4fa3aee2cd5b754c7 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -29,11 +29,6 @@ namespace xla {
 namespace gpu {
 
 using se::dnn::AlgorithmDesc;
-using se::dnn::BatchDescriptor;
-using se::dnn::ConvolutionDescriptor;
-using se::dnn::DataLayout;
-using se::dnn::FilterDescriptor;
-using se::dnn::FilterLayout;
 
 ConvolutionThunk::ConvolutionThunk(
     CudnnConvKind convolution_kind, const BufferAllocation::Slice& input_buffer,
diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
index bf912fbd14de5874062a79db28186ab233575233..ee38c0318a878c7bcdc02afdcd146bfb4498d9a2 100644
--- a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
@@ -29,12 +29,12 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk(
       destination_buffer_(destination_buffer),
       mem_size_(mem_size) {}
 
-tensorflow::Status HostToDeviceCopyThunk::ExecuteOnStream(
+Status HostToDeviceCopyThunk::ExecuteOnStream(
     const BufferAllocations& buffer_allocations, se::Stream* stream) {
   se::DeviceMemoryBase destination_data =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
   stream->ThenMemcpy(&destination_data, source_address_, mem_size_);
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk(
@@ -46,14 +46,14 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk(
       destination_buffer_(destination_buffer),
       mem_size_(mem_size) {}
 
-tensorflow::Status DeviceToDeviceCopyThunk::ExecuteOnStream(
+Status DeviceToDeviceCopyThunk::ExecuteOnStream(
     const BufferAllocations& buffer_allocations, se::Stream* stream) {
   se::DeviceMemoryBase destination_data =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
   se::DeviceMemoryBase source_data =
       buffer_allocations.GetDeviceAddress(source_buffer_);
   stream->ThenMemcpy(&destination_data, source_data, mem_size_);
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.h b/tensorflow/compiler/xla/service/gpu/copy_thunk.h
index 2e7eb5f3445bc9294fa455ef31fb816cdba4726c..8b128386f61636de9ac41e856a2b00c578e05735 100644
--- a/tensorflow/compiler/xla/service/gpu/copy_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.h
@@ -39,8 +39,8 @@ class HostToDeviceCopyThunk : public Thunk {
   HostToDeviceCopyThunk(const HostToDeviceCopyThunk&) = delete;
   HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete;
 
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream) override;
 
  private:
   const void* source_address_;
@@ -62,8 +62,8 @@ class DeviceToDeviceCopyThunk : public Thunk {
   DeviceToDeviceCopyThunk(const DeviceToDeviceCopyThunk&) = delete;
   DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete;
 
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream) override;
 
  private:
   const BufferAllocation::Slice source_buffer_;
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index c4c56c56928810d043085f284cda351391195c3b..6a46bdb9b438f81dc564b9033f5d302f90b6a997 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -35,35 +35,22 @@ class ScratchAllocator : public se::ScratchAllocator {
   ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
       : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
 
-  ~ScratchAllocator() override;
-
   int64 GetMemoryLimitInBytes(se::Stream* stream) override {
     return 1LL << 32;  // 4GB.  TODO(jlebar): Tune this?
   }
   int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
 
-  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
-      se::Stream* stream, int64 byte_size) override;
+  StatusOr<se::DeviceMemory<uint8>> AllocateBytes(se::Stream* stream,
+                                                  int64 byte_size) override;
 
  private:
   const int device_ordinal_;
   DeviceMemoryAllocator* memory_allocator_;
-  std::vector<se::DeviceMemoryBase> allocated_buffers_;
+  std::vector<OwningDeviceMemory> allocated_buffers_;
   int64 total_allocated_bytes_ = 0;
 };
 
-ScratchAllocator::~ScratchAllocator() {
-  for (auto& allocated_buffer : allocated_buffers_) {
-    if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer)
-             .ok()) {
-      // The program can still continue with failed deallocation.
-      LOG(ERROR) << "Failed to deallocate the allocated buffer: "
-                 << allocated_buffer.opaque();
-    }
-  }
-}
-
-se::port::StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
+StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
     se::Stream* stream, int64 byte_size) {
   CHECK_GE(byte_size, 0) << "byte_size must be positive.";
   if (byte_size > GetMemoryLimitInBytes(stream)) {
@@ -74,19 +61,14 @@ se::port::StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
             byte_size, GetMemoryLimitInBytes(stream)));
   }
 
-  auto status_or_memory =
-      memory_allocator_->Allocate(device_ordinal_, byte_size,
-                                  /*retry_on_failure=*/false);
-  if (!status_or_memory.ok()) {
-    return se::port::Status(se::port::error::RESOURCE_EXHAUSTED,
-                            tensorflow::strings::Printf(
-                                "Failed to allocate %lld bytes on device %d.",
-                                byte_size, device_ordinal_));
-  }
-  se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie();
-  allocated_buffers_.push_back(allocated_buffer);
+  TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
+                      memory_allocator_->Allocate(device_ordinal_, byte_size,
+                                                  /*retry_on_failure=*/false));
   total_allocated_bytes_ += byte_size;
-  return se::DeviceMemory<uint8>(allocated_buffer);
+
+  se::DeviceMemoryBase buffer_addr = allocated_buffer.AsDeviceMemoryBase();
+  allocated_buffers_.push_back(std::move(allocated_buffer));
+  return se::DeviceMemory<uint8>(buffer_addr);
 }
 
 // Determines whether we can safely perform a winograd non-fused convolution for
@@ -197,22 +179,42 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
   // We don't put any data in these buffers, because (in theory, anyway) the
   // speed of a conv isn't affected by the data being convolved.
   ScratchAllocator input_output_allocator(device_ordinal, allocator);
-  se::port::StatusOr<DeviceMemoryBase> input_buf =
+  StatusOr<DeviceMemoryBase> maybe_input_buf =
       input_output_allocator.AllocateBytes(&stream,
                                            ShapeUtil::ByteSizeOf(input_shape));
-  se::port::StatusOr<DeviceMemoryBase> filter_buf =
+  StatusOr<DeviceMemoryBase> maybe_filter_buf =
       input_output_allocator.AllocateBytes(&stream,
                                            ShapeUtil::ByteSizeOf(filter_shape));
-  se::port::StatusOr<DeviceMemoryBase> output_buf =
+  StatusOr<DeviceMemoryBase> maybe_output_buf =
       input_output_allocator.AllocateBytes(&stream,
                                            ShapeUtil::ByteSizeOf(output_shape));
-  if (!input_buf.ok() || !filter_buf.ok() || !output_buf.ok()) {
+  if (!maybe_input_buf.ok() || !maybe_filter_buf.ok() ||
+      !maybe_output_buf.ok()) {
     LOG(WARNING)
         << "Couldn't allocate space for input/filter/output of convolution "
         << instr->ToString() << ".  Falling back to default algorithm.";
     return nullopt;
   }
 
+  DeviceMemoryBase input_buf = maybe_input_buf.ValueOrDie();
+  DeviceMemoryBase filter_buf = maybe_filter_buf.ValueOrDie();
+  DeviceMemoryBase output_buf = maybe_output_buf.ValueOrDie();
+
+  // Although we don't have evidence this matters, zero out the buffers before
+  // autotuning.  It's conceivable that using uninitialized memory as the inputs
+  // might affect performance if e.g. the inputs contain denormals, and this is
+  // easy enough.
+  if (!stream.ThenMemZero(&input_buf, input_buf.size())
+           .ThenMemZero(&filter_buf, filter_buf.size())
+           .ThenMemZero(&output_buf, output_buf.size())
+           .BlockHostUntilDone()
+           .ok()) {
+    LOG(WARNING)
+        << "Couldn't zero out input/filter/output buffer for convolution "
+        << instr->ToString() << ".  Falling back to default algorithm.";
+    return nullopt;
+  }
+
   const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo(
       input_shape, output_shape, dnums, stream_exec_);
   se::dnn::ProfileResult best_result;
@@ -225,12 +227,12 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
     VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
             << instr->ToString();
 
-    bool launch_ok = RunCudnnConvolution(
-                         kind, input_shape, filter_shape, output_shape,
-                         input_buf.ValueOrDie(), filter_buf.ValueOrDie(),
-                         output_buf.ValueOrDie(), &scratch_allocator, window,
-                         dnums, AlgorithmConfig(alg), &stream, &profile_result)
-                         .ok();
+    bool launch_ok =
+        RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                            input_buf, filter_buf, output_buf,
+                            &scratch_allocator, window, dnums,
+                            AlgorithmConfig(alg), &stream, &profile_result)
+            .ok();
 
     if (launch_ok && profile_result.is_valid()) {
       int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 5af7a77ea858563fbea05af8efd54f96a74aee93..e5e2a0478a0659986ddec8d6785827b14b9efb56 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -227,6 +227,11 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLog(
   return EmitLibdeviceMathCall("__nv_log", {value}, {prim_type}, prim_type);
 }
 
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLog1p(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  return EmitLibdeviceMathCall("__nv_log1p", {value}, {prim_type}, prim_type);
+}
+
 StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitSin(
     PrimitiveType prim_type, llvm::Value* value) const {
   return EmitLibdeviceMathCall("__nv_sin", {value}, {prim_type}, prim_type);
@@ -242,6 +247,11 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitExp(
   return EmitLibdeviceMathCall("__nv_exp", {value}, {prim_type}, prim_type);
 }
 
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitExpm1(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  return EmitLibdeviceMathCall("__nv_expm1", {value}, {prim_type}, prim_type);
+}
+
 StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitPow(PrimitiveType prim_type,
                                                       llvm::Value* lhs,
                                                       llvm::Value* rhs) const {
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
index 77d4569b1e8e398005e8f517ff086a77aedd382d..91f4d960aa62fff3e0699ece37a8c74d7dcf2f59 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
@@ -64,6 +64,9 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
   StatusOr<llvm::Value*> EmitLog(PrimitiveType prim_type,
                                  llvm::Value* value) const override;
 
+  StatusOr<llvm::Value*> EmitLog1p(PrimitiveType prim_type,
+                                   llvm::Value* value) const override;
+
   StatusOr<llvm::Value*> EmitSin(PrimitiveType prim_type,
                                  llvm::Value* value) const override;
 
@@ -73,6 +76,9 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
   StatusOr<llvm::Value*> EmitExp(PrimitiveType prim_type,
                                  llvm::Value* value) const override;
 
+  StatusOr<llvm::Value*> EmitExpm1(PrimitiveType prim_type,
+                                   llvm::Value* value) const override;
+
   StatusOr<llvm::Value*> EmitPow(PrimitiveType prim_type, llvm::Value* lhs,
                                  llvm::Value* rhs) const override;
 
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
index cc747addbd152eb82b0b2ef92b8653fc861f97be..e14ee6918bf148861ecccac99355fccf7ae93103 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
@@ -31,23 +31,12 @@ FftScratchAllocator::FftScratchAllocator(
     int device_ordinal, DeviceMemoryAllocator* memory_allocator)
     : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
 
-FftScratchAllocator::~FftScratchAllocator() {
-  for (auto& allocated_buffer : allocated_buffers_) {
-    if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer)
-             .ok()) {
-      // The program can still continue with failed deallocation.
-      LOG(ERROR) << "Failed to deallocate the allocated buffer: "
-                 << allocated_buffer.opaque();
-    }
-  }
-}
-
 int64 FftScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) {
   constexpr int64 kFftScratchSize = 1LL << 32;  // 4GB by default.
   return kFftScratchSize;
 }
 
-se::port::StatusOr<se::DeviceMemory<uint8>> FftScratchAllocator::AllocateBytes(
+StatusOr<se::DeviceMemory<uint8>> FftScratchAllocator::AllocateBytes(
     se::Stream* stream, int64 byte_size) {
   CHECK_GE(byte_size, 0) << "byte_size must be positive.";
   if (byte_size > GetMemoryLimitInBytes(stream)) {
@@ -58,18 +47,14 @@ se::port::StatusOr<se::DeviceMemory<uint8>> FftScratchAllocator::AllocateBytes(
             byte_size, GetMemoryLimitInBytes(stream)));
   }
 
-  auto status_or_memory =
-      memory_allocator_->Allocate(device_ordinal_, byte_size,
-                                  /*retry_on_failure=*/false);
-  if (!status_or_memory.ok()) {
-    return tensorflow::errors::ResourceExhausted(
-        "Failed to allocate %lld bytes on device %d.", byte_size,
-        device_ordinal_);
-  }
-  se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie();
-  allocated_buffers_.push_back(allocated_buffer);
+  TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
+                      memory_allocator_->Allocate(device_ordinal_, byte_size,
+                                                  /*retry_on_failure=*/false));
   total_allocated_bytes_ += byte_size;
-  return se::DeviceMemory<uint8>(allocated_buffer);
+
+  se::DeviceMemoryBase buffer_addr = allocated_buffer.AsDeviceMemoryBase();
+  allocated_buffers_.push_back(std::move(allocated_buffer));
+  return se::DeviceMemory<uint8>(buffer_addr);
 }
 
 namespace {
@@ -121,8 +106,8 @@ FftThunk::FftThunk(FftType fft_type,
       input_shape_(input_shape),
       output_shape_(output_shape) {}
 
-tensorflow::Status FftThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+Status FftThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                                 se::Stream* stream) {
   VLOG(3) << "FFT type: " << FftTypeToString(fft_type_);
   VLOG(3) << "Input shape: " << ShapeUtil::HumanStringWithLayout(input_shape_);
   VLOG(3) << "Output shape: "
@@ -222,7 +207,7 @@ tensorflow::Status FftThunk::ExecuteOnStream(
       LOG(FATAL) << "unsupported fft type";
   }
   if (launch_ok) {
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
   return InternalError("Unable to launch fft for thunk %p with type %s", this,
                        FftTypeToString(fft_type_).c_str());
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
index 24b1dca99865fe21d0ca3af91e0d169f7b74a78a..b0a22564f3a09bb67a3c01723f6e37c604656d45 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
@@ -39,8 +39,6 @@ class FftScratchAllocator : public se::ScratchAllocator {
   FftScratchAllocator(int device_ordinal,
                       DeviceMemoryAllocator* memory_allocator);
 
-  ~FftScratchAllocator() override;
-
   int64 GetMemoryLimitInBytes(se::Stream* stream) override;
 
   int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
@@ -51,7 +49,7 @@ class FftScratchAllocator : public se::ScratchAllocator {
  private:
   const int device_ordinal_;
   DeviceMemoryAllocator* memory_allocator_;
-  std::vector<se::DeviceMemoryBase> allocated_buffers_;
+  std::vector<OwningDeviceMemory> allocated_buffers_;
   int64 total_allocated_bytes_ = 0;
 };
 
@@ -73,8 +71,8 @@ class FftThunk : public Thunk {
   FftThunk& operator=(const FftThunk&) = delete;  // Cannot share fft_plan_
 
   // Does the FFT for the thunk on "stream".
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream) override;
 
  private:
   const se::fft::Type fft_type_;
diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc
index 6e6966df3987eef29b2122c3ef8f11b7cd0bfe14..b36539e0cb8d0a2f4758dd90acbdd8fc7181b8ca 100644
--- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc
@@ -30,19 +30,20 @@ ForThunk::ForThunk(const int64 loop_limit,
       body_thunk_sequence_(
           MakeUnique<SequentialThunk>(std::move(*body_thunk_sequence), hlo)) {}
 
-tensorflow::Status ForThunk::Initialize(const GpuExecutable& executable) {
-  TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable));
-  return tensorflow::Status::OK();
+Status ForThunk::Initialize(const GpuExecutable& executable,
+                            se::StreamExecutor* executor) {
+  TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor));
+  return Status::OK();
 }
 
-tensorflow::Status ForThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+Status ForThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                                 se::Stream* stream) {
   for (int64 i = 0; i < loop_limit_; ++i) {
     // Invoke loop body thunk sequence.
     TF_RETURN_IF_ERROR(
         body_thunk_sequence_->ExecuteOnStream(buffer_allocations, stream));
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h
index c78d1c50686297aea8235af928aba562697f49bc..41ddfe0ceb1d0516c1c64feca53212a925632209 100644
--- a/tensorflow/compiler/xla/service/gpu/for_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h
@@ -36,9 +36,10 @@ class ForThunk : public Thunk {
   ForThunk(const ForThunk&) = delete;
   ForThunk& operator=(const ForThunk&) = delete;
 
-  tensorflow::Status Initialize(const GpuExecutable& executable) override;
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
+  Status Initialize(const GpuExecutable& executable,
+                    se::StreamExecutor* executor) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream) override;
 
  private:
   const int64 loop_limit_;
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index 0ec12f52d8b398218ec370fc74bfdf6f97f43893..79fca43d022816645b8a07b9e806fe9cc3745e7c 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -215,14 +215,32 @@ se::blas::ComputationType GetBlasComputationType(PrimitiveType type) {
   }
 }
 
+DotDimensionNumbers GetDimensionNumbers(const HloInstruction& hlo_instruction) {
+  if (hlo_instruction.opcode() == HloOpcode::kDot) {
+    return hlo_instruction.dot_dimension_numbers();
+  }
+  CHECK_EQ(hlo_instruction.opcode(), HloOpcode::kFusion);
+  CHECK_EQ(hlo_instruction.fusion_kind(), HloInstruction::FusionKind::kOutput);
+  CHECK_EQ(hlo_instruction.fused_expression_root()->opcode(),
+           HloOpcode::kMultiply);
+  // Try to find the dot inside the output fusion node.
+  const HloInstruction* dot =
+      hlo_instruction.fused_expression_root()->operand(0);
+  if (dot->opcode() != HloOpcode::kDot) {
+    dot = hlo_instruction.fused_expression_root()->operand(1);
+  }
+  CHECK_EQ(dot->opcode(), HloOpcode::kDot);
+
+  return dot->dot_dimension_numbers();
+}
+
 }  // namespace
 
 GemmThunk::GemmThunk(const BufferAllocation::Slice& lhs_buffer,
                      const BufferAllocation::Slice& rhs_buffer,
                      const BufferAllocation::Slice& output_buffer,
                      const Shape& lhs_shape, const Shape& rhs_shape,
-                     const Shape& output_shape, bool transpose_lhs,
-                     bool transpose_rhs, double alpha,
+                     const Shape& output_shape, double alpha,
                      const HloInstruction* hlo_instruction)
     : Thunk(Kind::kGemm, hlo_instruction),
       lhs_buffer_(lhs_buffer),
@@ -231,12 +249,10 @@ GemmThunk::GemmThunk(const BufferAllocation::Slice& lhs_buffer,
       lhs_shape_(lhs_shape),
       rhs_shape_(rhs_shape),
       output_shape_(output_shape),
-      transpose_lhs_(transpose_lhs),
-      transpose_rhs_(transpose_rhs),
       alpha_(alpha) {}
 
-tensorflow::Status GemmThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                                  se::Stream* stream) {
   VLOG(2) << "Executing a GemmThunk";
 
   se::DeviceMemoryBase lhs_data =
@@ -284,10 +300,12 @@ tensorflow::Status GemmThunk::ExecuteOnStream(
                             shape.dimensions(!is_row_major));
   };
 
-  const MatrixDescriptor lhs_descriptor =
-      make_descriptor(lhs_data, lhs_shape_, transpose_lhs_);
-  const MatrixDescriptor rhs_descriptor =
-      make_descriptor(rhs_data, rhs_shape_, transpose_rhs_);
+  DotDimensionNumbers dim_nums = GetDimensionNumbers(*hlo_instruction());
+
+  const MatrixDescriptor lhs_descriptor = make_descriptor(
+      lhs_data, lhs_shape_, dim_nums.lhs_contracting_dimensions(0) == 0);
+  const MatrixDescriptor rhs_descriptor = make_descriptor(
+      rhs_data, rhs_shape_, dim_nums.rhs_contracting_dimensions(0) == 1);
 
   // Dispatches to a regular cublas gemm, a gemm-with-algorithm, or attempts to
   // autotune this gemm to figure out the best algorithm.
@@ -350,7 +368,7 @@ tensorflow::Status GemmThunk::ExecuteOnStream(
   if (!launch_ok) {
     return InternalError("Unable to launch cuBLAS gemm on stream %p", stream);
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
index a18f425bc38fd3fbbb345901514c4ac16dbe97ec..7a4830d64e7caef5a1170cbdbf8ab373fdaf16e2 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
@@ -35,22 +35,20 @@ namespace gpu {
 class GemmThunk : public Thunk {
  public:
   // Constructs a thunk that computes "output = (lhs <dot> rhs) * alpha" using
-  // BLAS gemm. transpose_lhs and transpose_rhs indicate whether gemm should
-  // transpose the lhs and rhs operand. hlo_instruction is as in Thunk. alpha is
-  // a constant.
+  // BLAS gemm.  hlo_instruction is as in Thunk. alpha is a constant.
   GemmThunk(const BufferAllocation::Slice& lhs_buffer,
             const BufferAllocation::Slice& rhs_buffer,
             const BufferAllocation::Slice& output_buffer,
             const Shape& lhs_shape, const Shape& rhs_shape,
-            const Shape& output_shape, bool transpose_lhs, bool transpose_rhs,
-            double alpha, const HloInstruction* hlo_instruction);
+            const Shape& output_shape, double alpha,
+            const HloInstruction* hlo_instruction);
 
   GemmThunk(const GemmThunk&) = delete;
   GemmThunk& operator=(const GemmThunk&) = delete;
 
   // Does the gemm operation for the thunk on "stream", which must be non-null.
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream) override;
 
   // Returns true if we'll perform autotuning if run on the given stream.  If
   // so, we want the GPU to be quiescent during autotuning, so as not to
@@ -69,8 +67,6 @@ class GemmThunk : public Thunk {
   const Shape rhs_shape_;
   const Shape output_shape_;
 
-  const bool transpose_lhs_;
-  const bool transpose_rhs_;
   const double alpha_;
 
   // Maps device names (StreamExecutor::DeviceDescription::name()) to autotune
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 4fdc4c89618bc0f179b2332373cb2fd3cf637390..d50153d8a31077e759bd6104d5bca8868a554fde 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -128,9 +128,8 @@ string GetLibdeviceDir(const string& config_cuda_data_dir) {
 }
 
 // Runs optimization passes on the given HLO module.
-tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
-                                     se::StreamExecutor* stream_exec,
-                                     DeviceMemoryAllocator* device_allocator) {
+Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
+                         DeviceMemoryAllocator* device_allocator) {
   {
     HloPassPipeline pipeline("optimization");
     pipeline.AddInvariantChecker<HloVerifier>();
@@ -248,7 +247,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
   {
     HloPassPipeline pipeline("layout_assignment");
     pipeline.AddPass<GpuLayoutAssignment>(
-        hlo_module->device_entry_computation_layout());
+        hlo_module->mutable_device_entry_computation_layout());
 
     // The LayoutAssignment pass may leave behind kCopy instructions which are
     // duplicate or NOPs, so remove them with algebraic simplification and CSE.
@@ -283,12 +282,12 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
       TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
     }
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 // Modifies the given HLO module so that it will be accepted by IrEmitter.
 // Unlike optimization passes, the passes are necessary for correctness.
-tensorflow::Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
+Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
   // In some cases, we have to place the result of an instruction in a temporary
   // buffer. For instance, the buffer that holds an external parameter is
   // assumed immutable at this point, and should not be reused for output
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
index 9db85bc788bde46c890a46ce9b0902ddce3f5675..d9560779f313d5a559c3eb0f5b28ff5dd210d9d5 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
@@ -84,8 +84,13 @@ StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
       for (int64 i = 0; i < hlo->operand_count() - 2; ++i) {
         TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
       }
-    } else if (ImplementedAsLibraryCall(*hlo)) {
-      // For all other library calls, materialize all the operands into memory.
+    } else if (ImplementedAsLibraryCall(*hlo) ||
+               hlo->opcode() == HloOpcode::kCrossReplicaSum) {
+      // For all other library calls and cross-replica-sum, materialize all the
+      // operands into memory.  (Cross-replica-sum gets its constant args
+      // materialized even if it's not implemented as a libcall to simplify the
+      // implementation.  It's slower, but we can constant fold away constant
+      // args *anyway*, so we just need to make it work.)
       for (int64 i = 0; i < hlo->operand_count(); ++i) {
         TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
       }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 980cc89fa03abd874a8e0a694f2abb775c1de050..25d8f720ea4791a4c94efcad6909cd0c113fbe70 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -32,12 +32,15 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace gpu {
 namespace {
 
+using tensorflow::tracing::ScopedAnnotation;
+
 // A helper class for profiling HLO in the course of GPU program execution.
 // All of the profiling is guarded internally, to avoid the caller needing to
 // have lots of conditionals sprinkled around.
@@ -134,9 +137,10 @@ Status GpuExecutable::ExecuteThunks(
     const BufferAllocations& buffer_allocations, bool block_host_until_done,
     HloExecutionProfile* hlo_execution_profile) {
   se::Stream* main_stream = run_options->stream();
+  se::StreamExecutor* executor = main_stream->parent();
 
   std::pair<int, int> stream_compute_compatibility;
-  main_stream->parent()->GetDeviceDescription().cuda_compute_capability(
+  executor->GetDeviceDescription().cuda_compute_capability(
       &stream_compute_compatibility.first,
       &stream_compute_compatibility.second);
   TF_RET_CHECK(stream_compute_compatibility == compute_capability_)
@@ -155,21 +159,39 @@ Status GpuExecutable::ExecuteThunks(
   sub_streams.reserve(thunk_schedule_->StreamCount() - 1);
   while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
     sub_streams.emplace_back();
-    TF_ASSIGN_OR_RETURN(
-        sub_streams.back(),
-        run_options->BorrowStream(main_stream->parent()->device_ordinal()));
+    TF_ASSIGN_OR_RETURN(sub_streams.back(),
+                        run_options->BorrowStream(executor->device_ordinal()));
   }
 
   HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
                                 sub_streams, hlo_module_->entry_computation());
   uint64 start_micros = tensorflow::Env::Default()->NowMicros();
 
-  // The next event enqueued on stream N must not run until the thunk at
-  // last_blocking_thunk_for_stream[N] completes.
-  std::map<int32, const Thunk*> last_blocking_thunk_for_stream;
+  // This top-level trace serves two purposes:
+  //  1) It marks the scope of the whole XLA module.
+  //  2) It tells us whether tracing is enabled.  We use this to avoid the
+  //     expensive HloInstruction::ToString() calls inside the loop below if
+  //     tracing is disabled.
+  ScopedAnnotation top_level_annotation(hlo_module_->name(), "XLA GPU module");
+
   std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
   for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
-    TF_RETURN_IF_ERROR(thunk->Initialize(*this));
+    // Annotate execution of this op if tracing was enabled when we started
+    // running this module.  If tracing is enabled *while* we're running the
+    // module, we won't get any data, but that's probably an OK trade-off.
+    //
+    // TODO(jlebar): Should we cache the results of HloInstruction::ToString(),
+    // since we expect it to be an expensive call?
+    tensorflow::gtl::optional<ScopedAnnotation> op_annotation;
+    if (top_level_annotation.IsEnabled()) {
+      op_annotation.emplace(
+          thunk->hlo_instruction() != nullptr
+              ? thunk->hlo_instruction()->ToString(HloPrintOptions::Canonical())
+              : "<unknown>",
+          "XLA op");
+    }
+
+    TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
     int32 stream_no =
         thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction());
     se::Stream* stream =
@@ -179,18 +201,10 @@ Status GpuExecutable::ExecuteThunks(
       stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
     }
 
-    if (last_blocking_thunk_for_stream.count(stream_no)) {
-      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event,
-                                    last_blocking_thunk_for_stream[stream_no])
-                              .get());
-      last_blocking_thunk_for_stream.erase(stream_no);
-    }
-
     // If this thunk requests it, wait for all currently-executing thunks to
     // finish.  This is useful e.g. if the thunk is about to perform autotuning.
     if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) {
       TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone());
-      last_blocking_thunk_for_stream.clear();
     }
 
     profiler.StartOperation();
@@ -198,22 +212,11 @@ Status GpuExecutable::ExecuteThunks(
             << thunk->hlo_instruction()->ToString() << " on stream "
             << stream_no;
     TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
-    if (thunk_schedule_->Depended(thunk) || thunk->ShouldBlockFutureThunks()) {
+    if (thunk_schedule_->Depended(thunk)) {
       auto finish_event = MakeUnique<se::Event>(main_stream->parent());
       finish_event->Init();
       stream->ThenRecordEvent(finish_event.get());
       thunk_to_finish_event[thunk] = std::move(finish_event);
-
-      if (thunk->ShouldBlockFutureThunks()) {
-        // Set last_blocking_thunk_for_stream on all streams other than this one
-        // so that all other streams will wait for this thunk to complete before
-        // executing any events that occur later in the total order.
-        for (int32 i = 0; i < sub_streams.size() + 1; ++i) {
-          if (i != stream_no) {
-            last_blocking_thunk_for_stream[i] = thunk;
-          }
-        }
-      }
     }
     profiler.FinishOperation(thunk->hlo_instruction());
   }
@@ -286,8 +289,8 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
   se::StreamExecutor* executor = run_options->stream()->parent();
   TF_ASSIGN_OR_RETURN(
       auto buffer_allocations,
-      buffer_allocations_builder.Build(*assignment_, executor->device_ordinal(),
-                                       memory_allocator));
+      buffer_allocations_builder.Build(
+          assignment_.get(), executor->device_ordinal(), memory_allocator));
 
   bool block_host_until_done =
       !memory_allocator->AllowsAsynchronousDeallocation();
@@ -329,8 +332,7 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
         buffers_in_result.insert(src_base);
         return Status::OK();
       }));
-  TF_RETURN_IF_ERROR(
-      buffer_allocations->TearDown(buffers_in_result, *assignment_));
+  TF_RETURN_IF_ERROR(buffer_allocations->TearDown(buffers_in_result));
 
   return std::move(shaped_buffer);
 }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
index 51aae79c3d8d0000007f9d2926d245de838d3aca..86a3a7111fd79494e469beecf3234f6cec9adb9c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
@@ -27,8 +27,7 @@ namespace gpu {
 // layout constraints for operands and results of library calls.
 class GpuLayoutAssignment : public LayoutAssignment {
  public:
-  explicit GpuLayoutAssignment(
-      const ComputationLayout& entry_computation_layout)
+  explicit GpuLayoutAssignment(ComputationLayout* entry_computation_layout)
       : LayoutAssignment(entry_computation_layout) {}
   ~GpuLayoutAssignment() override {}
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index 7c801955943021def4ddc0accd9f318b7916ce93..4c45d2e94aebce5496da94841f6a1ae9015615c1 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -69,7 +69,7 @@ TEST_F(LayoutAssignmentTest, Elementwise) {
         *computation_layout.mutable_result_layout() =
             ShapeLayout(result_shape_with_layout);
 
-        GpuLayoutAssignment layout_assignment(computation_layout);
+        GpuLayoutAssignment layout_assignment(&computation_layout);
         EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
 
         for (const HloInstruction* operand : add->operands()) {
@@ -156,7 +156,7 @@ TEST_F(LayoutAssignmentTest, BatchNormInference) {
         *computation_layout.mutable_result_layout() = ShapeLayout(result_shape);
       }
 
-      GpuLayoutAssignment layout_assignment(computation_layout);
+      GpuLayoutAssignment layout_assignment(&computation_layout);
       EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
 
       // The first operand to batchnorm should have the same layout as the
@@ -225,7 +225,7 @@ TEST_F(LayoutAssignmentTest, BatchNormTraining) {
                 {result_shape, offset_scale_shape, offset_scale_shape}));
       }
 
-      GpuLayoutAssignment layout_assignment(computation_layout);
+      GpuLayoutAssignment layout_assignment(&computation_layout);
       EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
 
       // The first operand to batchnorm should have the same layout as the
@@ -305,7 +305,7 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) {
                   {result_shape, scale_shape, scale_shape}));
         }
 
-        GpuLayoutAssignment layout_assignment(computation_layout);
+        GpuLayoutAssignment layout_assignment(&computation_layout);
         EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
 
         // The first and fourth operands to the batchnorm call should have the
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index f13727ca9b6954f6be9b9277018fcc64ee326954..7bb8df6581b49b1bf8c84a972f715e8dc119d8de 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -44,8 +44,8 @@ GpuTransferManager::GpuTransferManager()
           /*pointer_size=*/llvm::DataLayout(gpu::GpuCompiler::kDataLayout)
               .getPointerSize(0 /* default address space */)) {}
 
-Status GpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
-                                                   const Literal& literal) {
+Status GpuTransferManager::TransferLiteralToInfeed(
+    se::StreamExecutor* executor, const LiteralSlice& literal) {
   const Shape& shape = literal.shape();
   VLOG(2) << "Transferring literal to infeed with shape: "
           << ShapeUtil::HumanString(shape);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
index d040a99975230578c270deabdfe60c61649e778c..09f8227f508a3159f3def285898e15bfad544552 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
@@ -37,7 +37,7 @@ class GpuTransferManager : public GenericTransferManager {
   ~GpuTransferManager() override {}
 
   Status TransferLiteralToInfeed(se::StreamExecutor* executor,
-                                 const Literal& literal) override;
+                                 const LiteralSlice& literal) override;
   Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
                                 const void* source) override;
 
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
index ece9fa04dce3fd12713fb7e58097dc16ebba83df..e230d538cc2df826778e8d13eaaaf31ec81c57f0 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
@@ -42,6 +42,15 @@ class HloScheduleTest : public HloTestBase {
         .ConsumeValueOrDie();
   }
 
+  std::unique_ptr<HloModule> CreateNewModule() {
+    HloModuleConfig config;
+    auto debug_options = GetDebugOptionsForTest();
+    debug_options.set_xla_gpu_disable_multi_streaming(false);
+    config.set_debug_options(debug_options);
+    return MakeUnique<HloModule>("test_module", VersionedComputationHandle(),
+                                 config);
+  }
+
   HloVec RemoveHlo(const HloVec& input,
                    const std::unordered_set<const HloInstruction*>& remove) {
     HloVec result(input);
@@ -65,9 +74,9 @@ TEST_F(HloScheduleTest, SequentialMatMul) {
   HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter(
       /*parameter_number=*/2, f32_2x2_, /*name=*/"z"));
   HloInstruction* dot1 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, x, y));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, x, y));
   HloInstruction* dot2 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, dot1, z));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, dot1, z));
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build(dot2));
@@ -193,11 +202,11 @@ TEST_F(HloScheduleTest, ConcurrentMatMul) {
   HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
       /*parameter_number=*/1, f32_2x2_, /*name=*/"y"));
   HloInstruction* dot1 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, x, y));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, x, y));
   HloInstruction* dot2 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, y, x));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, y, x));
   HloInstruction* add = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, dot1, dot2));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, dot1, dot2));
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build(add));
@@ -259,24 +268,24 @@ TEST_F(HloScheduleTest, LatticeMatMul) {
     params.push_back(builder.AddInstruction(HloInstruction::CreateParameter(
         i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i))));
   }
-  HloInstruction* d00 = builder.AddInstruction(HloInstruction::CreateBinary(
-      f32_2x2_, HloOpcode::kDot, params[2], params[3]));
+  HloInstruction* d00 = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(f32_2x2_, params[2], params[3]));
   HloInstruction* d10 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, params[1], d00));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, params[1], d00));
   HloInstruction* d11 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d00, params[4]));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d00, params[4]));
   HloInstruction* d20 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, params[0], d10));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, params[0], d10));
   HloInstruction* d21 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d10, d11));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d10, d11));
   HloInstruction* d22 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d11, params[5]));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d11, params[5]));
   HloInstruction* d30 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d20, d21));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d20, d21));
   HloInstruction* d31 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d21, d22));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d21, d22));
   HloInstruction* d40 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d30, d31));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d30, d31));
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build(d40));
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 85ecbe8fdb34700ca738b99ddd9ea615afc35da3..5d5bef6b57b57fce4255a145634745b38dccacc7 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -17,7 +17,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
@@ -46,41 +48,100 @@ bool IsFusile(const HloInstruction& hlo) {
          hlo.opcode() == HloOpcode::kTranspose;
 }
 
+bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) {
+  if (constant->opcode() != HloOpcode::kConstant ||
+      !ShapeUtil::IsScalar(constant->shape())) {
+    return false;
+  }
+  auto type = constant->shape().element_type();
+  return type == F16 || type == F32 || type == F64;
+}
+
 }  // namespace
 
+/*static*/ bool GpuInstructionFusion::IsExpensive(
+    const HloInstruction& instruction) {
+  switch (instruction.opcode()) {
+    // We say that floating-point division is cheap on the GPU.
+    case HloOpcode::kDivide:
+      return !ShapeUtil::ElementIsFloating(instruction.shape()) &&
+             InstructionFusion::IsExpensive(instruction);
+
+    default:
+      return InstructionFusion::IsExpensive(instruction);
+  }
+}
+
 bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
                                       int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
 
   // Check if we can use output fusion for (A @ B) * alpha
-  if (producer->opcode() == HloOpcode::kDot) {
-    if (consumer->opcode() == HloOpcode::kMultiply) {
-      CHECK_EQ(consumer->operand_count(), 2);
-      int64 other_operand_index = 1 - operand_index;
-      const HloInstruction* alpha = consumer->operand(other_operand_index);
-      if (alpha->opcode() == HloOpcode::kConstant &&
-          ShapeUtil::IsScalar(alpha->shape())) {
+  if (consumer->operand_count() == 2 &&
+      (producer->opcode() == HloOpcode::kDot ||
+       (producer->opcode() == HloOpcode::kFusion &&
+        producer->fused_expression_root()->opcode() == HloOpcode::kDot))) {
+    int64 other_operand_index = 1 - operand_index;
+    const HloInstruction* alpha = consumer->operand(other_operand_index);
+    HloInstruction* op1 = nullptr;
+    HloInstruction* op2 = nullptr;
+    if (consumer->opcode() == HloOpcode::kFusion &&
+        consumer->fusion_kind() == HloInstruction::FusionKind::kLoop &&
+        Match(consumer->fused_expression_root(),
+              match::Op()
+                  .WithOpcode(HloOpcode::kMultiply)
+                  .WithOperand(0, match::Op(&op1))
+                  .WithOperand(1, match::Op(&op2)))) {
+      CHECK(op1 != nullptr && op2 != nullptr);
+      // If 'consumer' is a fusion node, it should consist of a broadcast of a
+      // scalar constant fused into a multiply, but nothing more. So one operand
+      // should be a parameter, and the other should be a broadcast.
+      if (op1->opcode() != HloOpcode::kParameter) {
+        std::swap(op1, op2);
+      }
+      if (op1->opcode() != HloOpcode::kParameter ||
+          op2->opcode() != HloOpcode::kBroadcast) {
+        return false;
+      }
+      if (IsIEEEFloatingPointScalarConstant(alpha)) {
+        return true;
+      }
+    } else if (consumer->opcode() == HloOpcode::kMultiply) {
+      // Fuse if 'alpha' is a broadcast of a scalar constant.
+      if (alpha->opcode() == HloOpcode::kBroadcast &&
+          alpha->dimensions().empty() &&
+          IsIEEEFloatingPointScalarConstant(alpha->operand(0))) {
         return true;
       }
     }
   }
 
-  // Only allow to fuse transpose into an output fusion.
+  // Only allow fusing transpose or broadcast into an output fusion that is
+  // implemented as a Gemm call.
   if (consumer->opcode() == HloOpcode::kFusion &&
-      consumer->fusion_kind() == HloInstruction::FusionKind::kOutput) {
-    if (producer->opcode() != HloOpcode::kTranspose) {
-      return false;
-    }
-    // Check that the transpose is the operand of a dot.
+      consumer->fusion_kind() == HloInstruction::FusionKind::kOutput &&
+      ImplementedAsGemm(*consumer)) {
     auto producer_operand_index = consumer->operand_index(producer);
     auto fused_parameter = consumer->fused_parameter(producer_operand_index);
     const std::vector<HloInstruction*>& fused_parameter_users =
         fused_parameter->users();
-    return (fused_parameter_users.size() == 1 &&
-            fused_parameter_users[0]->opcode() == HloOpcode::kDot);
+    if (fused_parameter_users.size() != 1) {
+      return false;
+    }
+    if (producer->opcode() == HloOpcode::kTranspose) {
+      // Check that the transpose is an operand of a dot.
+      return fused_parameter_users[0]->opcode() == HloOpcode::kDot;
+    }
+    if (producer->opcode() == HloOpcode::kBroadcast) {
+      // Check that the broadcast is a broadcast of a scalar constant into a
+      // multiply.
+      return producer->dimensions().empty() &&
+             IsIEEEFloatingPointScalarConstant(producer->operand(0)) &&
+             fused_parameter_users[0]->opcode() == HloOpcode::kMultiply;
+    }
   }
 
-  // Output fusion is not currently supported on GPUs.
+  // Other output fusions are not currently supported on GPUs.
   if (producer->opcode() == HloOpcode::kFusion) {
     return false;
   }
@@ -121,7 +182,9 @@ HloInstruction::FusionKind GpuInstructionFusion::ChooseKind(
   if (IsReductionToVector(*consumer)) {
     return HloInstruction::FusionKind::kInput;
   }
-  if (producer->opcode() == HloOpcode::kDot) {
+  if (producer->opcode() == HloOpcode::kDot ||
+      (producer->opcode() == HloOpcode::kFusion &&
+       producer->fused_expression_root()->opcode() == HloOpcode::kDot)) {
     return HloInstruction::FusionKind::kOutput;
   }
   if (HloOpcode::kFusion == consumer->opcode()) {
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
index bb2990e6dfc9de0a11566bb3a2fb3a1b62498ffa..9fb06b0a244186484b1c17edf13bd28a4305a1a6 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
@@ -27,6 +27,8 @@ class GpuInstructionFusion : public InstructionFusion {
   explicit GpuInstructionFusion(bool may_duplicate)
       : InstructionFusion(GpuInstructionFusion::IsExpensive, may_duplicate) {}
 
+  static bool IsExpensive(const HloInstruction& instruction);
+
   bool ShouldFuse(HloInstruction* consumer, int64 operand_index) override;
 
   HloInstruction::FusionKind ChooseKind(
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 4b231c449f8f101127b4d30bfff20c69d8cef5c1..760e0e90f583d0e43975e23b731a40af75c7dc17 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -108,8 +108,8 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfDotUnfused) {
   HloComputation::Builder builder(TestName());
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(S32, {1, 1}), "0"));
-  auto dot1 = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(S32, {1, 1}), HloOpcode::kDot, param0, param0));
+  auto dot1 = builder.AddInstruction(HloInstruction::CreateCanonicalDot(
+      ShapeUtil::MakeShape(S32, {1, 1}), param0, param0));
   auto reshape2 = builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(S32, {1, 1, 1}), dot1));
 
@@ -125,8 +125,8 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) {
   HloComputation::Builder builder(TestName());
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(S32, {1, 1}), "0"));
-  auto dot1 = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(S32, {1, 1}), HloOpcode::kDot, param0, param0));
+  auto dot1 = builder.AddInstruction(HloInstruction::CreateCanonicalDot(
+      ShapeUtil::MakeShape(S32, {1, 1}), param0, param0));
   auto transpose2 = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(S32, {1, 1}), dot1, {0, 1}));
 
@@ -232,12 +232,13 @@ TEST_F(InstructionFusionTest, DotOutputFusion) {
   auto module = tools::Parse(R"(
   HloModule test_module
   ENTRY OutputFusion {
-    constant = f32[] constant(3)
+    alpha = f32[] constant(3)
+    broadcast = f32[4,4]{1,0} broadcast(alpha), dimensions={}
     p0 = f32[4,3]{1,0} parameter(0)
     p1 = f32[4,3]{1,0} parameter(1)
     transpose = f32[3,4]{1,0} transpose(p1), dimensions={1, 0}
-    dot = f32[4,4]{1,0} dot(p0, transpose)
-    ROOT mul = f32[4,4] multiply(constant, dot)
+    dot = f32[4,4]{1,0} dot(p0, transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    ROOT mul = f32[4,4] multiply(dot, broadcast)
   })")
                     .ValueOrDie();
 
@@ -247,10 +248,93 @@ TEST_F(InstructionFusionTest, DotOutputFusion) {
 
   HloInstruction* root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, op::Fusion());
+  EXPECT_EQ(root->fusion_kind(), HloInstruction::FusionKind::kOutput);
   EXPECT_THAT(
       root->fused_expression_root(),
-      op::Multiply(op::Parameter(),
-                   op::Dot(op::Parameter(), op::Transpose(op::Parameter()))));
+      op::Multiply(op::Dot(op::Parameter(), op::Transpose(op::Parameter())),
+                   op::Broadcast(op::Parameter())));
+}
+
+// Compute sum(1/p0), where p0 has type f32, twice.  Check that the division is
+// duplicated and fused into both reduces.
+TEST_F(InstructionFusionTest, FloatingPointDivIsCheap) {
+  auto module = tools::Parse(R"(
+  HloModule test_module
+  Add {
+    lhs = f32[] parameter(0)
+    rhs = f32[] parameter(1)
+    ROOT add = f32[] add(lhs, rhs)
+  }
+  ENTRY TestComputation {
+    zero = f32[] constant(0)
+    one = f32[] constant(1)
+    p0 = f32[100] parameter(0)
+    recip = f32[100] divide(one, p0)
+    sum1 = f32[] reduce(recip, zero), dimensions={0}, to_apply=Add
+    sum2 = f32[] reduce(recip, zero), dimensions={0}, to_apply=Add
+    ROOT root = (f32[], f32[]) tuple(sum1, sum2)
+  })")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Tuple(op::Fusion(), op::Fusion()));
+}
+
+// Compute sum(100/p0), where p0 has type s32, twice.  Check that the division
+// is *not* duplicated and fused into both reduces, because we say that integer
+// division is not cheap.
+TEST_F(InstructionFusionTest, IntegerDivIsNotCheap) {
+  auto module = tools::Parse(R"(
+  HloModule test_module
+  Add {
+    lhs = s32[] parameter(0)
+    rhs = s32[] parameter(1)
+    ROOT add = s32[] add(lhs, rhs)
+  }
+  ENTRY TestComputation {
+    zero = s32[] constant(0)
+    one_hundred = s32[] constant(100)
+    p0 = s32[100] parameter(0)
+    recip = s32[100] divide(one_hundred, p0)
+    sum1 = s32[] reduce(recip, zero), dimensions={0}, to_apply=Add
+    sum2 = s32[] reduce(recip, zero), dimensions={0}, to_apply=Add
+    ROOT mul = (s32[], s32[]) tuple(sum1, sum2)
+  })")
+                    .ValueOrDie();
+
+  EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
+                   .Run(module.get())
+                   .ValueOrDie());
+}
+
+TEST_F(InstructionFusionTest, DotOutputFusionImpossible) {
+  auto module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY NoOutputFusion {
+    alpha = f32[] constant(3)
+    broadcast = f32[4,4]{1,0} broadcast(alpha), dimensions={}
+    p0 = f32[4,3]{1,0} parameter(0)
+    p1 = f32[3,4]{1,0} parameter(1)
+    dot = f32[4,4]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    d = f32[4,4]{1,0} multiply(dot, dot)
+    ROOT mul = f32[4,4] multiply(d, broadcast)
+  })")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion());
+  EXPECT_EQ(root->fusion_kind(), HloInstruction::FusionKind::kLoop);
+  EXPECT_THAT(root->fused_expression_root(),
+              op::Multiply(op::Multiply(op::Parameter(), op::Parameter()),
+                           op::Broadcast(op::Parameter())));
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 532d436ee82b985a4efe300f90223e1298e85765..22e715099526c20532bb298e84e50457d89f615e 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -59,6 +59,25 @@ bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
          !ShapeUtil::HasZeroElements(lhs_shape) &&
          !ShapeUtil::HasZeroElements(rhs_shape);
 }
+
+bool DotImplementedAsGemm(const HloInstruction& dot) {
+  CHECK_EQ(dot.opcode(), HloOpcode::kDot);
+  const Shape& lhs_shape = dot.operand(0)->shape();
+  const Shape& rhs_shape = dot.operand(1)->shape();
+
+  // If gemm can accept the operand shapes, use it rather than a custom
+  // kernel.
+  if (AreValidGemmShapes(lhs_shape, rhs_shape, dot.shape())) {
+    // The size of the reduction dimension should match. The shape inference
+    // guarantees this invariant, so the check here is for programming
+    // errors.
+    const DotDimensionNumbers& dim_numbers = dot.dot_dimension_numbers();
+    CHECK_EQ(lhs_shape.dimensions(dim_numbers.lhs_contracting_dimensions(0)),
+             rhs_shape.dimensions(dim_numbers.rhs_contracting_dimensions(0)));
+    return true;
+  }
+  return false;
+}
 }  // namespace
 
 bool ImplementedAsGemm(const HloInstruction& hlo) {
@@ -69,24 +88,7 @@ bool ImplementedAsGemm(const HloInstruction& hlo) {
 
   // For certain types of Dot, we can call pre-canned BLAS gemm.
   if (hlo.opcode() == HloOpcode::kDot) {
-    const Shape& lhs_shape = hlo.operand(0)->shape();
-    const Shape& rhs_shape = hlo.operand(1)->shape();
-
-    // If gemm can accept the operand shapes, use it rather than a custom
-    // kernel.
-    if (AreValidGemmShapes(lhs_shape, rhs_shape, hlo.shape())) {
-      // The size of the reduction dimension should match. The shape inference
-      // guarantees this invariant, so the check here is for programming
-      // errors.
-      CHECK_EQ(lhs_shape.dimensions(1), rhs_shape.dimensions(0));
-      return true;
-    }
-  }
-
-  if (hlo.opcode() == HloOpcode::kFusion &&
-      hlo.fusion_kind() == HloInstruction::FusionKind::kTransposeDot &&
-      hlo.fused_expression_root()->opcode() == HloOpcode::kDot) {
-    return true;
+    return DotImplementedAsGemm(hlo);
   }
 
   if (hlo.opcode() == HloOpcode::kFusion &&
@@ -98,7 +100,7 @@ bool ImplementedAsGemm(const HloInstruction& hlo) {
       dot = hlo.fused_expression_root()->operand(1);
     }
     if (dot->opcode() == HloOpcode::kDot) {
-      return ImplementedAsGemm(*dot);
+      return DotImplementedAsGemm(*dot);
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
index 71aada080ae8df70bffce3e1854b5fbd833efd23..bb47a4280541ce2806472aa9365bb0ef38c0c3b3 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -116,6 +117,26 @@ Status IrEmitterNested::HandleParameter(HloInstruction* parameter) {
 Status IrEmitterNested::EmitTargetElementLoop(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator) {
+  // For MOF we give the loop emitter an array for every output it should
+  // generate.
+  if (hlo.IsMultiOutputFusion()) {
+    std::vector<llvm_ir::IrArray> target_arrays;
+    for (int64 i = 0, e = ShapeUtil::TupleElementCount(hlo.shape()); i != e;
+         ++i) {
+      target_arrays.push_back(GetIrArray(hlo, hlo, {i}));
+    }
+    TF_RETURN_IF_ERROR(
+        llvm_ir::LoopEmitter(element_generator, target_arrays, &ir_builder_)
+            .EmitLoop());
+
+    std::vector<llvm::Value*> tuple_operand_ptrs;
+    for (const llvm_ir::IrArray& array : target_arrays) {
+      tuple_operand_ptrs.push_back(array.GetBasePointer());
+    }
+    llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &ir_builder_,
+                       module_);
+    return Status::OK();
+  }
   return llvm_ir::LoopEmitter(element_generator, GetIrArray(hlo, hlo),
                               &ir_builder_)
       .EmitLoop();
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 9f37235d32296828d3ca54f35517c3ee57607cfc..55d4c1d13d3ad41e09d48db70478cf5e6af59808 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -267,7 +267,10 @@ int ComputeMaxUnrollFactor(const HloInstruction* hlo) {
 
   // Find the largest possible power of two to unroll by.
   // TODO(kramerb): Make this smarter.
-  int64 num_elements = ShapeUtil::ElementsIn(hlo->shape());
+  const Shape& element_shape = hlo->IsMultiOutputFusion()
+                                   ? ShapeUtil::GetSubshape(hlo->shape(), {0})
+                                   : hlo->shape();
+  int64 num_elements = ShapeUtil::ElementsIn(element_shape);
   for (int i = max_unroll_factor; i > 1; i /= 2) {
     if (num_elements % i == 0) {
       return i;
@@ -565,12 +568,8 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     return Status::OK();
   }
 
-  int unroll_factor = 1;
-  // TODO(kramerb): Unrolling multi-output loop fusions too.
-  if (!fusion->IsMultiOutputFusion()) {
-    CHECK(fusion->fusion_kind() == HloInstruction::FusionKind::kLoop);
-    unroll_factor = ComputeMaxUnrollFactor(fusion);
-  }
+  CHECK(fusion->fusion_kind() == HloInstruction::FusionKind::kLoop);
+  int unroll_factor = ComputeMaxUnrollFactor(fusion);
 
   thunk_sequence_->emplace_back(BuildKernelThunk(fusion, unroll_factor));
   return IrEmitter::HandleFusion(fusion);
@@ -1928,6 +1927,52 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select) {
   return IrEmitter::HandleSelect(select);
 }
 
+Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) {
+  if (hlo_module_config_.replica_count() != 1) {
+    // TODO(b/33011107): Support nontrivial cross replica sum on GPU.
+    return Unimplemented(
+        "CrossReplicaSum with >1 replica is not implemented on GPU.");
+  }
+
+  // CRS with one operand and one replica is simply the identity function.
+  // Buffer assignment expects a copy, so that's what we do.
+  //
+  // TODO(b/80100934): We would like to eliminate one-replica CRS nodes entirely
+  // in algebraic-simplifier, but currently on some platforms
+  // HloModuleConfig::num_replicas changes between when the module is compiled
+  // and when it's run.
+  if (crs->operand_count() == 1) {
+    CHECK(ShapeUtil::IsArray(crs->operand(0)->shape()))
+        << "Operands to cross-replica-sum must be arrays: " << crs->ToString();
+    thunk_sequence_->push_back(MakeUnique<DeviceToDeviceCopyThunk>(
+        /*source_address=*/GetAllocationSlice(*crs->operand(0)),
+        /*destination_buffer=*/GetAllocationSlice(*crs),
+        /*mem_size=*/ShapeUtil::ByteSizeOf(crs->shape()), crs));
+    return Status::OK();
+  }
+
+  // One-replica CRS with multiple operands produces a tuple of the inputs.
+  // Again, buffer assignment expects us to copy each.
+  std::vector<std::unique_ptr<Thunk>> thunks;
+  std::vector<BufferAllocation::Slice> tuple_element_buffers;
+  for (int64 i = 0; i < crs->operand_count(); ++i) {
+    tuple_element_buffers.push_back(ir_emitter_context_->buffer_assignment()
+                                        .GetUniqueSlice(crs, {i})
+                                        .ValueOrDie());
+    thunks.push_back(MakeUnique<DeviceToDeviceCopyThunk>(
+        /*source_address=*/GetAllocationSlice(*crs->operand(i)),
+        /*destination_buffer=*/tuple_element_buffers.back(),
+        /*mem_size=*/ShapeUtil::ByteSizeOf(crs->operand(i)->shape()), crs));
+  }
+
+  // Output a tuple of the buffers above.
+  thunks.push_back(MakeUnique<TupleThunk>(tuple_element_buffers,
+                                          GetAllocationSlice(*crs), crs));
+  thunk_sequence_->push_back(
+      MakeUnique<SequentialThunk>(std::move(thunks), crs));
+  return Status::OK();
+}
+
 Status IrEmitterUnnested::HandleInfeed(HloInstruction* infeed) {
   thunk_sequence_->emplace_back(BuildInfeedThunk(infeed));
   return Status::OK();
@@ -2194,6 +2239,21 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildInfeedThunk(
       /*destination_buffer=*/GetAllocationSlice(*inst), inst);
 }
 
+namespace {
+double GetScalarConstantAsDouble(const Literal& literal) {
+  switch (literal.shape().element_type()) {
+    case F16:
+      return static_cast<double>(literal.Get<Eigen::half>({}));
+    case F32:
+      return literal.Get<float>({});
+    case F64:
+      return literal.Get<double>({});
+    default:
+      LOG(FATAL) << "Unsupported type.";
+  }
+}
+}  // namespace
+
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
     const HloInstruction* inst) {
   if (inst->opcode() == HloOpcode::kDot) {
@@ -2206,65 +2266,48 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
         lhs->shape(),               // The shape of LHS.
         rhs->shape(),               // The shape of RHS.
         inst->shape(),              // The shape of the output.
-        false,                      // Do not transpose LHS.
-        false,                      // Do not transpose RHS.
         1.0,                        // alpha.
         inst);
   }
 
   if (inst->opcode() == HloOpcode::kFusion) {
-    if (inst->fusion_kind() == HloInstruction::FusionKind::kOutput) {
-      const HloInstruction* mul = inst->fused_expression_root();
-      const HloInstruction* dot = mul->operand(0);
-      const HloInstruction* alpha = mul->operand(1);
-      if (dot->opcode() != HloOpcode::kDot) {
-        std::swap(dot, alpha);
-      }
-      DCHECK(dot->opcode() == HloOpcode::kDot);
-      const HloInstruction* lhs_parameter = StripTranspose(*dot->operand(0));
-      const HloInstruction* rhs_parameter = StripTranspose(*dot->operand(1));
-      DCHECK(lhs_parameter->opcode() == HloOpcode::kParameter &&
-             rhs_parameter->opcode() == HloOpcode::kParameter);
-      const HloInstruction* lhs =
-          inst->operand(lhs_parameter->parameter_number());
-      const HloInstruction* rhs =
-          inst->operand(rhs_parameter->parameter_number());
-
-      return MakeUnique<GemmThunk>(
-          GetAllocationSlice(*lhs),             // The buffer assigned to LHS.
-          GetAllocationSlice(*rhs),             // The buffer assigned to RHS.
-          GetAllocationSlice(*mul),             // The output buffer.
-          lhs->shape(),                         // The shape of LHS.
-          rhs->shape(),                         // The shape of RHS.
-          inst->shape(),                        // The shape of the output.
-          dot->operand(0)->IsRank2Transpose(),  // Transpose LHS.
-          dot->operand(1)->IsRank2Transpose(),  // Transpose RHS.
-          alpha->literal().Get<double>({0}),    // alpha.
-          inst);
-    } else {
-      const HloInstruction* dot = inst->fused_expression_root();
-      DCHECK(dot->opcode() == HloOpcode::kDot);
-      const HloInstruction* lhs_parameter = StripTranspose(*dot->operand(0));
-      const HloInstruction* rhs_parameter = StripTranspose(*dot->operand(1));
-      DCHECK(lhs_parameter->opcode() == HloOpcode::kParameter &&
-             rhs_parameter->opcode() == HloOpcode::kParameter);
-      const HloInstruction* lhs =
-          inst->operand(lhs_parameter->parameter_number());
-      const HloInstruction* rhs =
-          inst->operand(rhs_parameter->parameter_number());
-
-      return MakeUnique<GemmThunk>(
-          GetAllocationSlice(*lhs),             // The buffer assigned to LHS.
-          GetAllocationSlice(*rhs),             // The buffer assigned to RHS.
-          GetAllocationSlice(*inst),            // The output buffer.
-          lhs->shape(),                         // The shape of LHS.
-          rhs->shape(),                         // The shape of RHS.
-          inst->shape(),                        // The shape of the output.
-          dot->operand(0)->IsRank2Transpose(),  // Transpose LHS.
-          dot->operand(1)->IsRank2Transpose(),  // Transpose RHS.
-          1.0,                                  // Alpha.
-          inst);
+    CHECK_EQ(inst->fusion_kind(), HloInstruction::FusionKind::kOutput);
+    const HloInstruction* mul = inst->fused_expression_root();
+    const HloInstruction* dot = mul->operand(0);
+    const HloInstruction* alpha = mul->operand(1);
+    if (dot->opcode() != HloOpcode::kDot) {
+      std::swap(dot, alpha);
     }
+    if (alpha->opcode() == HloOpcode::kBroadcast) {
+      alpha = alpha->operand(0);
+    }
+    alpha = inst->operand(alpha->parameter_number());
+    // TODO(b/74185543): Remove the following if block once we support fusion
+    // with a non-constant as well. Then we will just always use the constant
+    // on the device.
+    if (alpha->opcode() == HloOpcode::kCopy) {
+      alpha = alpha->operand(0);
+    }
+
+    DCHECK(dot->opcode() == HloOpcode::kDot);
+    const HloInstruction* lhs_parameter = StripTranspose(*dot->operand(0));
+    const HloInstruction* rhs_parameter = StripTranspose(*dot->operand(1));
+    DCHECK(lhs_parameter->opcode() == HloOpcode::kParameter &&
+           rhs_parameter->opcode() == HloOpcode::kParameter);
+    const HloInstruction* lhs =
+        inst->operand(lhs_parameter->parameter_number());
+    const HloInstruction* rhs =
+        inst->operand(rhs_parameter->parameter_number());
+
+    return MakeUnique<GemmThunk>(
+        GetAllocationSlice(*lhs),   // The buffer assigned to LHS.
+        GetAllocationSlice(*rhs),   // The buffer assigned to RHS.
+        GetAllocationSlice(*inst),  // The output buffer.
+        lhs->shape(),               // The shape of LHS.
+        rhs->shape(),               // The shape of RHS.
+        inst->shape(),              // The shape of the output.
+        GetScalarConstantAsDouble(alpha->literal()),  // alpha.
+        inst);
   }
 
   LOG(FATAL) << "Cannot build a GemmThunk for " << inst->ToString();
@@ -2540,16 +2583,14 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
         .EmitLoop(IrName(&hlo));
   }
 
-  CHECK_EQ(unroll_factor, 1)
-      << "multi-output fusion does not support unrolling";
-
   // For multiple outputs fusion, we need to emit each operand and the root.
   std::vector<llvm_ir::IrArray> output_arrays;
   for (int64 i = 0; i < ShapeUtil::TupleElementCount(hlo.shape()); ++i) {
     output_arrays.push_back(GetIrArray(hlo, hlo, {i}));
   }
   TF_RETURN_IF_ERROR(ParallelLoopEmitter(element_generator, output_arrays,
-                                         launch_dimensions, &ir_builder_)
+                                         launch_dimensions, &ir_builder_,
+                                         unroll_factor)
                          .EmitLoop(IrName(&hlo)));
 
   std::vector<llvm::Value*> tuple_operand_ptrs;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index b41ab2162ab81f66e123a7055ca3ffc815c3ef88..e42c5e86862576bad1c8610652d1c50d2364cd83 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -76,6 +76,7 @@ class IrEmitterUnnested : public IrEmitter {
   Status HandleInfeed(HloInstruction* xla_infeed) override;
   Status HandleRng(HloInstruction* random) override;
   Status HandleSelect(HloInstruction* select) override;
+  Status HandleCrossReplicaSum(HloInstruction* crs) override;
 
   Status EmitTargetElementLoop(
       const HloInstruction& hlo,
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index d376ef7a245eb9ed86939f44c611b6dde5606b23..f56c1ce69f11ed79c8be76834269f29de93a9645 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -35,26 +35,38 @@ KernelThunk::KernelThunk(
       kernel_name_(kernel_name),
       unroll_factor_(unroll_factor) {}
 
-tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
+Status KernelThunk::Initialize(const GpuExecutable& executable,
+                               se::StreamExecutor* executor) {
   tensorflow::mutex_lock lock(mutex_);
-  if (loader_spec_) {
-    // Already initialized by another thread.
-    return tensorflow::Status::OK();
-  }
+  if (!loader_spec_) {
+    loader_spec_.reset(new se::MultiKernelLoaderSpec(args_.size()));
+    tensorflow::StringPiece ptx = executable.ptx();
+    // Convert tensorflow::StringPiece to se::port::StringPiece because
+    // StreamExecutor uses the latter.
+    loader_spec_->AddCudaPtxInMemory(
+        se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_);
 
-  loader_spec_.reset(new se::MultiKernelLoaderSpec(args_.size()));
-  tensorflow::StringPiece ptx = executable.ptx();
-  // Convert tensorflow::StringPiece to se::port::StringPiece because
-  // StreamExecutor uses the latter.
-  loader_spec_->AddCudaPtxInMemory(
-      se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_);
+    if (!executable.cubin().empty()) {
+      loader_spec_->AddCudaCubinInMemory(
+          reinterpret_cast<const char*>(executable.cubin().data()),
+          kernel_name_);
+    }
+  }
 
-  if (!executable.cubin().empty()) {
-    loader_spec_->AddCudaCubinInMemory(
-        reinterpret_cast<const char*>(executable.cubin().data()), kernel_name_);
+  // Load the kernel into the device if necessary.
+  //
+  // We could alternatively do this within ExecuteOnStream, but doing it here
+  // lets the time spent loading the kernel not count towards our execution
+  // profiles.
+  auto it = kernel_cache_.find(executor);
+  if (kernel_cache_.end() == it) {
+    it = kernel_cache_.emplace(executor, se::KernelBase(executor)).first;
+    if (!executor->GetKernel(*loader_spec_, &it->second)) {
+      return InternalError("Unable to load kernel %s", kernel_name_.c_str());
+    }
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 void KernelThunk::SetLaunchDimensions(const LaunchDimensions& launch_dims) {
@@ -62,21 +74,18 @@ void KernelThunk::SetLaunchDimensions(const LaunchDimensions& launch_dims) {
   launch_dimensions_ = launch_dims;
 }
 
-tensorflow::Status KernelThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+Status KernelThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                                    se::Stream* stream) {
   // Load the kernel.
   se::StreamExecutor* executor = stream->parent();
   LaunchDimensions launch_dimensions;
   const se::KernelBase* kernel = nullptr;
+
   {
     tensorflow::mutex_lock lock(mutex_);
     auto it = kernel_cache_.find(executor);
-    if (kernel_cache_.end() == it) {
-      it = kernel_cache_.emplace(executor, se::KernelBase(executor)).first;
-      if (!executor->GetKernel(*loader_spec_, &it->second)) {
-        return InternalError("Unable to load kernel %s", kernel_name_.c_str());
-      }
-    }
+    CHECK(it != kernel_cache_.end())
+        << "Initialize() not called for StreamExecutor " << executor;
     launch_dimensions = launch_dimensions_;
     kernel = &it->second;
   }
@@ -97,7 +106,7 @@ tensorflow::Status KernelThunk::ExecuteOnStream(
           *kernel_args)) {
     return InternalError("Unable to launch kernel %s", kernel_name_.c_str());
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
index b556befe66b6bebba1a958f553f0a9b2c4eebbe4..7def27e189b66747569344a3dbe5c0c446f903be 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
@@ -57,11 +57,12 @@ class KernelThunk : public Thunk {
   int unroll_factor() const { return unroll_factor_; }
   void SetLaunchDimensions(const LaunchDimensions& launch_dims);
 
-  tensorflow::Status Initialize(const GpuExecutable& executable) override;
+  Status Initialize(const GpuExecutable& executable,
+                    se::StreamExecutor* executor) override;
 
   // Executes the kernel for the thunk on "stream", which must be non-null.
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream) override;
 
  private:
   // Buffers passed to the kernel as arguments.
@@ -83,7 +84,8 @@ class KernelThunk : public Thunk {
   mutable tensorflow::mutex mutex_;
   std::unique_ptr<se::MultiKernelLoaderSpec> loader_spec_ GUARDED_BY(mutex_);
 
-  // Loaded kernels for each `StreamExecutor`
+  // Loaded kernels for each `StreamExecutor`.  Requires pointer stability of
+  // values.
   std::unordered_map<se::StreamExecutor*, se::KernelBase> kernel_cache_
       GUARDED_BY(mutex_);
 };
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
index 86c4ac18b0501c38aaaae5a007bddcf261ca338f..7de8f9e1ee922bdbf65fd1299702482e1843f17e 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
@@ -47,7 +47,6 @@ cc_library(
         "@llvm//:scalar",
         "@llvm//:support",
         "@llvm//:target",
-        "@llvm//:transform_utils",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index d70cb07c57d48c0faed2cdc5ea9fc5ce5fb32be0..a4e4e85bf3d2c197cfc691b7fca0920aa6571729 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -77,8 +77,7 @@ static string GetLibdeviceFilename(const string& libdevice_dir_path,
   // Since CUDA 9.0, all GPU versions are included in a single file
   const char* unified_libdevice_filename = "libdevice.10.bc";
   std::vector<string> unified_libdevice_files;
-  const tensorflow::Status status =
-    tensorflow::Env::Default()->GetMatchingPaths(
+  const Status status = tensorflow::Env::Default()->GetMatchingPaths(
       tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename),
       &unified_libdevice_files);
   if (status.ok() && unified_libdevice_files.size() == 1) {
@@ -273,7 +272,7 @@ string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) {
     codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
         llvm::Triple(module->getTargetTriple())));
 
-    target_machine->addPassesToEmitFile(codegen_passes, pstream,
+    target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr,
                                         llvm::TargetMachine::CGFT_AssemblyFile);
     codegen_passes.run(*module);
   }
@@ -311,11 +310,11 @@ bool CouldNeedLibdevice(const llvm::Module& module) {
 }
 
 // Links libdevice into the given module if the module needs libdevice.
-tensorflow::Status LinkLibdeviceIfNecessary(
-    llvm::Module* module, std::pair<int, int> compute_capability,
-    const string& libdevice_dir_path) {
+Status LinkLibdeviceIfNecessary(llvm::Module* module,
+                                std::pair<int, int> compute_capability,
+                                const string& libdevice_dir_path) {
   if (!CouldNeedLibdevice(*module)) {
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
 
   llvm::Linker linker(*module);
@@ -336,7 +335,7 @@ tensorflow::Status LinkLibdeviceIfNecessary(
     return tensorflow::errors::Internal(tensorflow::strings::StrCat(
         "Error linking libdevice from ", libdevice_path));
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 StatusOr<string> CompileModuleToPtx(llvm::Module* module,
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
index 7bda4e2fcd469bd430e5ef1846251c8504225383..c8f0d4185c63c5bafca6f30acab31cbe8e987277 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
@@ -370,26 +370,38 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
   return true;
 }
 
-StatusOr<bool> PadInsertion::Run(HloModule* module) {
+StatusOr<bool> PadInsertion::RunOnComputation(HloComputation* computation) {
   bool changed = false;
-  for (HloInstruction* instruction :
-       module->entry_computation()->MakeInstructionPostOrder()) {
-    if (IsCustomCallToDnnConvolution(*instruction)) {
-      const auto& target = instruction->custom_call_target();
-      if (target == kCudnnConvForwardCallTarget) {
-        changed |= CanonicalizeForwardConvolution(instruction);
-      } else if (target == kCudnnConvBackwardFilterCallTarget) {
-        changed |= CanonicalizeBackwardFilterConvolution(instruction);
-      } else if (target == kCudnnConvBackwardInputCallTarget) {
-        changed |= CanonicalizeBackwardInputConvolution(instruction);
-      } else {
-        LOG(FATAL) << "Unknown custom call target for cudnn conv: "
-                   << instruction->ToString();
-      }
+  std::vector<HloInstruction*> convs;
+  for (auto* instr : computation->instructions()) {
+    if (IsCustomCallToDnnConvolution(*instr)) {
+      convs.push_back(instr);
+    }
+  }
+  for (HloInstruction* instruction : convs) {
+    const auto& target = instruction->custom_call_target();
+    if (target == kCudnnConvForwardCallTarget) {
+      changed |= CanonicalizeForwardConvolution(instruction);
+    } else if (target == kCudnnConvBackwardFilterCallTarget) {
+      changed |= CanonicalizeBackwardFilterConvolution(instruction);
+    } else if (target == kCudnnConvBackwardInputCallTarget) {
+      changed |= CanonicalizeBackwardInputConvolution(instruction);
+    } else {
+      LOG(FATAL) << "Unknown custom call target for cudnn conv: "
+                 << instruction->ToString();
     }
   }
   return changed;
 }
 
+StatusOr<bool> PadInsertion::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
+    changed |= result;
+  }
+  return changed;
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.h b/tensorflow/compiler/xla/service/gpu/pad_insertion.h
index 5e1c68701daa02eba64f3e34933ce373a496c1b8..67e51509e4c717951c83c7e41943af1de762dee0 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.h
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.h
@@ -31,6 +31,7 @@ class PadInsertion : public HloPassInterface {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
+  StatusOr<bool> RunOnComputation(HloComputation* computation);
   // Returns if any changes are made to the parent computation.
   bool CanonicalizeForwardConvolution(HloInstruction* conv);
   bool CanonicalizeBackwardFilterConvolution(HloInstruction* backward_conv);
diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
index c8510808f10a731af90154447bd3e1e037db6348..88cb10883e97ae663dc492ad088e6daf9133d7f5 100644
--- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
@@ -20,24 +20,24 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-SequentialThunk::SequentialThunk(std::vector<std::unique_ptr<Thunk>>&& thunks,
+SequentialThunk::SequentialThunk(std::vector<std::unique_ptr<Thunk>> thunks,
                                  const HloInstruction* hlo)
     : Thunk(Kind::kSequential, hlo), thunks_(std::move(thunks)) {}
 
-tensorflow::Status SequentialThunk::Initialize(
-    const GpuExecutable& executable) {
+Status SequentialThunk::Initialize(const GpuExecutable& executable,
+                                   se::StreamExecutor* executor) {
   for (auto& thunk : thunks_) {
-    TF_RETURN_IF_ERROR(thunk->Initialize(executable));
+    TF_RETURN_IF_ERROR(thunk->Initialize(executable, executor));
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status SequentialThunk::ExecuteOnStream(
+Status SequentialThunk::ExecuteOnStream(
     const BufferAllocations& buffer_allocations, se::Stream* stream) {
   for (const auto& thunk : thunks_) {
     TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
index df17b8d67b80321c7088243eae46e7a723b4ede9..135f79e413dfaa27f2f2264e0daa3beb3c305e0f 100644
--- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
@@ -31,16 +31,17 @@ namespace gpu {
 // require multiple kernel launches or library calls.
 class SequentialThunk : public Thunk {
  public:
-  SequentialThunk(std::vector<std::unique_ptr<Thunk>>&& thunks,
+  SequentialThunk(std::vector<std::unique_ptr<Thunk>> thunks,
                   const HloInstruction* hlo);
   SequentialThunk(const SequentialThunk&) = delete;
   SequentialThunk& operator=(const SequentialThunk&) = delete;
 
   const std::vector<std::unique_ptr<Thunk>>& thunks() const { return thunks_; }
 
-  tensorflow::Status Initialize(const GpuExecutable& executable) override;
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
+  Status Initialize(const GpuExecutable& executable,
+                    se::StreamExecutor* executor) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream) override;
 
  private:
   // The list of sub-thunks.
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
index 8c98956f1a9b2a0bb1d304a27eb8c8cfcf610784..696fa7e0194032b5c78bf11383c3280a62de07fa 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
@@ -28,6 +28,15 @@ namespace gpu {
 
 class StreamAssignmentTest : public HloTestBase {
  protected:
+  std::unique_ptr<HloModule> CreateNewModule() {
+    HloModuleConfig config;
+    auto debug_options = GetDebugOptionsForTest();
+    debug_options.set_xla_gpu_disable_multi_streaming(false);
+    config.set_debug_options(debug_options);
+    return MakeUnique<HloModule>("test_module", VersionedComputationHandle(),
+                                 config);
+  }
+
   // Pre-canned shapes.
   Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2});
 };
@@ -41,9 +50,9 @@ TEST_F(StreamAssignmentTest, SequentialMatMul) {
   HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter(
       /*parameter_number=*/2, f32_2x2_, /*name=*/"z"));
   HloInstruction* dot1 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, x, y));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, x, y));
   HloInstruction* dot2 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, dot1, z));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, dot1, z));
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build(dot2));
@@ -60,9 +69,9 @@ TEST_F(StreamAssignmentTest, ConcurrentMatMul) {
   HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
       /*parameter_number=*/1, f32_2x2_, /*name=*/"y"));
   HloInstruction* dot1 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, x, y));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, x, y));
   HloInstruction* dot2 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, y, x));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, y, x));
   HloInstruction* add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, dot1, dot2));
 
@@ -91,24 +100,24 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) {
     params.push_back(builder.AddInstruction(HloInstruction::CreateParameter(
         i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i))));
   }
-  HloInstruction* d00 = builder.AddInstruction(HloInstruction::CreateBinary(
-      f32_2x2_, HloOpcode::kDot, params[2], params[3]));
+  HloInstruction* d00 = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(f32_2x2_, params[2], params[3]));
   HloInstruction* d10 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, params[1], d00));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, params[1], d00));
   HloInstruction* d11 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d00, params[4]));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d00, params[4]));
   HloInstruction* d20 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, params[0], d10));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, params[0], d10));
   HloInstruction* d21 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d10, d11));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d10, d11));
   HloInstruction* d22 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d11, params[5]));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d11, params[5]));
   HloInstruction* d30 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d20, d21));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d20, d21));
   HloInstruction* d31 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d21, d22));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d21, d22));
   HloInstruction* d40 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d30, d31));
+      HloInstruction::CreateCanonicalDot(f32_2x2_, d30, d31));
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build(d40));
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index a0c785ed913109e987d058124c8ef49019c98500..931c0bffab850362dbd2df975657dd47d9cbd3ae 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -70,11 +70,14 @@ class Thunk {
   Kind kind() const { return kind_; }
   const HloInstruction* hlo_instruction() const { return hlo_instruction_; }
 
-  // Prepares for executing the thunk. This method is called only once over
-  // Thunk's lifetime. For example, KernelThunk::Initialize loads the PTX of a
-  // kernel, which is the same in every execution.
-  virtual tensorflow::Status Initialize(const GpuExecutable& executable) {
-    return tensorflow::Status::OK();
+  // Prepares the thunk for execution on the given StreamExecutor.
+  //
+  // This may be called multiple times.  Its main purpose is to give us a chance
+  // to do initialization outside of ExecuteOnStream() so that the
+  // time spent initializing doesn't count towards our execution profile.
+  virtual Status Initialize(const GpuExecutable& /*executable*/,
+                            se::StreamExecutor* /*executor*/) {
+    return Status::OK();
   }
 
   // Users of Thunk should call ShouldHaltAllActivityBeforeRunning(stream)
@@ -89,21 +92,13 @@ class Thunk {
     return false;
   }
 
-  // Indicates whether thunks scheduled after this one should wait for this one
-  // to complete before running. For example, a convolution thunk creates a
-  // scratch allocator, then kicks off a convolution in cudnn via the stream
-  // executor. When the stream executor call returns, the scratch allocator goes
-  // out of scope, and the scratch memory is deallocated. In this case, the
-  // convolution thunk needs to return true so that future thunks wait for the
-  // convolution thunk to avoid reusing the deallocated memory until the
-  // convolution thunk is done with it.
-  virtual bool ShouldBlockFutureThunks() { return false; }
-
   // Execute the kernel for the thunk on the given stream. This method must be
   // called after Initialize and can be called multiple times over Thunk's
   // lifetime. Stream argument must be non-null.
-  virtual tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations, se::Stream* stream) = 0;
+  //
+  // Precondition: Initialize(stream->parent()) has been called.
+  virtual Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                                 se::Stream* stream) = 0;
 
  private:
   Kind kind_;
diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
index ecb54857ccc40ead21e5a18d79a37b545680021d..97cb04c38fbf18e516857f5269c984696ca204c3 100644
--- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
@@ -20,8 +20,8 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-tensorflow::Status TupleThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+Status TupleThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                                   se::Stream* stream) {
   std::vector<void*> tuple_element_buffer_addresses;
   for (BufferAllocation::Slice tuple_element_buffer : tuple_element_buffers_) {
     tuple_element_buffer_addresses.push_back(
@@ -40,7 +40,7 @@ tensorflow::Status TupleThunk::ExecuteOnStream(
         tuple_element_buffer_addresses.data(), dest_buffer_address.opaque(),
         sizeof(void*) * tuple_element_buffer_addresses.size());
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
index 8b459c29a136a6e7853e68a1bead7d12c0d08ad0..951f809b51937c97a6e7de0345ec58a8b66a4242 100644
--- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
@@ -45,8 +45,8 @@ class TupleThunk : public Thunk {
   TupleThunk(const TupleThunk&) = delete;
   TupleThunk& operator=(const TupleThunk&) = delete;
 
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream) override;
 
  private:
   const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
index a9f3d619a3ffd6d849572355e2902375e43508fa..30b9640c4c75dae61e9a90da5fb10e9d4a90cd26 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
@@ -34,9 +34,11 @@ WhileThunk::WhileThunk(
       body_thunk_sequence_(
           MakeUnique<SequentialThunk>(std::move(*body_thunk_sequence), hlo)) {}
 
-Status WhileThunk::Initialize(const GpuExecutable& executable) {
-  TF_RETURN_IF_ERROR(condition_thunk_sequence_->Initialize(executable));
-  TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable));
+Status WhileThunk::Initialize(const GpuExecutable& executable,
+                              se::StreamExecutor* executor) {
+  TF_RETURN_IF_ERROR(
+      condition_thunk_sequence_->Initialize(executable, executor));
+  TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h
index e589ca78a7ea00e7592d6e09ead9c270f902702f..22176685a92df9c95b10f755b209309843c0fa3a 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h
@@ -45,7 +45,8 @@ class WhileThunk : public Thunk {
   WhileThunk(const WhileThunk&) = delete;
   WhileThunk& operator=(const WhileThunk&) = delete;
 
-  Status Initialize(const GpuExecutable& executable) override;
+  Status Initialize(const GpuExecutable& executable,
+                    se::StreamExecutor* executor) override;
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
                          se::Stream* stream) override;
 
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
index e6caec8625f0d622dbb92bcc20802d254fe23f94..ad55728c45599c801aad7e12fac95ae9f0c4fc3b 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
@@ -144,7 +144,7 @@ class ExprTree {
       TF_RETURN_IF_ERROR(pair.second->Match(instruction->operand(pair.first),
                                             tagged_instructions));
     }
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
 
  private:
@@ -169,7 +169,7 @@ class MatcherBase {
 
   // Attempts to match each ExprTree in 'expr_trees_'.
   // Returns OK on the first successful match, error status otherwise.
-  virtual tensorflow::Status Run() {
+  virtual Status Run() {
     Status status;
     for (const ExprTree& expr_tree : expr_trees_) {
       status = MatchExprTree(expr_tree);
@@ -201,7 +201,7 @@ class MatcherBase {
     } else if (type == S64) {
       *const_value = literal.GetFirstElement<int64>();
     }
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
 
   StatusOr<const HloInstruction*> GetTaggedInstruction(
@@ -315,7 +315,7 @@ class WhileConditionComputationMatcher : public MatcherBase {
                              gte_fusion_param0->name().c_str());
     }
 
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
 
   const HloComputation* computation_;
@@ -379,7 +379,7 @@ class WhileInitOperandMatcher : public MatcherBase {
         GetTaggedInstruction("loop_start", tagged_instructions));
     TF_RETURN_IF_ERROR(ParseConstInteger(const_hlo, &loop_start_));
 
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
 
   const HloInstruction* while_hlo_;
@@ -477,7 +477,7 @@ class WhileBodyComputationMatcher : public MatcherBase {
         }
       }
     }
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
 
   const HloComputation* computation_;
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 3dd4c4a0794e5c41b877078c4e69c6c9584ce6c0..06a5e0351b63270b61b998ca2211f480f256f759 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
@@ -32,7 +31,7 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
     const SequentialHloOrdering::HloModuleSequence& module_sequence,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_fn, const Options& options) {
+    const BufferValue::SizeFunction& size_fn, const Options& options) {
   HeapSimulator heap(std::move(algorithm), size_fn, options, &module_sequence);
   const HloComputation* entry_computation = module.entry_computation();
   const std::vector<const HloInstruction*>& instruction_sequence =
@@ -47,7 +46,7 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     std::unique_ptr<HeapAlgorithm> algorithm, const HloComputation& computation,
     const std::vector<const HloInstruction*>& instruction_sequence,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_fn, const Options& options) {
+    const BufferValue::SizeFunction& size_fn, const Options& options) {
   HeapSimulator heap(std::move(algorithm), size_fn, options,
                      /*module_sequence=*/nullptr);
   TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence,
@@ -73,11 +72,11 @@ Status HeapSimulator::RunComputation(
   // 'used_buffers' is the reverse map - it tracks which buffers were used by an
   // instruction, so that we can remove the instructions from a buffer's live
   // set after they are visited.
-  FlatMap<const LogicalBuffer*, FlatSet<const HloInstruction*>> live_buffers;
-  FlatMap<const HloInstruction*, FlatSet<const LogicalBuffer*>> used_buffers;
+  FlatMap<const BufferValue*, FlatSet<const HloInstruction*>> live_buffers;
+  FlatMap<const HloInstruction*, FlatSet<const BufferValue*>> used_buffers;
   auto add_user_to_buffer = [this, &live_buffers, &used_buffers](
                                 const HloInstruction* user,
-                                const LogicalBuffer* buffer) {
+                                const BufferValue* buffer) {
     if (!IgnoreBuffer(buffer)) {
       VLOG(4) << "  Adding user " << user->name() << " to buffer "
               << buffer->ToString();
@@ -96,7 +95,7 @@ Status HeapSimulator::RunComputation(
     const PointsToSet::BufferSet& buffer_set = points_to.CreateFlattenedSet();
     for (const HloInstruction* user : instruction->users()) {
       if (user->opcode() != HloOpcode::kGetTupleElement) {
-        for (const LogicalBuffer* buffer : buffer_set) {
+        for (const BufferValue* buffer : buffer_set) {
           add_user_to_buffer(user, buffer);
         }
       } else {
@@ -104,12 +103,12 @@ Status HeapSimulator::RunComputation(
         // alive. It only needs the buffers that relate to the element its
         // extracting, and the tuple it's extracting from, but not the buffers
         // for the other elements.
-        for (const LogicalBuffer* buffer : points_to.element({})) {
+        for (const BufferValue* buffer : points_to.element({})) {
           add_user_to_buffer(user, buffer);
         }
         const PointsToSet& gte_points_to =
             points_to_analysis.GetPointsToSet(user);
-        for (const LogicalBuffer* buffer : gte_points_to.CreateFlattenedSet()) {
+        for (const BufferValue* buffer : gte_points_to.CreateFlattenedSet()) {
           add_user_to_buffer(user, buffer);
         }
       }
@@ -117,24 +116,25 @@ Status HeapSimulator::RunComputation(
   }
 
   const HloInstruction* root = computation.root_instruction();
-  auto output_source_buffers =
-      points_to_analysis.GetPointsToSet(root).CreateFlattenedSet();
+  BufferValueCompactPointerSet output_source_buffers =
+      ToBufferValueCompactPointerSet(
+          points_to_analysis.GetPointsToSet(root).CreateFlattenedSet());
 
-  std::vector<const LogicalBuffer*> dead_buffers_to_free;
-  std::vector<const LogicalBuffer*> operand_buffers_to_free;
+  std::vector<const BufferValue*> dead_buffers_to_free;
+  std::vector<const BufferValue*> operand_buffers_to_free;
   for (const HloInstruction* instruction : instruction_sequence) {
     const TuplePointsToAnalysis::BufferDefinitionVector&
         buffers_defined_by_instruction =
             points_to_analysis.GetBuffersDefinedByInstruction(instruction);
 
     VLOG(3) << "Instruction: " << instruction->ToString();
-    for (const LogicalBuffer* buffer : buffers_defined_by_instruction) {
+    for (const BufferValue* buffer : buffers_defined_by_instruction) {
       VLOG(4) << "  Defines: " << buffer->ToString()
               << (IgnoreBuffer(buffer) ? " (Ignored)" : "");
     }
 
     dead_buffers_to_free.clear();
-    for (const LogicalBuffer* buffer : buffers_defined_by_instruction) {
+    for (const BufferValue* buffer : buffers_defined_by_instruction) {
       if (IgnoreBuffer(buffer)) {
         continue;
       }
@@ -161,7 +161,7 @@ Status HeapSimulator::RunComputation(
     // have no instructions left to visit are moved from live_buffers to
     // operand_buffers_to_free.
     operand_buffers_to_free.clear();
-    for (const LogicalBuffer* operand_buffer : used_buffers[instruction]) {
+    for (const BufferValue* operand_buffer : used_buffers[instruction]) {
       if (IgnoreBuffer(operand_buffer)) {
         continue;
       }
@@ -177,7 +177,7 @@ Status HeapSimulator::RunComputation(
     }
     // Sort to get a deterministic iteration order.
     std::sort(operand_buffers_to_free.begin(), operand_buffers_to_free.end(),
-              [](const LogicalBuffer* x, const LogicalBuffer* y) {
+              [](const BufferValue* x, const BufferValue* y) {
                 return x->id() < y->id();
               });
 
@@ -188,7 +188,7 @@ Status HeapSimulator::RunComputation(
     //
     // INVARIANT: Either Alloc or ShareBuffer will be called for each buffer
     // that we should assign.
-    for (const LogicalBuffer* buffer : buffers_defined_by_instruction) {
+    for (const BufferValue* buffer : buffers_defined_by_instruction) {
       if (IgnoreBuffer(buffer)) {
         continue;
       }
@@ -199,12 +199,12 @@ Status HeapSimulator::RunComputation(
       // we must be the last user of the buffer.
       bool shared = false;
       if (options_.may_reuse_operand_buffers) {
-        for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) {
+        for (const BufferValue* operand_buffer : operand_buffers_to_free) {
           if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) &&
               buffer->instruction()->opcode() != HloOpcode::kCopy &&
-              CanShareOperandBufferWithUser(
+              points_to_analysis.CanShareOperandBufferWithUser(
                   operand_buffer->instruction(), operand_buffer->index(),
-                  buffer->instruction(), buffer->index(), points_to_analysis)) {
+                  buffer->instruction(), buffer->index())) {
             VLOG(3) << "  Sharing: " << buffer->ToString() << " with "
                     << operand_buffer->ToString();
             ShareBuffer(buffer, operand_buffer, instruction);
@@ -248,11 +248,11 @@ Status HeapSimulator::RunComputation(
 
     // Free buffers that are no longer live.  This is the earliest point that we
     // can de-allocate; right after the last use of the buffer.
-    for (const LogicalBuffer* buffer : dead_buffers_to_free) {
+    for (const BufferValue* buffer : dead_buffers_to_free) {
       VLOG(3) << "  Freeing dead: " << buffer->ToString();
       Free(buffer, instruction);
     }
-    for (const LogicalBuffer* buffer : operand_buffers_to_free) {
+    for (const BufferValue* buffer : operand_buffers_to_free) {
       VLOG(3) << "  Freeing operand: " << buffer->ToString();
       Free(buffer, instruction);
     }
@@ -261,10 +261,10 @@ Status HeapSimulator::RunComputation(
   // Any remaining live buffers must be entry parameters or output source
   // buffers, which had a nullptr sentry added.  Free them now, in a
   // deterministic order.
-  std::vector<const LogicalBuffer*> to_free;
+  std::vector<const BufferValue*> to_free;
   to_free.reserve(live_buffers.size());
   for (const auto& buffer_pending : live_buffers) {
-    const LogicalBuffer* buffer = buffer_pending.first;
+    const BufferValue* buffer = buffer_pending.first;
     const FlatSet<const HloInstruction*>& pending = buffer_pending.second;
     CHECK_EQ(pending.size(), 1) << *buffer;
     CHECK(*pending.begin() == nullptr) << *buffer;
@@ -272,10 +272,10 @@ Status HeapSimulator::RunComputation(
   }
 
   std::sort(to_free.begin(), to_free.end(),
-            [](const LogicalBuffer* x, const LogicalBuffer* y) {
+            [](const BufferValue* x, const BufferValue* y) {
               return x->id() < y->id();
             });
-  for (const LogicalBuffer* buffer : to_free) {
+  for (const BufferValue* buffer : to_free) {
     VLOG(3) << "Freeing pending: " << buffer->ToString();
     Free(buffer, root);
   }
@@ -285,7 +285,7 @@ Status HeapSimulator::RunComputation(
 
 HeapSimulator::HeapSimulator(
     std::unique_ptr<HeapAlgorithm> algorithm,
-    const LogicalBuffer::SizeFunction& size_fn, const Options& options,
+    const BufferValue::SizeFunction& size_fn, const Options& options,
     const SequentialHloOrdering::HloModuleSequence* module_sequence)
     : no_fragmentation_stats_(MakeUnique<NoFragmentationStatsHeap>()),
       algorithm_(std::move(algorithm)),
@@ -297,7 +297,7 @@ HeapSimulator::HeapSimulator(
 
 HeapSimulator::~HeapSimulator() {}
 
-bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const {
+bool HeapSimulator::IgnoreBuffer(const BufferValue* buffer) const {
   // Buffers for constants are ignored unless the alloc_constants option is
   // set. Also ignore buffers that we're not meant to assign.
   //
@@ -311,7 +311,7 @@ bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const {
 }
 
 // Alloc always calls the underlying heap algorithm.
-void HeapSimulator::Alloc(const LogicalBuffer* buffer,
+void HeapSimulator::Alloc(const BufferValue* buffer,
                           const HloInstruction* instruction) {
   CHECK(allocated_buffers_.count(buffer) == 0)
       << "Alloc called on allocated buffer: " << *buffer;
@@ -331,7 +331,7 @@ void HeapSimulator::Alloc(const LogicalBuffer* buffer,
 // buffers whose group liveness has expired.  Shared group liveness is tracked
 // by maintaining a refcount; the Free call on the last buffer in the group
 // causes Free to be called on the underlying algorithm.
-void HeapSimulator::Free(const LogicalBuffer* buffer,
+void HeapSimulator::Free(const BufferValue* buffer,
                          const HloInstruction* instruction) {
   auto shared_it = shared_buffers_.find(buffer);
   if (shared_it != shared_buffers_.end()) {
@@ -362,8 +362,8 @@ void HeapSimulator::Free(const LogicalBuffer* buffer,
 // The 'buffer' must be a non-allocated, non-freed buffer, just like in calls to
 // Alloc.  The 'shared' buffer must be a previously allocated or shared buffer.
 // Both 'buffer' and 'shared' will be associated with the same SharedGroup.
-void HeapSimulator::ShareBuffer(const LogicalBuffer* buffer,
-                                const LogicalBuffer* shared,
+void HeapSimulator::ShareBuffer(const BufferValue* buffer,
+                                const BufferValue* shared,
                                 const HloInstruction* instruction) {
   CHECK_LE(size_fn_(*buffer), size_fn_(*shared))
       << "ShareBuffer oversized buffer" << *buffer << " shared: " << *shared;
@@ -374,7 +374,7 @@ void HeapSimulator::ShareBuffer(const LogicalBuffer* buffer,
   CHECK(freed_buffers_.count(shared) == 0)
       << "ShareBuffer called on freed shared buffer: " << *shared;
 
-  const LogicalBuffer* canonical = nullptr;
+  const BufferValue* canonical = nullptr;
   auto shared_it = shared_buffers_.find(shared);
   if (shared_it != shared_buffers_.end()) {
     // The 'shared' buffer already has a group; it might be the canonical, but
@@ -408,7 +408,7 @@ HeapSimulator::Result HeapSimulator::Finish() {
   // collecting statistics, e.g. NoFragmentationStatsHeap.
   if (!result.chunk_map.empty()) {
     for (const auto& share_pair : shared_buffers_) {
-      const LogicalBuffer* buffer = share_pair.first;
+      const BufferValue* buffer = share_pair.first;
       std::shared_ptr<SharedGroup> group = share_pair.second;
       if (buffer != group->canonical) {
         // The canonical must already exist in the chunk_map, since we called
@@ -437,9 +437,9 @@ HeapSimulator::Result HeapSimulator::Finish() {
 }
 
 void HeapSimulator::FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
-                                   const LogicalBuffer* buffer,
+                                   const BufferValue* buffer,
                                    const HloInstruction* instruction,
-                                   const LogicalBuffer* share_with_canonical) {
+                                   const BufferValue* share_with_canonical) {
   HeapSimulatorTrace::Event* event = debug_trace_.add_events();
   event->set_kind(kind);
   event->set_buffer_id(buffer->id());
@@ -453,14 +453,14 @@ void HeapSimulator::FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
   }
 }
 
-void NoFragmentationStatsHeap::Alloc(const LogicalBuffer* buffer, int64 size) {
+void NoFragmentationStatsHeap::Alloc(const BufferValue* buffer, int64 size) {
   current_heap_size_ += size;
   if (current_heap_size_ > max_heap_size_) {
     max_heap_size_ = current_heap_size_;
   }
 }
 
-void NoFragmentationStatsHeap::Free(const LogicalBuffer* buffer, int64 size) {
+void NoFragmentationStatsHeap::Free(const BufferValue* buffer, int64 size) {
   current_heap_size_ -= size;
 }
 
@@ -472,12 +472,12 @@ HeapSimulator::Result NoFragmentationStatsHeap::Finish() {
   return result;
 }
 
-void DecreasingSizeRunsHeap::Alloc(const LogicalBuffer* buffer, int64 size) {
+void DecreasingSizeRunsHeap::Alloc(const BufferValue* buffer, int64 size) {
   SetMode(kAlloc);
   run_.emplace_back(Op{buffer, size});
 }
 
-void DecreasingSizeRunsHeap::Free(const LogicalBuffer* buffer, int64 size) {
+void DecreasingSizeRunsHeap::Free(const BufferValue* buffer, int64 size) {
   CHECK(mode_ != kInit) << "Free called on empty heap: " << *buffer;
   SetMode(kFree);
   run_.emplace_back(Op{buffer, size});
@@ -518,7 +518,7 @@ void DecreasingSizeRunsHeap::CallAndDrainRun() {
   run_.clear();
 }
 
-void LazyBestFitHeap::Alloc(const LogicalBuffer* buffer, int64 size) {
+void LazyBestFitHeap::Alloc(const BufferValue* buffer, int64 size) {
   // Degenerate case: 0-sized buffers are always allocated at offset 0.
   if (size == 0) {
     result_.chunk_map.emplace(buffer, Chunk{0, 0});
@@ -586,7 +586,7 @@ void LazyBestFitHeap::Alloc(const LogicalBuffer* buffer, int64 size) {
   result_.chunk_map.emplace(buffer, Chunk{kLazyAllocOffset, size});
 }
 
-void LazyBestFitHeap::Free(const LogicalBuffer* buffer, int64 size) {
+void LazyBestFitHeap::Free(const BufferValue* buffer, int64 size) {
   auto alloc_it = result_.chunk_map.find(buffer);
   CHECK(alloc_it != result_.chunk_map.end())
       << "Free called on non-allocated buffer: " << *buffer;
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index 636f19dd39f09721bd82fc4b44785f196f281ad7..8b2b43a37a5c41d334e5338c6a6fad160f03a51e 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -21,11 +21,12 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/compiler/xla/service/buffer_value.h"
+#include "tensorflow/compiler/xla/service/buffer_value_containers.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
-#include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -43,7 +44,7 @@ class HeapAlgorithm;
 // don't need to return the assignment of buffer offsets until the very end.
 class HeapSimulator {
  public:
-  // Chunk represents a contiguous piece of memory.  Each LogicalBuffer will be
+  // Chunk represents a contiguous piece of memory.  Each BufferValue will be
   // associated with a chunk in the assignment result.
   struct Chunk {
     int64 offset;
@@ -55,7 +56,7 @@ class HeapSimulator {
   // Result represents the result of the heap simulation.
   struct Result {
     // The assignment of buffers to chunks.
-    tensorflow::gtl::FlatMap<const LogicalBuffer*, Chunk> chunk_map;
+    tensorflow::gtl::FlatMap<const BufferValue*, Chunk> chunk_map;
 
     // The total size in bytes of the heap, containing all assigned chunks.
     int64 heap_size = 0;
@@ -81,7 +82,7 @@ class HeapSimulator {
     bool alloc_constants;
     // If 'buffers_to_assign' is provided, only those buffers are assigned
     // offsets, otherwise all buffers defined by the instructions are assigned.
-    const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign;
+    const BufferValueFlatSet* buffers_to_assign;
   };
 
   // Run the heap simulation with the given algorithm, assuming the given
@@ -97,7 +98,7 @@ class HeapSimulator {
       std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
       const SequentialHloOrdering::HloModuleSequence& module_sequence,
       const TuplePointsToAnalysis& points_to_analysis,
-      const LogicalBuffer::SizeFunction& size_fn,
+      const BufferValue::SizeFunction& size_fn,
       const Options& options = Options());
 
   // Same as above, but runs on a single computation. The 'instruction_sequence'
@@ -109,7 +110,7 @@ class HeapSimulator {
       const HloComputation& computation,
       const std::vector<const HloInstruction*>& instruction_sequence,
       const TuplePointsToAnalysis& points_to_analysis,
-      const LogicalBuffer::SizeFunction& size_fn,
+      const BufferValue::SizeFunction& size_fn,
       const Options& options = Options());
 
  private:
@@ -118,7 +119,7 @@ class HeapSimulator {
   // be run recursively. I.e. the simulation is run over the whole module.
   HeapSimulator(
       std::unique_ptr<HeapAlgorithm> algorithm,
-      const LogicalBuffer::SizeFunction& size_fn, const Options& options,
+      const BufferValue::SizeFunction& size_fn, const Options& options,
       const SequentialHloOrdering::HloModuleSequence* module_sequence);
   ~HeapSimulator();
 
@@ -127,21 +128,21 @@ class HeapSimulator {
       const std::vector<const HloInstruction*>& instruction_sequence,
       const TuplePointsToAnalysis& points_to_analysis);
 
-  bool IgnoreBuffer(const LogicalBuffer* buffer) const;
-  void Alloc(const LogicalBuffer* buffer, const HloInstruction* instruction);
-  void Free(const LogicalBuffer* buffer, const HloInstruction* instruction);
-  void ShareBuffer(const LogicalBuffer* buffer, const LogicalBuffer* shared,
+  bool IgnoreBuffer(const BufferValue* buffer) const;
+  void Alloc(const BufferValue* buffer, const HloInstruction* instruction);
+  void Free(const BufferValue* buffer, const HloInstruction* instruction);
+  void ShareBuffer(const BufferValue* buffer, const BufferValue* shared,
                    const HloInstruction* instruction);
   Result Finish();
 
   void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
-                      const LogicalBuffer* buffer,
+                      const BufferValue* buffer,
                       const HloInstruction* instruction,
-                      const LogicalBuffer* shared_with_canonical);
+                      const BufferValue* shared_with_canonical);
 
   const std::unique_ptr<HeapAlgorithm> no_fragmentation_stats_;
   const std::unique_ptr<HeapAlgorithm> algorithm_;
-  const LogicalBuffer::SizeFunction size_fn_;
+  const BufferValue::SizeFunction size_fn_;
   const Options options_;
   const SequentialHloOrdering::HloModuleSequence* module_sequence_;
 
@@ -160,15 +161,15 @@ class HeapSimulator {
   // The shared_buffers_ map associates each shared buffer (including the
   // canonical) to its SharedGroup control block.
   struct SharedGroup {
-    const LogicalBuffer* canonical = nullptr;
+    const BufferValue* canonical = nullptr;
     int64 refcount = 0;
   };
-  tensorflow::gtl::FlatMap<const LogicalBuffer*, std::shared_ptr<SharedGroup>>
+  tensorflow::gtl::FlatMap<const BufferValue*, std::shared_ptr<SharedGroup>>
       shared_buffers_;
 
   // Hold some sets for error-checking the sequence of Alloc and Free calls.
-  tensorflow::gtl::FlatSet<const LogicalBuffer*> allocated_buffers_;
-  tensorflow::gtl::FlatSet<const LogicalBuffer*> freed_buffers_;
+  tensorflow::gtl::FlatSet<const BufferValue*> allocated_buffers_;
+  tensorflow::gtl::FlatSet<const BufferValue*> freed_buffers_;
 
   // Debugging information filled in while the heap simulator runs.
   HeapSimulatorTrace debug_trace_;
@@ -186,10 +187,10 @@ class HeapAlgorithm {
   virtual ~HeapAlgorithm() = default;
 
   // Alloc allocates a buffer of 'size' bytes.
-  virtual void Alloc(const LogicalBuffer* buffer, int64 size) = 0;
+  virtual void Alloc(const BufferValue* buffer, int64 size) = 0;
 
   // Free de-allocates a previously allocated buffer.
-  virtual void Free(const LogicalBuffer* buffer, int64 size) = 0;
+  virtual void Free(const BufferValue* buffer, int64 size) = 0;
 
   // Finish collects the buffer offset assignment results.  Free may only be
   // called once, after the Alloc and Free calls.
@@ -205,8 +206,8 @@ class NoFragmentationStatsHeap : public HeapAlgorithm {
   NoFragmentationStatsHeap() = default;
   ~NoFragmentationStatsHeap() override = default;
 
-  void Alloc(const LogicalBuffer* buffer, int64 size) override;
-  void Free(const LogicalBuffer* buffer, int64 size) override;
+  void Alloc(const BufferValue* buffer, int64 size) override;
+  void Free(const BufferValue* buffer, int64 size) override;
   Result Finish() override;
 
  private:
@@ -223,14 +224,14 @@ class DecreasingSizeRunsHeap : public HeapAlgorithm {
       : algorithm_(std::move(algorithm)) {}
   ~DecreasingSizeRunsHeap() override {}
 
-  void Alloc(const LogicalBuffer* buffer, int64 size) override;
-  void Free(const LogicalBuffer* buffer, int64 size) override;
+  void Alloc(const BufferValue* buffer, int64 size) override;
+  void Free(const BufferValue* buffer, int64 size) override;
   Result Finish() override;
 
  private:
   // A single Alloc or Free operation that we've buffered in run_.
   struct Op {
-    const LogicalBuffer* buffer;
+    const BufferValue* buffer;
     int64 size;
   };
 
@@ -266,8 +267,8 @@ class LazyBestFitHeap : public HeapAlgorithm {
   LazyBestFitHeap(int64 alignment) : alignment_(alignment) {}
   ~LazyBestFitHeap() override {}
 
-  void Alloc(const LogicalBuffer* buffer, int64 size) override;
-  void Free(const LogicalBuffer* buffer, int64 size) override;
+  void Alloc(const BufferValue* buffer, int64 size) override;
+  void Free(const BufferValue* buffer, int64 size) override;
   Result Finish() override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index fd56a603bb6f849b1c1f1578fe7395d9b372e2d5..6271652412c2979ff926702f12722102344b0dfb 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
-#include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/service/hlo_value.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
@@ -39,7 +39,7 @@ const char kFree[] = "Free";
 const char kFinish[] = "Finish";
 
 // CallSequence records a sequence of Alloc/Free/Finish calls.
-using CallSequence = std::vector<std::pair<string, const LogicalBuffer*>>;
+using CallSequence = std::vector<std::pair<string, const BufferValue*>>;
 
 // HeapCallRecorder is a dummy heap algorithm that simply records its calls.
 class HeapCallRecorder : public HeapAlgorithm {
@@ -47,7 +47,7 @@ class HeapCallRecorder : public HeapAlgorithm {
   explicit HeapCallRecorder(CallSequence* calls) : calls_(calls) {}
   ~HeapCallRecorder() override {}
 
-  void Alloc(const LogicalBuffer* buffer, int64 size) override {
+  void Alloc(const BufferValue* buffer, int64 size) override {
     calls_->emplace_back(kAlloc, buffer);
     // Instead of assigning a real offset, we set the cardinality of the Alloc
     // call.  This isn't a valid assignment, but allows us to easily test for
@@ -55,7 +55,7 @@ class HeapCallRecorder : public HeapAlgorithm {
     const int64 offset = result_.chunk_map.size();
     result_.chunk_map.emplace(buffer, Chunk{offset, size});
   }
-  void Free(const LogicalBuffer* buffer, int64 size) override {
+  void Free(const BufferValue* buffer, int64 size) override {
     calls_->emplace_back(kFree, buffer);
   }
   Result Finish() override {
@@ -118,7 +118,7 @@ class HeapSimulatorTracker {
 
     // Hack the size_fn so that it returns a decreasing value as we step through
     // the sequence. This lets us ensure the Alloc calls are in the sequence
-    // order. The Free calls are sorted by LogicalBuffer.id, which is at least
+    // order. The Free calls are sorted by BufferValue.id, which is at least
     // deterministic.
     auto size_fn = [&reverse_position](const BufferValue& buffer) {
       return reverse_position[buffer.instruction()];
@@ -133,8 +133,8 @@ class HeapSimulatorTracker {
   HloModule* module() { return module_.get(); }
 
   // Returns the buffer defined at the given instruction and index.
-  const LogicalBuffer* BufferAt(const HloInstruction* instruction,
-                                const ShapeIndex& index) const {
+  const BufferValue* BufferAt(const HloInstruction* instruction,
+                              const ShapeIndex& index) const {
     return points_to_analysis_->GetBufferDefinedAt(instruction, index)
         .ConsumeValueOrDie();
   }
@@ -150,8 +150,8 @@ class HeapSimulatorTracker {
                            const ShapeIndex& index_a,
                            const HloInstruction* instruction_b,
                            const ShapeIndex& index_b) {
-    const LogicalBuffer* a = BufferAt(instruction_a, index_a);
-    const LogicalBuffer* b = BufferAt(instruction_b, index_b);
+    const BufferValue* a = BufferAt(instruction_a, index_a);
+    const BufferValue* b = BufferAt(instruction_b, index_b);
     EXPECT_EQ(result_.chunk_map[a].offset, result_.chunk_map[b].offset)
         << *a << ", " << *b;
   }
@@ -525,7 +525,7 @@ TEST_F(HeapSimulatorTest, WholeModule) {
       // Now the final cond less-than buffer is allocated.
       {kAlloc, tracker.BufferAt(cond_lt, {})},
 
-      // The order of the remaining Free calls is based on the LogicalBuffer.id,
+      // The order of the remaining Free calls is based on the BufferValue.id,
       // which is deterministic, but not obvious.
       {kFree, tracker.BufferAt(param, {})},
       {kFree, tracker.BufferAt(param, {0})},
@@ -547,40 +547,40 @@ TEST_F(HeapSimulatorTest, WholeModule) {
 class HeapAlgorithmTestBase : public ::testing::Test {
  protected:
   HeapAlgorithmTestBase() : builder_("heap_simulator_test") {
-    buffer_a_ = DummyLogicalBuffer();
-    buffer_b_ = DummyLogicalBuffer();
-    buffer_c_ = DummyLogicalBuffer();
-    buffer_d_ = DummyLogicalBuffer();
-    buffer_e_ = DummyLogicalBuffer();
-    buffer_f_ = DummyLogicalBuffer();
-    buffer_g_ = DummyLogicalBuffer();
-    buffer_h_ = DummyLogicalBuffer();
-    buffer_i_ = DummyLogicalBuffer();
+    buffer_a_ = DummyBufferValue();
+    buffer_b_ = DummyBufferValue();
+    buffer_c_ = DummyBufferValue();
+    buffer_d_ = DummyBufferValue();
+    buffer_e_ = DummyBufferValue();
+    buffer_f_ = DummyBufferValue();
+    buffer_g_ = DummyBufferValue();
+    buffer_h_ = DummyBufferValue();
+    buffer_i_ = DummyBufferValue();
   }
   ~HeapAlgorithmTestBase() override {}
 
-  const LogicalBuffer* buffer_a_;
-  const LogicalBuffer* buffer_b_;
-  const LogicalBuffer* buffer_c_;
-  const LogicalBuffer* buffer_d_;
-  const LogicalBuffer* buffer_e_;
-  const LogicalBuffer* buffer_f_;
-  const LogicalBuffer* buffer_g_;
-  const LogicalBuffer* buffer_h_;
-  const LogicalBuffer* buffer_i_;
+  const BufferValue* buffer_a_;
+  const BufferValue* buffer_b_;
+  const BufferValue* buffer_c_;
+  const BufferValue* buffer_d_;
+  const BufferValue* buffer_e_;
+  const BufferValue* buffer_f_;
+  const BufferValue* buffer_g_;
+  const BufferValue* buffer_h_;
+  const BufferValue* buffer_i_;
 
  private:
-  // Create a dummy LogicalBuffer to pass to the heap algorithm.
-  const LogicalBuffer* DummyLogicalBuffer() {
-    const LogicalBuffer::Id id = buffers_.size();
+  // Create a dummy BufferValue to pass to the heap algorithm.
+  const BufferValue* DummyBufferValue() {
+    const BufferValue::Id id = buffers_.size();
     auto const0 = builder_.AddInstruction(
         HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
-    buffers_.emplace_back(MakeUnique<LogicalBuffer>(const0, ShapeIndex{}, id));
+    buffers_.emplace_back(MakeUnique<HloValue>(id, const0, ShapeIndex{}));
     return buffers_.back().get();
   }
 
   HloComputation::Builder builder_;
-  std::vector<std::unique_ptr<LogicalBuffer>> buffers_;
+  std::vector<std::unique_ptr<BufferValue>> buffers_;
 };
 
 class NoFragmentationStatsHeapTest : public HeapAlgorithmTestBase {};
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index aa6860880b7a1308d3ecabb52318daa7d2852af2..1f7c1cffd324ad2f4e4cdf11046c8459b8ceb6d5 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -147,6 +147,9 @@ message HloInstructionProto {
   repeated int64 called_computation_ids = 38;
 
   xla.OpSharding sharding = 40;
+
+  // Backend configuration for the instruction. Has backend-specific meaning.
+  string backend_config = 43;
 }
 
 // Serialization of HloComputation.
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 594413e88fb26e86b198d08b2e4db77fad671348..63c3dc4a5932f754a9ccdd70d03c999fe528a448 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -347,6 +347,11 @@ std::list<HloComputation*> HloComputation::MakeEmbeddedComputationsList()
   // To avoid special handling of this computation, cast away const of
   // 'this'. 'this' is immediately removed from the post order after
   // construction.
+  //
+  // TODO(b/78350259): This violates const-correctness, since while the original
+  // computation is not returned, we still retrieve non-const computations from
+  // a const one. Consider also avoiding const for HloComputation, or review XLA
+  // for const-correctness of non-HloInstruction* types like this.
   ComputeComputationPostOrder(const_cast<HloComputation*>(this), &visited,
                               &post_order);
 
@@ -360,25 +365,38 @@ std::list<HloComputation*> HloComputation::MakeEmbeddedComputationsList()
 string HloComputation::ToString(const HloPrintOptions& options) const {
   std::ostringstream s;
   for (int i = 0; i < options.indent_amount(); i++) {
-    s << "    ";
+    s << "  ";
   }
-  if (options.print_percent()) {
-    s << "%";
+
+  if (!options.is_in_nested_computation()) {
+    if (options.print_percent()) {
+      s << "%";
+    }
+    s << name() << " ";
   }
-  s << name();
+
   if (options.print_program_shape()) {
-    s << " " << ShapeUtil::HumanString(ComputeProgramShape());
-  }
-  s << " {\n";
-  for (const HloInstruction* instruction : MakeInstructionPostOrder()) {
-    for (int i = 0; i < options.indent_amount(); i++) {
-      s << "    ";
+    s << ShapeUtil::HumanString(ComputeProgramShape()) << " ";
+  }
+  s << "{\n";
+  {
+    // Print the instructions in this computation.
+    HloPrintOptions new_options = options;
+    new_options.set_indent_amount(options.indent_amount() + 1)
+        .set_is_in_nested_computation(true);
+    CanonicalNameMap name_map;
+    for (const HloInstruction* instruction : MakeInstructionPostOrder()) {
+      for (int i = 0; i < new_options.indent_amount(); i++) {
+        s << "  ";
+      }
+      s << (instruction == root_instruction_ ? "ROOT " : "")
+        << instruction->ToStringWithCanonicalNameMap(new_options, &name_map)
+        << "\n";
     }
-    s << "  " << (instruction == root_instruction_ ? "ROOT " : "")
-      << instruction->ToString(options) << "\n";
   }
+
   for (int i = 0; i < options.indent_amount(); i++) {
-    s << "    ";
+    s << "  ";
   }
   s << "}";
   return s.str();
@@ -402,27 +420,37 @@ HloComputationProto HloComputation::ToProto() const {
 
 /* static */ StatusOr<std::unique_ptr<HloComputation>>
 HloComputation::CreateFromProto(
-    HloModule* module, const HloComputationProto& proto,
+    const HloComputationProto& proto,
     const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map) {
-  std::vector<std::unique_ptr<HloInstruction>> instructions;
   tensorflow::gtl::FlatMap<int64, HloInstruction*> instruction_map;
+  tensorflow::gtl::FlatMap<HloInstruction*, int64> to_proto_id;
+  std::vector<std::unique_ptr<HloInstruction>> instructions;
   int64 parameter_count = 0;
   for (const HloInstructionProto& instruction_proto : proto.instructions()) {
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloInstruction> instruction,
-        HloInstruction::CreateFromProto(module, instruction_proto,
-                                        instruction_map, computation_map));
+        HloInstruction::CreateFromProto(instruction_proto, instruction_map,
+                                        computation_map));
     if (instruction->opcode() == HloOpcode::kParameter) {
       parameter_count++;
     }
     TF_RET_CHECK(!ContainsKey(instruction_map, instruction_proto.id()));
     instruction_map[instruction_proto.id()] = instruction.get();
+    to_proto_id[instruction.get()] = instruction_proto.id();
     instructions.push_back(std::move(instruction));
   }
 
   TF_RET_CHECK(proto.root_id() != -1);
   TF_RET_CHECK(ContainsKey(instruction_map, proto.root_id()));
   HloInstruction* root = instruction_map.at(proto.root_id());
+
+  // Sort the instructions in the proto id's order.
+  std::sort(instructions.begin(), instructions.end(),
+            [&](const std::unique_ptr<HloInstruction>& a,
+                const std::unique_ptr<HloInstruction>& b) {
+              return to_proto_id[a.get()] < to_proto_id[b.get()];
+            });
+
   return WrapUnique(new HloComputation(proto.name(), parameter_count,
                                        &instructions, root,
                                        /*fusion_instruction=*/nullptr));
@@ -723,18 +751,25 @@ Status HloComputation::Accept(
   return this->Accept(&visitor);
 }
 
-std::unique_ptr<HloComputation> HloComputation::Clone(const string& suffix,
-                                                      HloModule* module) {
+std::unique_ptr<HloComputation> HloComputation::Clone(
+    const string& suffix, HloModule* module,
+    HloInstruction::CloneMap* clone_map) {
   return CloneWithReplacements(
       /*replacements=*/std::unordered_map<const HloInstruction*,
                                           std::unique_ptr<HloInstruction>>(),
-      module, suffix);
+      module, clone_map, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
     std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
         replacements,
-    HloModule* module, const string& suffix) {
+    HloModule* module, HloInstruction::CloneMap* clone_map,
+    const string& suffix) {
+  HloInstruction::CloneMap local_clone_map;
+  if (clone_map == nullptr) {
+    clone_map = &local_clone_map;
+  }
+
   // Look up instr in the replacements map, and return either the replacement,
   // or instr, if the replacement isn't present.
   //
@@ -756,24 +791,19 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
     }
   }
 
-  std::unordered_map<HloInstruction*, HloInstruction*> clone_map;
   std::vector<std::unique_ptr<HloInstruction>> instructions;
   std::unique_ptr<HloInstruction> new_instr = nullptr;
   for (auto instr : postorder) {
     std::vector<HloInstruction*> new_operands;
     for (auto operand : instr->operands()) {
       auto replaced_operand = replace(operand);
-      // If replaced_operand is null, that means 'replacements' asked us not to
-      // include operand in the new computation.  But we can't do that, because
-      // operand is used by instr.
       CHECK_NE(replaced_operand, nullptr)
-          << "replacements map tried to eliminate a used instruction "
-          << operand->ToString() << ", used by " << instr->ToString();
-      new_operands.push_back(FindOrDie(clone_map, replaced_operand));
+          << "Replacements map specifies to leave out " << operand->ToString()
+          << ", but it is used by " << instr->ToString() << ".";
+      new_operands.push_back(FindOrDie(*clone_map, replaced_operand));
     }
-    new_instr =
-        instr->CloneWithNewOperands(instr->shape(), new_operands, module);
-    InsertOrDie(&clone_map, instr, new_instr.get());
+    new_instr = instr->CloneWithNewOperands(instr->shape(), new_operands,
+                                            module, clone_map);
     instructions.push_back(std::move(new_instr));
   }
   Builder builder(name() + "." + suffix);
@@ -781,27 +811,24 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
     builder.AddInstruction(std::move(instr));
   }
   auto result = builder.Build(
-      /*root_instruction=*/FindOrDie(clone_map, replace(root_instruction())));
+      /*root_instruction=*/FindOrDie(*clone_map, replace(root_instruction())));
 
   // Clone control dependencies.
   for (auto instr : postorder) {
-    HloInstruction* new_instr = FindOrDie(clone_map, instr);
+    HloInstruction* new_instr = FindOrDie(*clone_map, instr);
     for (auto successor : instr->control_successors()) {
       auto replaced_successor = replace(successor);
-
-      // successor may not be in clone_map, because it might have been
-      // removed by the replacements map.
-      if (replaced_successor == nullptr) {
-        continue;
-      }
+      CHECK_NE(replaced_successor, nullptr)
+          << "Replacements map specifies to leave out " << successor->ToString()
+          << ", but it is control-depended-on by " << instr->ToString() << ".";
 
       TF_CHECK_OK(new_instr->AddControlDependencyTo(
-          FindOrDie(clone_map, replaced_successor)));
+          FindOrDie(*clone_map, replaced_successor)));
     }
   }
 
   // We cloned the elements of 'replacements', so they're all going to be
-  // destroyed.  HloInstructions need to be detached from their operands before
+  // destroyed. HloInstructions need to be detached from their operands before
   // they're destroyed, otherwise they stick around in the operands' users lists
   // and cause use-after-frees.
   for (auto& kv : replacements) {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index 9d3f6e9a2c2efd97681a22b6b0f6d929afc553de..8bc97df0365a32bdc89d4636ad4c7076ffb08296 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -49,9 +49,20 @@ class HloModule;
 
 // Describes a computation at the HLO level.
 //
-// An HloComputation contains a directed acyclic graph of HLO instructions. The
-// computation has a single root instruction which produces the output of the
-// computation.
+// You can think of an HloComputation like a function.  It has some inputs
+// (parameters) and returns exactly one value (the value of its root node).  If
+// you want to return multiple values, you can return a tuple.
+//
+// The instructions inside of a computation do not have an explicit total order.
+// Instead, they have a partial order determined by their data and control
+// dependencies.
+//
+// An HloModule contains one "entry computation" -- this is like main() in a C
+// program.  Every other computation inside of a module is attached to one or
+// more HloInstructions, as a "nested computation".  For example, the kMap
+// instruction has a nested computation and "applies" it to every element of its
+// input, elementwise.  (That is, the input [x, y, z] is transformed to [f(x),
+// f(y), f(z)].)
 class HloComputation {
  public:
   // Builder class for HloComputation.
@@ -157,14 +168,12 @@ class HloComputation {
 
   // Creates a computation from the given proto. Arguments:
   //
-  //   module: the module which will contain the computation. The newly created
-  //     computation is *not* added to the module, however.
   //   proto: the proto to convert from.
   //   computation_map: a map from computation id to HloComputation*. This map
   //     must contain all computations which the newly constructed computation
   //     calls.
   static StatusOr<std::unique_ptr<HloComputation>> CreateFromProto(
-      HloModule* module, const HloComputationProto& proto,
+      const HloComputationProto& proto,
       const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map);
 
   // Gets the instructions in this computation.
@@ -291,11 +300,17 @@ class HloComputation {
       const std::function<Status(const HloInstruction*)>& visitor_func) const;
 
   // Returns a deep copy of this computation including all instructions.
-  // If the module pointer is not nullptr, it will be the module where
-  // the cloned computations will be added to (in order to support deep
-  // cloning).
-  std::unique_ptr<HloComputation> Clone(const string& suffix = "clone",
-                                        HloModule* module = nullptr);
+  //
+  // If the module pointer is not nullptr, then the cloned computations will be
+  // added to this module in order to support deep cloning. Otherwise the module
+  // of the computation is used.
+  //
+  // If clone_map is not nullptr, then each original instruction that is cloned
+  // will be inserted and map to its clone. clone_map should not already contain
+  // any of the instructions to clone.
+  std::unique_ptr<HloComputation> Clone(
+      const string& suffix = "clone", HloModule* module = nullptr,
+      HloInstruction::CloneMap* clone_map = nullptr);
 
   // Like Clone(), but if an instruction is present in replacement_map, we use
   // the map's value to replace that instruction in the cloned computation.
@@ -305,7 +320,9 @@ class HloComputation {
   std::unique_ptr<HloComputation> CloneWithReplacements(
       std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
           replacements,
-      HloModule* module = nullptr, const string& suffix = "clone");
+      HloModule* module = nullptr,
+      HloInstruction::CloneMap* clone_map = nullptr,
+      const string& suffix = "clone");
 
   // Returns true if the given instruction can be removed from the computation.
   // Parameter instructions cannot be removed without violating invariants of
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 7b7588f4ba9aa622677db6f9d5022cc8cc029e04..25469a54c48f4f5cab478aba929f1cc18de8b81f 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -550,6 +550,108 @@ TEST_F(HloComputationTest, Reachability) {
   EXPECT_FALSE(reachability->IsReachable(constant2, copy));
 }
 
+TEST_F(HloComputationTest, Stringification) {
+  const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
+  const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
+  const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20});
+  const Shape sout = ShapeUtil::MakeShape(F32, {5, 20});
+
+  HloComputation::Builder builder("TransposeDot");
+  HloInstruction* x =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x"));
+  HloInstruction* y =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
+  HloInstruction* reshape =
+      builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  builder.AddInstruction(
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
+
+  auto options = HloPrintOptions().set_print_metadata(false);
+  EXPECT_EQ(computation->ToString(options),
+            R"(%TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] {
+  %x = f32[5,10]{1,0} parameter(0)
+  %y = f32[20,10]{1,0} parameter(1)
+  %transpose = f32[10,20]{1,0} transpose(f32[20,10]{1,0} %y), dimensions={1,0}
+  ROOT %dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} %transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})");
+}
+
+TEST_F(HloComputationTest, StringificationIndent) {
+  const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
+  const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
+  const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20});
+  const Shape sout = ShapeUtil::MakeShape(F32, {5, 20});
+
+  HloComputation::Builder builder("TransposeDot");
+  HloInstruction* x =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x"));
+  HloInstruction* y =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
+  HloInstruction* reshape =
+      builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  builder.AddInstruction(
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
+
+  auto options =
+      HloPrintOptions().set_print_metadata(false).set_indent_amount(2);
+  EXPECT_EQ(computation->ToString(options),
+            R"(    %TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] {
+      %x = f32[5,10]{1,0} parameter(0)
+      %y = f32[20,10]{1,0} parameter(1)
+      %transpose = f32[10,20]{1,0} transpose(f32[20,10]{1,0} %y), dimensions={1,0}
+      ROOT %dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} %transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    })");
+}
+
+TEST_F(HloComputationTest, StringificationCanonical) {
+  const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
+  const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
+  const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20});
+  const Shape sout = ShapeUtil::MakeShape(F32, {5, 20});
+
+  HloComputation::Builder builder("TransposeDot");
+  HloInstruction* x =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x"));
+  HloInstruction* y =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
+  HloInstruction* reshape =
+      builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  builder.AddInstruction(
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
+
+  auto options = HloPrintOptions().set_print_metadata(false);
+  EXPECT_EQ(computation->ToString(options),
+            R"(%TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] {
+  %x = f32[5,10]{1,0} parameter(0)
+  %y = f32[20,10]{1,0} parameter(1)
+  %transpose = f32[10,20]{1,0} transpose(f32[20,10]{1,0} %y), dimensions={1,0}
+  ROOT %dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} %transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})");
+
+  options = HloPrintOptions().Canonical();
+  EXPECT_EQ(computation->ToString(options), R"(TransposeDot {
+  tmp_0 = f32[5,10]{1,0} parameter(0)
+  tmp_1 = f32[20,10]{1,0} parameter(1)
+  tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0}
+  ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})");
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index 7b552ee5b1798c4c7e24884a392c5982d7fb17ff..5d05ccfc0b223d8749a2577ba1bf96b1ab3e761b 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -149,7 +149,7 @@ TEST_F(HloConstantFoldingTest, Slice) {
   const int64 slice_limits[] = {10, 8, 6, 5, 9};
   const int64 slice_strides[] = {1, 1, 1, 1, 1};
   TF_ASSERT_OK_AND_ASSIGN(auto literal,
-                          LiteralTestUtil::CreateRandomLiteral<F32>(
+                          Literal::CreateRandomLiteral<F32>(
                               ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
   HloInstruction* literal_instruction = builder.AddInstruction(
       HloInstruction::CreateConstant(std::move(literal)));
@@ -172,7 +172,7 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) {
   HloComputation::Builder builder(TestName());
   const int64 dimensions[] = {11, 8, 7, 5, 9};
   TF_ASSERT_OK_AND_ASSIGN(auto literal,
-                          LiteralTestUtil::CreateRandomLiteral<F32>(
+                          Literal::CreateRandomLiteral<F32>(
                               ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
   auto literal_clone = literal->Literal::CloneToUnique();
   HloInstruction* literal_instruction = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 44e4f75f75b275653e1a07111943843fc6f78b33..94c9c7eabcc99d4cf61f535925c068a9b55ed136 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -142,19 +142,25 @@ Status HloCostAnalysis::HandleReducePrecision(const HloInstruction* hlo) {
 }
 
 Status HloCostAnalysis::HandleParameter(const HloInstruction*) {
+  current_should_compute_bottleneck_time_ = false;
   current_properties_[kBytesAccessedKey] = 0;
+  current_properties_[kOptimalSecondsKey] = 0;
   return Status::OK();
 }
 
 Status HloCostAnalysis::HandleConstant(const HloInstruction*) {
+  current_should_compute_bottleneck_time_ = false;
   current_properties_[kBytesAccessedKey] = 0;
+  current_properties_[kOptimalSecondsKey] = 0;
   return Status::OK();
 }
 
 Status HloCostAnalysis::HandleGetTupleElement(const HloInstruction*) {
   // GetTupleElement forwards a pointer and does not touch each element in the
   // output.
+  current_should_compute_bottleneck_time_ = false;
   current_properties_[kBytesAccessedKey] = 0;
+  current_properties_[kOptimalSecondsKey] = 0;
   return Status::OK();
 }
 
@@ -329,6 +335,7 @@ Status HloCostAnalysis::HandleSelectAndScatter(
 Status HloCostAnalysis::HandleBitcast(const HloInstruction*) {
   // A bitcast does no computation and touches no memory.
   current_properties_[kBytesAccessedKey] = 0;
+  current_properties_[kOptimalSecondsKey] = 0;
   return Status::OK();
 }
 
@@ -555,11 +562,13 @@ Status HloCostAnalysis::HandleCall(const HloInstruction* call) {
 }
 
 Status HloCostAnalysis::HandleCustomCall(const HloInstruction*) {
-  // We can't do anything sane with CustomCalls, since we don't know what they
-  // do, and returning an error status will stop iteration over this
-  // computation, which is probably also not what we want.  So just punt and
-  // return OK.  This will cause all of the properties to be reported as 0,
-  // which is fine.
+  // Mark applicable fields as "unknown", since we don't know what CustomCall
+  // does.  This is better than returning an error, which would stop iteration,
+  // and therefore would prevent us from getting *any* stats for a computation
+  // which contains a CustomCall.
+  current_properties_[kOptimalSecondsKey] = -1;
+  current_properties_[kBytesAccessedKey] = -1;
+  current_properties_[kFlopsKey] = -1;
   current_should_compute_bottleneck_time_ = false;
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
index 9a89888480b8c79dfb1f79a50e9686bf45aa49b3..0fb65c845a6d4407c81171f6c1569fee98b1d16d 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -162,6 +162,17 @@ StatusOr<HloInstruction*> MakeConcatHlo(ArraySlice<HloInstruction*> operands,
       HloInstruction::CreateConcatenate(concat_shape, operands, dimension));
 }
 
+StatusOr<HloInstruction*> MakeDotHlo(HloInstruction* lhs, HloInstruction* rhs,
+                                     const DotDimensionNumbers& dim_numbers) {
+  HloComputation* computation = lhs->parent();
+  CHECK_EQ(computation, rhs->parent());
+  TF_ASSIGN_OR_RETURN(
+      Shape dot_shape,
+      ShapeInference::InferDotOpShape(lhs->shape(), rhs->shape(), dim_numbers));
+  return computation->AddInstruction(
+      HloInstruction::CreateDot(dot_shape, lhs, rhs, dim_numbers));
+}
+
 StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
   CHECK_GT(n, 0);
 
@@ -269,7 +280,7 @@ StatusOr<HloInstruction*> BroadcastZeros(
 StatusOr<std::unique_ptr<HloComputation>> CreateComputationWithSignature(
     ArraySlice<const Shape*> domain, const Shape& range,
     tensorflow::StringPiece name) {
-  HloComputation::Builder b(name.ToString());
+  HloComputation::Builder b{std::string(name)};
   int64 param_idx = 0;
   for (const Shape* param_shape : domain) {
     b.AddInstruction(HloInstruction::CreateParameter(
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h
index c9a7361a6af0c2a0839c59a0ea695ec1b9a98bd4..49b1402d689a74874e34423a1832a0b6aa15f469 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.h
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h
@@ -97,6 +97,11 @@ StatusOr<HloInstruction*> MakeGetTupleElementHlo(HloInstruction* operand,
 StatusOr<HloInstruction*> MakeConcatHlo(
     tensorflow::gtl::ArraySlice<HloInstruction*> operands, int64 dimension);
 
+// Creates a Dot HLO instruction and adds it to the computation containing `lhs`
+// and `rhs` (both must be in the same computation).
+StatusOr<HloInstruction*> MakeDotHlo(HloInstruction* lhs, HloInstruction* rhs,
+                                     const DotDimensionNumbers& dim_numbers);
+
 // -----------------------------------------------------------------------------
 // Some other miscellaneous helpers to generate common HLO patterns.  All of
 // these add all the instructions they generate into the computation containing
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index 3b22c93733af293e4d73a2b1b3ac8822dec6d5f5..c17c26c5a435fe34dd1024d596004cf6b5fdce8c 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace xla {
@@ -88,6 +89,20 @@ bool CombineConstants(HloComputation* computation, bool is_layout_sensitive) {
   return changed;
 }
 
+// An instruction is considered to be equivalent to another only if they
+// share the exact same set of operands.
+int64 CseHash(const HloInstruction* instruction) {
+  int64 hash = std::hash<int64>()(static_cast<int64>(instruction->opcode()));
+  hash = tensorflow::Hash64Combine(
+      hash, instruction->opcode() == HloOpcode::kGetTupleElement
+                ? instruction->tuple_index()
+                : -1);
+  for (auto operand : instruction->operands()) {
+    hash = tensorflow::Hash64Combine(hash, operand->unique_id());
+  }
+  return hash;
+}
+
 }  // namespace
 
 StatusOr<bool> HloCSE::Run(HloModule* module) {
@@ -95,7 +110,14 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
   const std::function<bool(const HloInstruction*, const HloInstruction*)>
       eq_instructions = std::equal_to<const HloInstruction*>();
   const std::function<bool(const HloComputation*, const HloComputation*)>
-      eq_computations = std::equal_to<const HloComputation*>();
+      eq_computations = [](const HloComputation* lhs,
+                           const HloComputation* rhs) { return *lhs == *rhs; };
+
+  auto cse_equal = [&](const HloInstruction* lhs, const HloInstruction* rhs) {
+    return lhs->Identical(*rhs, eq_instructions, eq_computations,
+                          is_layout_sensitive_);
+  };
+
   for (auto* computation : module->computations()) {
     if (only_fusion_computations_ && !computation->IsFusionComputation()) {
       continue;
@@ -103,13 +125,17 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
 
     changed |= CombineConstants(computation, is_layout_sensitive_);
 
-    std::list<HloInstruction*> post_order =
-        computation->MakeInstructionPostOrder();
-    std::set<HloInstruction*> removed_instructions;
-    for (auto instruction : post_order) {
-      // If the instruction has already been removed by CSE skip over it.
-      if (removed_instructions.count(instruction) > 0 ||
-          instruction->operand_count() == 0) {
+    // HLO instructions are grouped into equivalency classes by using the
+    // cse_equal predicate defined above. This set holds a representative
+    // instruction for each class.
+    tensorflow::gtl::FlatSet<HloInstruction*, decltype(&CseHash),
+                             decltype(cse_equal)>
+        representatives(/*N=*/1024, &CseHash, cse_equal);
+
+    for (auto instruction : computation->MakeInstructionPostOrder()) {
+      // If the instruction has zero operands (constants, parameters, etc.) skip
+      // over it.
+      if (instruction->operand_count() == 0) {
         continue;
       }
 
@@ -118,31 +144,16 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
         continue;
       }
 
-      // An instruction is considered to be equivalent to another only if they
-      // share the exact same set of operands. So to find equivalent
-      // instructions, we just search among instructions which share operand(0)
-      // of this instruction.
-      const HloInstruction* operand = instruction->operand(0);
-
-      tensorflow::gtl::InlinedVector<HloInstruction*, 8>
-          equivalent_instructions;
-      for (HloInstruction* user : operand->users()) {
-        if (user != instruction && !user->HasSideEffect() &&
-            user->Identical(*instruction, eq_instructions, eq_computations,
-                            is_layout_sensitive_)) {
-          equivalent_instructions.push_back(user);
-        }
-      }
-
-      // Replace all equivalent instructions with this instruction.
-      for (HloInstruction* equivalent_instruction : equivalent_instructions) {
+      auto it = representatives.find(instruction);
+      if (it != representatives.end()) {
+        HloInstruction* equivalent_instruction = *it;
         TF_RETURN_IF_ERROR(
-            equivalent_instruction->ReplaceAllUsesWith(instruction));
-        TF_RETURN_IF_ERROR(
-            computation->RemoveInstruction(equivalent_instruction));
-        removed_instructions.insert(equivalent_instruction);
+            instruction->ReplaceAllUsesWith(equivalent_instruction));
+        TF_RETURN_IF_ERROR(computation->RemoveInstruction(instruction));
         changed = true;
+        continue;
       }
+      representatives.insert(instruction);
     }
   }
   return changed;
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index df8853f34f6a72c52d1cde7332ada3809d2f3d96..9735764b692238d6a320bcff51e43b98dcadabda 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -72,7 +73,7 @@ TEST_F(HloCseTest, CombineTwoConstants) {
 
   auto result = ExecuteAndTransfer(std::move(module), {});
   auto expected = Literal::CreateR0<float>(84.0);
-  LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(1e-4));
+  EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(1e-4)));
 }
 
 TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) {
@@ -104,7 +105,7 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) {
 
   auto result = ExecuteAndTransfer(std::move(module), {});
   auto expected = Literal::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}});
-  LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(1e-4));
+  EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(1e-4)));
 }
 
 TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) {
@@ -134,7 +135,7 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) {
 
   auto result = ExecuteAndTransfer(std::move(module), {});
   auto expected = Literal::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}});
-  LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(1e-4));
+  EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(1e-4)));
 }
 
 TEST_F(HloCseTest, ConstantsSameValueDifferentType) {
@@ -469,5 +470,36 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) {
   EXPECT_THAT(root, op::Add(op::Map(op::Constant()), op::Map(op::Constant())));
 }
 
+TEST_F(HloCseTest, CompareComputations) {
+  auto module = tools::Parse(R"(
+    HloModule m
+
+    add_computation {
+      add_lhs = f32[] parameter(0)
+      add_rhs = f32[] parameter(1)
+      ROOT add_root = f32[] add(add_lhs, add_rhs)
+    }
+
+    add_computation2 {
+      add_lhs2 = f32[] parameter(0)
+      add_rhs2 = f32[] parameter(1)
+      ROOT add_root2 = f32[] add(add_lhs2, add_rhs2)
+    }
+
+    ENTRY entry {
+      p = f32[10]{0} parameter(0)
+      c = f32[] constant(0)
+      r1 = f32[] reduce(p, c), dimensions={0}, to_apply=add_computation
+      r2 = f32[] reduce(p, c), dimensions={0}, to_apply=add_computation2
+      ROOT f2 = (f32[],f32[]) tuple(r1, r2)
+    })")
+                    .ValueOrDie();
+
+  HloCSE cse(/*is_layout_sensitive=*/false);
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_EQ(root->operand(0), root->operand(1));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 0c37a8d75f38dabaad886cc9d4adce8ab29ddf18..b06e6c9f3e62f375a9e48f8ef81efe7121bbef94 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -878,4 +878,128 @@ Status HloDataflowAnalysis::Verify() const {
   return Status::OK();
 }
 
+bool HloDataflowAnalysis::DoesNotUseOperandBuffer(
+    const HloInstruction* operand, const ShapeIndex& index,
+    const HloInstruction* user) const {
+  CHECK(user->IsUserOf(operand))
+      << "user: " << user->ToString() << " operand: " << operand->ToString();
+  if (user->opcode() == HloOpcode::kFusion &&
+      user->fusion_kind() == HloInstruction::FusionKind::kLoop) {
+    // Find fusion parameter associated with 'operand'.
+    HloInstruction* fusion_param =
+        user->fused_parameter(user->operand_index(operand));
+    // Iterate through all users of all uses of the fusion parameter value.
+    // Return false if any uses are detected, returns true otherwise.
+    const HloValue& value = GetValueDefinedAt(fusion_param, index);
+    return value.uses().empty();
+  } else {
+    // Return false if no value at 'operand' and 'index' is used at 'user'.
+    for (const HloValue* value : GetValueSet(operand, index).values()) {
+      for (const HloUse& use : value->uses()) {
+        if (use.instruction == user) {
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
+    HloInstruction* operand, const ShapeIndex& operand_index,
+    HloInstruction* user, const ShapeIndex& user_index) const {
+  CHECK(user->IsUserOf(operand))
+      << "user: " << user->ToString() << " operand: " << operand->ToString();
+  const Shape& operand_subshape =
+      ShapeUtil::GetSubshape(operand->shape(), operand_index);
+  const Shape& user_subshape =
+      ShapeUtil::GetSubshape(user->shape(), user_index);
+  // Check that operand and user emit the same shape and layout.
+  if (!ShapeUtil::Equal(operand_subshape, user_subshape)) {
+    return false;
+  }
+
+  if (user->opcode() == HloOpcode::kFusion) {
+    // Get the parameter associated with 'operand';
+    HloInstruction* fusion_param =
+        user->fused_parameter(user->operand_index(operand));
+
+    const HloValue& value = GetValueDefinedAt(fusion_param, operand_index);
+    if (value.uses().size() != 1) {
+      return false;
+    }
+    const HloUse& use = value.uses()[0];
+
+    if (user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
+        user->fused_expression_root()->opcode() ==
+            HloOpcode::kDynamicUpdateSlice) {
+      // Loop fusion with kDynamicUpdateSlice fused root.
+      //
+      // Returns true iff there is exactly one use of 'operand' at shape index
+      // 'operand_index', and this singleton use is the fused root at operand
+      // index 0.
+      return use.instruction == user->fused_expression_root() &&
+             use.operand_number == 0;
+    } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
+               user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
+      // Output fusion with kAdd fused root.
+
+      // Check if one operand of kAdd fused root is kDot or kConvolution.
+      auto* add = user->fused_expression_root();
+      auto add_operand_it =
+          std::find_if(add->operands().begin(), add->operands().end(),
+                       [&](HloInstruction* operand) {
+                         return operand->opcode() == HloOpcode::kConvolution ||
+                                operand->opcode() == HloOpcode::kDot;
+                       });
+      if (add_operand_it == add->operands().end()) {
+        return false;
+      }
+      auto* matched_add_operand = *add_operand_it;
+      // Calculate operand index of 'add' operand which was not matched above.
+      const int64 other_add_operand_index =
+          matched_add_operand == add->operand(0) ? 1 : 0;
+      // Returns true iff there is exactly one use of 'operand' at shape index
+      // 'operand_index', and this singleton use is the fused root (at operand
+      // index 'other_add_operand_index').
+      return use.instruction == user->fused_expression_root() &&
+             use.operand_number == other_add_operand_index;
+    }
+  }
+  if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
+      user->opcode() == HloOpcode::kWhile) {
+    // We eliminated other users in BufferLiveness::live_range_strictly_before,
+    // so here we just need to check that the use is at operand index 0.
+    std::vector<int64> operand_indices = user->OperandIndices(operand);
+    return operand_indices.size() == 1 && operand_indices[0] == 0;
+  }
+  if (user->opcode() == HloOpcode::kCall) {
+    // Get all uses of value defined by 'operand' at 'operand_index'.
+    const auto& uses = GetValueDefinedAt(operand, operand_index).uses();
+    // Return true iff:
+    // *) There exists two uses of 'operand'.
+    // *) One use is by 'user' (caller).
+    // *) One use is by root instruction of called computation (callee root).
+    //    (Note: we check the root of the called computation, because the
+    //     root result buffer is required to alias with the Call result buffer).
+    // *) The root instruction of the called computation is element-wise on
+    //    'operand'.
+    const bool found_caller_use =
+        std::find_if(uses.begin(), uses.end(), [user](const HloUse& use) {
+          return use.instruction == user;
+        }) != uses.end();
+    auto* callee_root = user->to_apply()->root_instruction();
+    const bool found_elementwise_callee_use =
+        std::find_if(
+            uses.begin(), uses.end(), [callee_root](const HloUse& use) {
+              return use.instruction == callee_root &&
+                     callee_root->IsElementwiseOnOperand(use.operand_number);
+            }) != uses.end();
+    return uses.size() == 2 && found_caller_use && found_elementwise_callee_use;
+  }
+  // Check if 'user' is element-wise.
+  return user->IsElementwise();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
index 7b8a74b096ff48733717e78ada5bb56a28caed72..9868746b6113881949e388cd2a4aa9f610b1fdb7 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
@@ -118,6 +118,23 @@ class HloDataflowAnalysis {
 
   string ToString() const;
 
+  // Returns true if 'user' cannot possibly use the buffer at 'index' in
+  // 'operand'. Returns false otherwise.
+  //
+  // REQUIRES: 'operand' is an operand of 'user'.
+  bool DoesNotUseOperandBuffer(const HloInstruction* operand,
+                               const ShapeIndex& index,
+                               const HloInstruction* user) const;
+
+  // Returns true if 'user' (at 'user_index') can share a buffer with its
+  // operand 'operand' (at 'operand_index'). Returns false otherwise.
+  //
+  // REQUIRES: 'operand' is an operand of 'user'.
+  bool CanShareOperandBufferWithUser(HloInstruction* operand,
+                                     const ShapeIndex& operand_index,
+                                     HloInstruction* user,
+                                     const ShapeIndex& user_index) const;
+
  protected:
   HloDataflowAnalysis(const HloModule& module, bool ssa_form,
                       bool bitcast_defines_value = false);
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 07f69b8e1339fed636e4eb54791941b85e09fd17..5798326dcbf65c3c34748afb02afab1dc7af9147 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -1873,5 +1873,346 @@ INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
                         HloDataflowAnalysisTest,
                         ::testing::Values(false, true));
 
+class HloDataflowAnalysisTestBase : public HloTestBase {
+ protected:
+  void BuildModule(std::unique_ptr<HloComputation> computation) {
+    module_ = CreateNewModule();
+    computation_ = module_->AddEntryComputation(std::move(computation));
+  }
+
+  void RunAnalysis() {
+    CHECK_NOTNULL(module_.get());
+    dataflow_analysis_ = HloDataflowAnalysis::Run(*module_).ConsumeValueOrDie();
+  }
+
+  void BuildModuleAndRunAnalysis(std::unique_ptr<HloComputation> computation) {
+    BuildModule(std::move(computation));
+    RunAnalysis();
+  }
+
+  std::unique_ptr<HloModule> module_;
+  HloComputation* computation_ = nullptr;
+  std::unique_ptr<HloDataflowAnalysis> dataflow_analysis_;
+};
+
+class DoesNotUseOperandBufferTest : public HloDataflowAnalysisTestBase {};
+
+TEST_F(DoesNotUseOperandBufferTest, GetTupleElement) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape elem_shape = ShapeUtil::MakeShape(F32, {8});
+  auto tuple = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({elem_shape, elem_shape}), "tuple"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(elem_shape, tuple, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(elem_shape, tuple, 1));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(elem_shape, HloOpcode::kAdd, gte0, gte1));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  // GetTupleElement instructions only access the top-level buffer of their
+  // operand.
+  EXPECT_TRUE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {0}, gte0));
+  EXPECT_TRUE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {1}, gte1));
+  EXPECT_FALSE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {}, gte0));
+  EXPECT_FALSE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {}, gte1));
+}
+
+TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+  auto tuple = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 1));
+
+  // Create a DynamicUpdateSlice instruction of tuple element 1.
+  auto starts = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
+  auto update = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR1<float>({2.f, 2.f, 2.f})));
+  auto dynamic_update_slice =
+      builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+          data_shape, gte1, update, starts));
+  builder.AddInstruction(
+      HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {dynamic_update_slice, starts, update, gte1},
+      HloInstruction::FusionKind::kLoop);
+  RunAnalysis();
+
+  // The fusion instruction never uses tuple element 0, but does use element 1.
+  EXPECT_TRUE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {0}, fusion));
+  EXPECT_FALSE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {1}, fusion));
+}
+
+class CanShareOperandBufferWithUserTest : public HloDataflowAnalysisTestBase {};
+
+TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape shape = ShapeUtil::MakeShape(F32, {8});
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kExp, param));
+  auto log = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kLog, exp));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  EXPECT_TRUE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, exp, {}));
+  EXPECT_TRUE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(exp, {}, log, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape in_shape = ShapeUtil::MakeShape(F32, {8});
+  Shape out_shape = ShapeUtil::MakeShape(PRED, {8});
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, in_shape, "param0"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, in_shape, "param1"));
+  auto result = builder.AddInstruction(
+      HloInstruction::CreateBinary(out_shape, HloOpcode::kEq, param0, param1));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(param0, {},
+                                                                 result, {}));
+  EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(param1, {},
+                                                                 result, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, CopyShares) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape shape = ShapeUtil::MakeShape(F32, {8});
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kExp, param));
+  auto copy = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kCopy, exp));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  EXPECT_TRUE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, exp, {}));
+  EXPECT_TRUE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(exp, {}, copy, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+  auto tuple = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 1));
+
+  // Create a DynamicUpdateSlice instruction of tuple element 1.
+  auto starts = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
+  auto update = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR1<float>({2.f, 2.f, 2.f})));
+  auto dynamic_update_slice =
+      builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+          data_shape, gte1, update, starts));
+  builder.AddInstruction(
+      HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {dynamic_update_slice, starts, update, gte1},
+      HloInstruction::FusionKind::kLoop);
+  RunAnalysis();
+
+  // The fusion instruction can share with tuple element 1.
+  EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(tuple, {0},
+                                                                 fusion, {}));
+  EXPECT_TRUE(dataflow_analysis_->CanShareOperandBufferWithUser(tuple, {1},
+                                                                fusion, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+  Shape update_shape = ShapeUtil::MakeShape(F32, {4});
+  Shape starts_shape = ShapeUtil::MakeShape(S32, {1});
+  auto data = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape, "data"));
+  auto update = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, update_shape, "update"));
+  auto starts = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, starts_shape, "starts"));
+  auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+      data_shape, data, update, starts));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  // The DynamicUpdateSlice instruction can share with the data operand, but not
+  // with update or starts.
+  EXPECT_TRUE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(data, {}, dus, {}));
+  EXPECT_FALSE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(update, {}, dus, {}));
+  EXPECT_FALSE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto a = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
+  auto b = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateDot(data_shape, a, b, dot_dnums));
+
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto add_operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      data_shape, HloOpcode::kAdd, dot, add_operand));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {add, dot}, HloInstruction::FusionKind::kOutput);
+  RunAnalysis();
+
+  // Output fused dot add should be able to share buffer with 'add_operand'.
+  EXPECT_TRUE(dataflow_analysis_->CanShareOperandBufferWithUser(add_operand, {},
+                                                                fusion, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+
+  auto reverse = builder.AddInstruction(
+      HloInstruction::CreateReverse(data_shape, operand, {0, 1}));
+
+  auto two = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {add, two, reverse}, HloInstruction::FusionKind::kOutput);
+  RunAnalysis();
+
+  // Output fused operand->reverse->add cannot alias operand buffer 'operand'.
+  EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(operand, {},
+                                                                 fusion, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+
+  auto make_cond = [this, &data_shape]() {
+    auto builder = HloComputation::Builder(TestName() + ".Cond");
+    auto data = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, data_shape, "data"));
+    builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq, data, data));
+    return builder.Build();
+  };
+
+  auto make_body = [this, &data_shape]() {
+    auto builder = HloComputation::Builder(TestName() + ".Body");
+    auto data = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, data_shape, "data"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, data, data));
+    return builder.Build();
+  };
+
+  module_ = CreateNewModule();
+  HloComputation* cond_computation =
+      module_->AddEmbeddedComputation(make_cond());
+  HloComputation* body_computation =
+      module_->AddEmbeddedComputation(make_body());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto data = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape, "data"));
+  auto whil = builder.AddInstruction(HloInstruction::CreateWhile(
+      data_shape, cond_computation, body_computation, data));
+  computation_ = module_->AddEntryComputation(builder.Build());
+
+  RunAnalysis();
+
+  // The While instruction can share with the data operand.
+  EXPECT_TRUE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(data, {}, whil, {}));
+}
+
+// Tests that Call can alias operand buffer if the only use of the operand
+// in the called computation is an elementwise instruction.
+TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) {
+  Shape shape = ShapeUtil::MakeShape(F32, {8});
+  // Build sub-computation with fusion root.
+  auto sub_builder = HloComputation::Builder(TestName() + "_sub");
+  auto sub_param = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "sub_param"));
+  auto one = sub_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto ones = sub_builder.AddInstruction(
+      HloInstruction::CreateBroadcast(shape, one, {1}));
+  auto add = sub_builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones));
+
+  module_ = CreateNewModule();
+  auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build());
+  sub_computation->CreateFusionInstruction({add, ones},
+                                           HloInstruction::FusionKind::kLoop);
+
+  // Build entry-computation with kCall which calls 'sub_computation'.
+  auto builder = HloComputation::Builder(TestName());
+
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto reverse =
+      builder.AddInstruction(HloInstruction::CreateReverse(shape, param, {0}));
+  auto call = builder.AddInstruction(
+      HloInstruction::CreateCall(shape, {reverse}, sub_computation));
+  computation_ = module_->AddEntryComputation(builder.Build());
+
+  RunAnalysis();
+
+  EXPECT_TRUE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(reverse, {}, call, {}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 1071f5b184bd77cb9c57fa3fc28d4005711369b5..fa59a5fb2030b22aa9e6a59abbfba521d19adb51 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
@@ -42,7 +43,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -52,25 +52,11 @@ namespace xla {
 namespace {
 
 using tensorflow::gtl::ArraySlice;
-using tensorflow::gtl::FlatSet;
-using tensorflow::gtl::optional;
-
-template <typename T>
-struct is_complex_t : public std::false_type {};
-
-template <>
-struct is_complex_t<complex64> : public std::true_type {};
-
-template <typename T>
-struct is_complex64_t : public std::false_type {};
-
-template <>
-struct is_complex64_t<complex64> : public std::true_type {};
 
 template <typename OperandT>
 StatusOr<std::unique_ptr<Literal>> Compare(const Shape& shape, HloOpcode opcode,
-                                           const Literal& lhs_literal,
-                                           const Literal& rhs_literal) {
+                                           LiteralSlice lhs_literal,
+                                           LiteralSlice rhs_literal) {
   std::function<bool(OperandT, OperandT)> compare_op;
   switch (opcode) {
     case HloOpcode::kEq:
@@ -108,7 +94,7 @@ StatusOr<std::unique_ptr<Literal>> Compare(const Shape& shape, HloOpcode opcode,
                  << HloOpcodeString(opcode);
   }
 
-  auto result = Literal::CreateFromShape(shape);
+  auto result = MakeUnique<Literal>(shape);
   TF_RETURN_IF_ERROR(result->Populate<bool>([&](ArraySlice<int64> multi_index) {
     return compare_op(lhs_literal.Get<OperandT>(multi_index),
                       rhs_literal.Get<OperandT>(multi_index));
@@ -119,8 +105,8 @@ StatusOr<std::unique_ptr<Literal>> Compare(const Shape& shape, HloOpcode opcode,
 
 template <>
 StatusOr<std::unique_ptr<Literal>> Compare<complex64>(
-    const Shape& shape, HloOpcode opcode, const Literal& lhs_literal,
-    const Literal& rhs_literal) {
+    const Shape& shape, HloOpcode opcode, LiteralSlice lhs_literal,
+    LiteralSlice rhs_literal) {
   std::function<bool(complex64, complex64)> compare_op;
   switch (opcode) {
     case HloOpcode::kEq:
@@ -138,7 +124,7 @@ StatusOr<std::unique_ptr<Literal>> Compare<complex64>(
                  << HloOpcodeString(opcode);
   }
 
-  auto result = Literal::CreateFromShape(shape);
+  auto result = MakeUnique<Literal>(shape);
   TF_RETURN_IF_ERROR(result->Populate<bool>([&](ArraySlice<int64> multi_index) {
     return compare_op(lhs_literal.Get<complex64>(multi_index),
                       rhs_literal.Get<complex64>(multi_index));
@@ -147,2092 +133,48 @@ StatusOr<std::unique_ptr<Literal>> Compare<complex64>(
   return std::move(result);
 }
 
-template <typename ReturnT, typename NativeT>
-StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOpImpl(
-    HloInstruction* instruction,
-    const std::function<ReturnT(NativeT)>& unary_op,
-    const Literal& operand_literal) {
-  const auto shape = instruction->shape();
-  const auto* operand = instruction->operand(0);
-
-  // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
-  // removed.
-  if (!ShapeUtil::SameDimensions(shape, operand->shape())) {
-    return Unimplemented(
-        "Implicit broadcasting is currently unsupported in HLO evaluator "
-        "Shape Mismatch: %s vs %s",
-        ShapeUtil::HumanString(shape).c_str(),
-        ShapeUtil::HumanString(operand->shape()).c_str());
-  }
-
-  auto result = Literal::CreateFromShape(shape);
-
-  TF_RETURN_IF_ERROR(
-      result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
-        return unary_op(operand_literal.Get<NativeT>(multi_index));
-      }));
-  return std::move(result);
-}
-
-// For one particular placement of a window in a base shape (the placement is
-// represented as `window_count_index`), iterates inside the window. Translates
-// the window index into base index. If the base index is within bound, call `f`
-// with the base index.
-void IterateThroughWindow(
-    const Shape& window_shape, const Window& window, const Shape& base_shape,
-    const ArraySlice<int64>& window_count_index,
-    const std::function<void(const std::vector<int64>&)>& f) {
-  const int64 rank = ShapeUtil::Rank(base_shape);
-  DimensionVector window_index(rank);
-  std::fill(window_index.begin(), window_index.end(), 0);
-  do {
-    std::vector<int64> base_index(rank);
-    bool out_of_bound = false;
-    for (int64 i = 0; i < rank; ++i) {
-      base_index[i] = window_count_index[i] * window.dimensions(i).stride() +
-                      window_index[i] - window.dimensions(i).padding_low();
-      if (base_index[i] < 0 || base_index[i] >= base_shape.dimensions(i)) {
-        out_of_bound = true;
-        break;
-      }
-    }
-    if (!out_of_bound) {
-      f(base_index);
-    }
-  } while (IndexUtil::BumpIndices(window_shape, &window_index));
-}
-
-// Creates a vector of multipliers which can be used to create a linear index
-// into shape.
-//
-// Given the multidimensional index {i1, ..., iN} and
-// M = MakeDimMultipliers(shape), the corresponding linear index LI is simply
-//
-//   LI = i1 * M[1] + i2 * M[2] + ... + iN * M[N].
-//
-// This lets you calculate LI given the multidimensional indices in any order.
-DimensionVector MakeDimMultipliers(const Shape& shape) {
-  DimensionVector v(ShapeUtil::Rank(shape));
-  int64 scale = 1;
-  for (auto dim : LayoutUtil::MinorToMajor(shape)) {
-    v[dim] = scale;
-    scale *= shape.dimensions(dim);
-  }
-  return v;
-}
-
 }  // namespace
 
-template <typename ReturnT, typename ElementwiseT>
-class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
- public:
-  explicit TypedVisitor(HloEvaluator* p) : parent_(p) {}
-
-  // The following higher-order functions convert a function with ElementwiseT
-  // to a function with ReturnT.
-  std::function<ReturnT(ReturnT)> ConvertUnaryFunction(
-      const std::function<ElementwiseT(ElementwiseT)>& unary_op) {
-    return [&unary_op](ReturnT arg) {
-      return static_cast<ReturnT>(unary_op(static_cast<ElementwiseT>(arg)));
-    };
-  }
-  std::function<ReturnT(ReturnT, ReturnT)> ConvertBinaryFunction(
-      const std::function<ElementwiseT(ElementwiseT, ElementwiseT)>&
-          binary_op) {
-    return [&binary_op](ReturnT arg1, ReturnT arg2) {
-      return static_cast<ReturnT>(binary_op(static_cast<ElementwiseT>(arg1),
-                                            static_cast<ElementwiseT>(arg2)));
-    };
-  }
-  std::function<ReturnT(ReturnT, ReturnT, ReturnT)> ConvertTernaryFunction(
-      const std::function<ElementwiseT(ElementwiseT, ElementwiseT,
-                                       ElementwiseT)>& ternary_op) {
-    return [&ternary_op](ReturnT arg1, ReturnT arg2, ReturnT arg3) {
-      return static_cast<ReturnT>(ternary_op(static_cast<ElementwiseT>(arg1),
-                                             static_cast<ElementwiseT>(arg2),
-                                             static_cast<ElementwiseT>(arg3)));
-    };
-  }
-
-  Status DefaultAction(HloInstruction* hlo_instruction) override {
-    return Unimplemented("unhandled HLO ops for HloEvaluator: %s.",
-                         HloOpcodeString(hlo_instruction->opcode()).c_str());
-  }
-
-  // TODO(b/35950897): many of the stl functions used in the handlers are not
-  // overloaded for every XLA primitive types.
-
-  template <typename NativeT,
-            typename std::enable_if<std::is_unsigned<NativeT>::value>::type* =
-                nullptr>
-  Status HandleAbs(HloInstruction* abs) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
-                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
-                          return elem_operand;
-                        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<std::is_signed<NativeT>::value>::type* = nullptr>
-  Status HandleAbs(HloInstruction* abs) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
-                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
-                          return std::abs(elem_operand);
-                        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex64_t<NativeT>::value>::type* = nullptr>
-  Status HandleAbs(HloInstruction* abs) {
-    const Literal& operand_literal =
-        parent_->GetEvaluatedLiteralFor(abs->operand(0));
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[abs],
-        (ElementWiseUnaryOpImpl<float, NativeT>(
-            abs, [](NativeT elem_operand) { return std::abs(elem_operand); },
-            operand_literal)));
-
-    return Status::OK();
-  }
-
-  Status HandleAbs(HloInstruction* abs) override {
-    // If the operand is of C64 type, the return type of abs will be F32.
-    // However, ElementwiseT would still be the return type, F32, and thus
-    // specifying the ElementwiseT explicitly as C64 is needed below.
-    if (abs->operand(0)->shape().element_type() == C64) {
-      return HandleAbs<complex64>(abs);
-    }
-    return HandleAbs<ElementwiseT>(abs);
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleRound(HloInstruction* round) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[round],
-        ElementWiseUnaryOp(round, [](ElementwiseT elem_operand) {
-          return std::round(elem_operand);
-        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleRound(HloInstruction* round) {
-    return InvalidArgument("Unsupported type for Round");
-  }
-
-  Status HandleRound(HloInstruction* round) override {
-    return HandleRound<ReturnT>(round);
-  }
-
-  Status HandleBroadcast(HloInstruction* broadcast) override {
-    parent_->evaluated_[broadcast] =
-        Literal::CreateFromShape(broadcast->shape());
-    auto output = parent_->evaluated_[broadcast].get();
-    const Literal& operand_to_broadcast =
-        parent_->GetEvaluatedLiteralFor(broadcast->operand(0));
-    std::vector<int64> broadcast_indices(
-        ShapeUtil::Rank(broadcast->operand(0)->shape()), 0);
-
-    TF_RET_CHECK(broadcast->dimensions().size() ==
-                 ShapeUtil::Rank(operand_to_broadcast.shape()))
-        << "broadcast dimensions is of size: " << broadcast->dimensions().size()
-        << " and rank of operand_to_broadcast is: "
-        << ShapeUtil::Rank(operand_to_broadcast.shape());
-    // Checks that operand's dimensions are the same as the broadcast's
-    // dimensions along the dimensions to be broadcasted.
-    for (int64 i = 0; i < broadcast->dimensions().size(); ++i) {
-      TF_RET_CHECK(broadcast->shape().dimensions(broadcast->dimensions(i)) ==
-                   operand_to_broadcast.shape().dimensions(i));
-    }
-
-    return output->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
-      for (int64 i = 0; i < broadcast->dimensions().size(); ++i) {
-        broadcast_indices[i] = multi_index[broadcast->dimensions(i)];
-      }
-      return operand_to_broadcast.Get<ReturnT>(broadcast_indices);
-    });
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleCeil(HloInstruction* ceil) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[ceil],
-                        ElementWiseUnaryOp(ceil, [](ElementwiseT elem_operand) {
-                          return std::ceil(elem_operand);
-                        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleCeil(HloInstruction* ceil) {
-    return InvalidArgument("Unsupported type for Ceil");
-  }
-
-  Status HandleCeil(HloInstruction* ceil) override {
-    return HandleCeil<ReturnT>(ceil);
-  }
-
-  Status HandleConvert(HloInstruction* convert) override {
-    const HloInstruction* operand = convert->operand(0);
-    TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape()));
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> result,
-                        parent_->GetEvaluatedLiteralFor(operand).Convert(
-                            convert->shape().element_type()));
-
-    if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) {
-      parent_->evaluated_[convert] = std::move(result);
-    } else {
-      parent_->evaluated_[convert] =
-          result->Relayout(convert->shape().layout());
-    }
-    return Status::OK();
-  }
-
-  Status HandleBitcastConvert(HloInstruction* convert) override {
-    const HloInstruction* operand = convert->operand(0);
-    TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape()));
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> result,
-                        parent_->GetEvaluatedLiteralFor(operand).BitcastConvert(
-                            convert->shape().element_type()));
-
-    if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) {
-      parent_->evaluated_[convert] = std::move(result);
-    } else {
-      parent_->evaluated_[convert] =
-          result->Relayout(convert->shape().layout());
-    }
-    return Status::OK();
-  }
-
-  Status HandleExp(HloInstruction* exp) override {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp],
-                        ElementWiseUnaryOp(exp, [](ElementwiseT elem_operand) {
-                          return std::exp(elem_operand);
-                        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleFloor(HloInstruction* floor) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[floor],
-        ElementWiseUnaryOp(floor, [](ElementwiseT elem_operand) {
-          return std::floor(elem_operand);
-        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleFloor(HloInstruction* floor) {
-    return InvalidArgument("Unsupported type for Floor");
-  }
-
-  Status HandleFloor(HloInstruction* floor) override {
-    return HandleFloor<ReturnT>(floor);
-  }
-
-  Status HandleLog(HloInstruction* log) override {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[log],
-                        ElementWiseUnaryOp(log, [](ElementwiseT elem_operand) {
-                          return std::log(elem_operand);
-                        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_integral<NativeT>::value &&
-                !std::is_same<NativeT, bool>::value>::type* = nullptr>
-  Status HandleNot(HloInstruction* not_) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
-                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
-                          return ~elem_operand;
-                        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT, typename std::enable_if<std::is_floating_point<
-                                  NativeT>::value>::type* = nullptr>
-  Status HandleNot(HloInstruction* not_) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
-                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
-                          return !elem_operand;
-                        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<std::is_same<NativeT, bool>::value>::type* =
-                nullptr>
-  Status HandleNot(HloInstruction* not_) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
-                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
-                          return !elem_operand;
-                        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleNot(HloInstruction* not_) {
-    return InvalidArgument("Unsupported type for Not");
-  }
-
-  Status HandleNot(HloInstruction* not_) override {
-    return HandleNot<ElementwiseT>(not_);
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_signed<NativeT>::value &&
-                !std::is_floating_point<NativeT>::value>::type* = nullptr>
-  Status HandleNegate(HloInstruction* negate) {
-    using type = typename std::make_unsigned<NativeT>::type;
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[negate],
-        ElementWiseUnaryOp(negate, [](ElementwiseT elem_operand) {
-          return NativeT(-type(elem_operand));
-        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<
-                !std::is_signed<NativeT>::value ||
-                std::is_floating_point<NativeT>::value>::type* = nullptr>
-  Status HandleNegate(HloInstruction* negate) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[negate],
-        ElementWiseUnaryOp(
-            negate, [](ElementwiseT elem_operand) { return -elem_operand; }));
-    return Status::OK();
-  }
-
-  Status HandleNegate(HloInstruction* negate) override {
-    return HandleNegate<ReturnT>(negate);
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleSign(HloInstruction* sign) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
-                        ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
-                          return (ElementwiseT(0) < elem_operand) -
-                                 (elem_operand < ElementwiseT(0));
-                        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleSign(HloInstruction* sign) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
-                        ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
-                          auto abs_val = std::abs(elem_operand);
-                          return 0 == abs_val ? ElementwiseT(0)
-                                              : elem_operand / abs_val;
-                        }));
-    return Status::OK();
-  }
-
-  Status HandleSign(HloInstruction* sign) override {
-    return HandleSign<ReturnT>(sign);
-  }
-
-  template <typename NativeT, typename std::enable_if<std::is_floating_point<
-                                  NativeT>::value>::type* = nullptr>
-  Status HandleAtan2(HloInstruction* atan2) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[atan2],
-                        ElementWiseBinaryOp(atan2, [](ElementwiseT lhs_elem,
-                                                      ElementwiseT rhs_elem) {
-                          return std::atan2(lhs_elem, rhs_elem);
-                        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT, typename std::enable_if<!std::is_floating_point<
-                                  NativeT>::value>::type* = nullptr>
-  Status HandleAtan2(HloInstruction* atan2) {
-    return InvalidArgument("Unsupported type for Atan2");
-  }
-
-  Status HandleAtan2(HloInstruction* atan2) override {
-    return HandleAtan2<ElementwiseT>(atan2);
-  }
-
-  Status HandleTanh(HloInstruction* tanh) override {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[tanh],
-                        ElementWiseUnaryOp(tanh, [](ElementwiseT elem_operand) {
-                          return std::tanh(elem_operand);
-                        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_signed<NativeT>::value &&
-                !std::is_floating_point<NativeT>::value>::type* = nullptr>
-  Status HandleMultiply(HloInstruction* multiply) {
-    using type = typename std::make_unsigned<NativeT>::type;
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[multiply],
-        ElementWiseBinaryOp(multiply,
-                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
-                              return NativeT(type(lhs_elem) * type(rhs_elem));
-                            }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<std::is_unsigned<NativeT>::value ||
-                              std::is_floating_point<NativeT>::value ||
-                              is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleMultiply(HloInstruction* multiply) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[multiply],
-        ElementWiseBinaryOp(multiply,
-                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
-                              return lhs_elem * rhs_elem;
-                            }));
-    return Status::OK();
-  }
-
-  Status HandleMultiply(HloInstruction* multiply) override {
-    return HandleMultiply<ElementwiseT>(multiply);
-  }
-
-  Status HandleSubtract(HloInstruction* subtract) override {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[subtract],
-        ElementWiseBinaryOp(subtract,
-                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
-                              return lhs_elem - rhs_elem;
-                            }));
-    return Status::OK();
-  }
-
-  Status HandleAdd(HloInstruction* add) override {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[add],
-                        ElementWiseBinaryOp(add, [](ElementwiseT lhs_elem,
-                                                    ElementwiseT rhs_elem) {
-                          return lhs_elem + rhs_elem;
-                        }));
-    return Status::OK();
-  }
-
-  Status HandleDivide(HloInstruction* divide) override {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[divide],
-                        ElementWiseBinaryOp(divide, [](ElementwiseT lhs_elem,
-                                                       ElementwiseT rhs_elem) {
-                          return lhs_elem / rhs_elem;
-                        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
-                nullptr>
-  Status HandleMaximum(HloInstruction* maximum) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[maximum],
-        ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) {
-          return std::max(lhs, rhs);
-        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT, typename std::enable_if<std::is_floating_point<
-                                  NativeT>::value>::type* = nullptr>
-  Status HandleMaximum(HloInstruction* maximum) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[maximum],
-        ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) {
-          return ((lhs >= rhs) || std::isnan(lhs)) ? lhs : rhs;
-        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleMaximum(HloInstruction* maximum) {
-    return InvalidArgument("Unsupported type for Maximum");
-  }
-
-  Status HandleMaximum(HloInstruction* maximum) override {
-    return HandleMaximum<ElementwiseT>(maximum);
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
-                nullptr>
-  Status HandleMinimum(HloInstruction* minimum) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[minimum],
-                        ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el,
-                                                        ElementwiseT rhs_el) {
-                          return std::min(lhs_el, rhs_el);
-                        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT, typename std::enable_if<std::is_floating_point<
-                                  NativeT>::value>::type* = nullptr>
-  Status HandleMinimum(HloInstruction* minimum) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[minimum],
-        ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el,
-                                        ElementwiseT rhs_el) {
-          return ((lhs_el <= rhs_el) || std::isnan(lhs_el)) ? lhs_el : rhs_el;
-        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleMinimum(HloInstruction* minimum) {
-    return InvalidArgument("Unsupported type for Minimum");
-  }
-
-  Status HandleMinimum(HloInstruction* minimum) override {
-    return HandleMinimum<ElementwiseT>(minimum);
-  }
-
-  Status HandlePower(HloInstruction* power) override {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[power],
-                        ElementWiseBinaryOp(power, [](ElementwiseT lhs_el,
-                                                      ElementwiseT rhs_el) {
-                          return std::pow(lhs_el, rhs_el);
-                        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleRemainder(HloInstruction* remainder) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[remainder],
-                        ElementWiseBinaryOp(remainder, [](ElementwiseT lhs_el,
-                                                          ElementwiseT rhs_el) {
-                          return std::fmod(lhs_el, rhs_el);
-                        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleRemainder(HloInstruction* remainder) {
-    return InvalidArgument("Unsupported type for Remainder");
-  }
-
-  Status HandleRemainder(HloInstruction* remainder) override {
-    return HandleRemainder<ElementwiseT>(remainder);
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
-                nullptr>
-  Status HandleAnd(HloInstruction* and_) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[and_],
-        ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
-          return lhs_el & rhs_el;
-        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT, typename std::enable_if<std::is_floating_point<
-                                  NativeT>::value>::type* = nullptr>
-  Status HandleAnd(HloInstruction* and_) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[and_],
-        ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
-          return lhs_el && rhs_el;
-        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleAnd(HloInstruction* and_) {
-    return InvalidArgument("Unsupported type for And");
-  }
-
-  Status HandleAnd(HloInstruction* and_) override {
-    return HandleAnd<ElementwiseT>(and_);
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
-                nullptr>
-  Status HandleOr(HloInstruction* or_) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[or_],
-        ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
-          return lhs_el | rhs_el;
-        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT, typename std::enable_if<std::is_floating_point<
-                                  NativeT>::value>::type* = nullptr>
-  Status HandleOr(HloInstruction* or_) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[or_],
-        ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
-          return lhs_el || rhs_el;
-        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleOr(HloInstruction* or_) {
-    return InvalidArgument("Unsupported type for Or");
-  }
-
-  Status HandleOr(HloInstruction* or_) override {
-    return HandleOr<ElementwiseT>(or_);
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_integral<NativeT>::value &&
-                !std::is_same<NativeT, bool>::value>::type* = nullptr>
-  Status HandleShiftLeft(HloInstruction* shl) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[shl],
-        ElementWiseBinaryOp(shl, [](NativeT lhs_elem, NativeT rhs_elem) {
-          return IsShiftOutOfBounds<NativeT>(rhs_elem) ? 0
-                                                       : (lhs_elem << rhs_elem);
-        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<!std::is_integral<NativeT>::value ||
-                                    std::is_same<NativeT, bool>::value>::type* =
-                nullptr>
-  Status HandleShiftLeft(HloInstruction*) {
-    return InvalidArgument("Unsupported type for ShiftLeft");
-  }
-
-  Status HandleShiftLeft(HloInstruction* shl) override {
-    return HandleShiftLeft<ElementwiseT>(shl);
-  }
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_integral<NativeT>::value &&
-                !std::is_same<NativeT, bool>::value>::type* = nullptr>
-  Status HandleShiftRightArithmetic(HloInstruction* shr) {
-    typedef typename std::make_signed<NativeT>::type SignedT;
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[shr],
-        ElementWiseBinaryOp(shr, [](NativeT lhs_elem, NativeT rhs_elem) {
-          SignedT lhs_signed = static_cast<SignedT>(lhs_elem);
-          if (IsShiftOutOfBounds<NativeT>(rhs_elem)) {
-            return lhs_signed < 0 ? static_cast<SignedT>(-1) : 0;
-          } else {
-            return lhs_signed >> rhs_elem;
-          }
-        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<!std::is_integral<NativeT>::value ||
-                                    std::is_same<NativeT, bool>::value>::type* =
-                nullptr>
-  Status HandleShiftRightArithmetic(HloInstruction*) {
-    return InvalidArgument("Unsupported type for ShiftRightArithmetic");
-  }
-
-  Status HandleShiftRightArithmetic(HloInstruction* shra) override {
-    return HandleShiftRightArithmetic<ElementwiseT>(shra);
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_integral<NativeT>::value &&
-                !std::is_same<NativeT, bool>::value>::type* = nullptr>
-  Status HandleShiftRightLogical(HloInstruction* shr) {
-    typedef typename std::make_unsigned<NativeT>::type UnsignedT;
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[shr],
-        ElementWiseBinaryOp(shr, [](NativeT lhs_elem, NativeT rhs_elem) {
-          // If shift amount is greater than the number of bits, then return 0.
-          if (IsShiftOutOfBounds<NativeT>(rhs_elem)) {
-            return static_cast<NativeT>(0);
-          }
-          return static_cast<NativeT>(static_cast<UnsignedT>(lhs_elem) >>
-                                      rhs_elem);
-        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<!std::is_integral<NativeT>::value ||
-                                    std::is_same<NativeT, bool>::value>::type* =
-                nullptr>
-  Status HandleShiftRightLogical(HloInstruction*) {
-    return InvalidArgument("Unsupported type for ShiftRightLogical");
-  }
-
-  Status HandleShiftRightLogical(HloInstruction* shrl) override {
-    return HandleShiftRightLogical<ElementwiseT>(shrl);
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleClamp(HloInstruction* clamp) {
-    std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
-        clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
-          return std::fmin(high, std::fmax(value, low));
-        };
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[clamp],
-        ElementwiseTernaryOp(clamp,
-                             std::move(ConvertTernaryFunction(clamp_op))));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleClamp(HloInstruction*) {
-    return InvalidArgument("Unsupported type for Clamp");
-  }
-
-  Status HandleClamp(HloInstruction* clamp) override {
-    return HandleClamp<ElementwiseT>(clamp);
-  }
-
-  Status HandleSelect(HloInstruction* select) override {
-    CHECK(!ShapeUtil::IsScalar(select->operand(0)->shape()));
-    CHECK(!ShapeUtil::IsTuple(select->shape()));
-    std::function<ReturnT(bool, ReturnT, ReturnT)> select_op =
-        [](bool pred, ReturnT on_true, ReturnT on_false) {
-          if (pred) {
-            return on_true;
-          }
-          return on_false;
-        };
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[select],
-                        ElementwiseTernaryOp(select, std::move(select_op)));
-    return Status::OK();
-  }
-
-  Status HandleReverse(HloInstruction* reverse) override {
-    const auto result_shape = reverse->shape();
-    const auto reverse_dimensions = reverse->dimensions();
-
-    auto operand = reverse->operand(0);
-    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
-                        ShapeInference::InferReverseShape(operand->shape(),
-                                                          reverse_dimensions));
-
-    TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
-        << "return shape set to: " << ShapeUtil::HumanString(result_shape)
-        << " but is inferred to be: "
-        << ShapeUtil::HumanString(inferred_return_shape);
-
-    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
-    auto result = Literal::CreateFromShape(result_shape);
-
-    TF_RETURN_IF_ERROR(
-        result->Populate<ReturnT>([&](ArraySlice<int64> out_index) {
-          std::vector<int64> from_index(out_index.begin(), out_index.end());
-          for (const int64 dim : reverse_dimensions) {
-            from_index[dim] = result_shape.dimensions(dim) - 1 - out_index[dim];
-          }
-          return operand_literal.Get<ReturnT>(from_index);
-        }));
-
-    parent_->evaluated_[reverse] = std::move(result);
-    return Status::OK();
-  }
-
-  Status HandleConvolution(HloInstruction* conv) override {
-    auto lhs = conv->operand(0);
-    auto rhs = conv->operand(1);
-    const auto& window = conv->window();
-    const Shape& result_shape = conv->shape();
-    const Shape& lhs_shape = lhs->shape();
-    const Shape& rhs_shape = rhs->shape();
-
-    TF_CHECK_OK(ShapeUtil::ValidateShape(lhs_shape));
-    TF_CHECK_OK(ShapeUtil::ValidateShape(rhs_shape));
-    CHECK(ShapeUtil::IsArray(lhs_shape));
-    CHECK(ShapeUtil::IsArray(rhs_shape));
-    CHECK(ShapeUtil::SameElementType(lhs_shape, rhs_shape));
-    CHECK(ShapeUtil::SameElementType(lhs_shape, result_shape));
-
-    const auto& dnums = conv->convolution_dimension_numbers();
-    const int64 num_spatial_dims = dnums.output_spatial_dimensions_size();
-    CHECK_EQ(num_spatial_dims, dnums.input_spatial_dimensions_size());
-    CHECK_EQ(num_spatial_dims, dnums.kernel_spatial_dimensions_size());
-    CHECK_GE(num_spatial_dims, 0);
-    CHECK_EQ(window.dimensions_size(), num_spatial_dims);
-
-    const auto lhs_rank = ShapeUtil::Rank(lhs_shape);
-    const auto rhs_rank = ShapeUtil::Rank(rhs_shape);
-
-    CHECK_EQ(num_spatial_dims + 2, lhs_rank);
-    CHECK_EQ(num_spatial_dims + 2, rhs_rank);
-
-    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
-                        ShapeInference::InferConvolveShape(lhs_shape, rhs_shape,
-                                                           window, dnums));
-    CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
-        << "return shape set to: " << ShapeUtil::HumanString(result_shape)
-        << " but is inferred to be: "
-        << ShapeUtil::HumanString(inferred_return_shape);
-
-    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
-    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
-
-    std::vector<int64> window_dimension_sizes;
-    for (auto i : dnums.kernel_spatial_dimensions()) {
-      window_dimension_sizes.push_back(ShapeUtil::GetDimension(rhs_shape, i));
-    }
-
-    const Shape& window_shape =
-        ShapeUtil::MakeShape(rhs_shape.element_type(), window_dimension_sizes);
-
-    DimensionVector lhs_dim_multipliers = MakeDimMultipliers(lhs_shape);
-    DimensionVector rhs_dim_multipliers = MakeDimMultipliers(rhs_shape);
-
-    auto lhs_literal_data = lhs_literal.data<ReturnT>();
-    auto rhs_literal_data = rhs_literal.data<ReturnT>();
-
-    auto func = [&window_shape, &dnums, &lhs_shape, &rhs_shape, &window,
-                 &lhs_dim_multipliers, &rhs_dim_multipliers, lhs_literal_data,
-                 rhs_literal_data](ArraySlice<int64> out_index) {
-      // Dimension number applicable for input (lhs).
-      const int64 input_batch_dim = dnums.input_batch_dimension();
-      const int64 input_z_dim = dnums.input_feature_dimension();
-      // Dimension number applicable for kernel (rhs).
-      const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension();
-      const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension();
-      // Dimension number applicable for output.
-      const int64 output_batch_dim = dnums.output_batch_dimension();
-      const int64 output_z_dim = dnums.output_feature_dimension();
-
-      const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim);
-
-      ElementwiseT result_val = static_cast<ElementwiseT>(0);
-      DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(),
-                                        0);
-
-      // Convolve input feature with kernel.
-      do {
-        for (int64 iz = 0; iz < z_size; ++iz) {
-          int64 lhs_linear_index = 0;
-          lhs_linear_index += out_index[output_batch_dim] *
-                              lhs_dim_multipliers[input_batch_dim];
-          lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim];
-
-          int64 rhs_linear_index = 0;
-          rhs_linear_index += out_index[output_z_dim] *
-                              rhs_dim_multipliers[kernel_output_z_dim];
-          rhs_linear_index += iz * rhs_dim_multipliers[kernel_input_z_dim];
-
-          // Find corresponding spatial dimension index for input (lhs).
-          for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
-            // Spatial dimension number for input (lhs) and output.
-            const int64 input_spatial_dim = dnums.input_spatial_dimensions(ki);
-            const int64 output_spatial_dim =
-                dnums.output_spatial_dimensions(ki);
-
-            // Calculate lhs (input) index without taking base dilation into
-            // account.
-            const auto& window_dim = window.dimensions(ki);
-            const int64 undilated_index =
-                out_index[output_spatial_dim] * window_dim.stride() -
-                window_dim.padding_low() +
-                rhs_spatial_index[ki] * window_dim.window_dilation();
-            // Skip if the lhs (input) index is to be dilated.  As an
-            // optimization, skip this mod if there's no dilation.
-            if (window_dim.base_dilation() > 1 &&
-                undilated_index % window_dim.base_dilation() != 0) {
-              goto cnt;
-            }
-
-            // Calculate the actual lhs (input) index after dilation.  As an
-            // optimization, skip this integer divide if there's no dilation.
-            int64 lhs_spatial_index;
-            if (window_dim.base_dilation() > 1) {
-              lhs_spatial_index = undilated_index / window_dim.base_dilation();
-            } else {
-              lhs_spatial_index = undilated_index;
-            }
-            lhs_linear_index +=
-                lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim];
-
-            // Skip if input index is not in bounds.
-            if (!(lhs_spatial_index >= 0 &&
-                  lhs_spatial_index <
-                      lhs_shape.dimensions(input_spatial_dim))) {
-              goto cnt;
-            }
-
-            rhs_linear_index +=
-                (window_dim.window_reversal()
-                     ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
-                     : rhs_spatial_index[ki]) *
-                rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)];
-          }
-
-          result_val +=
-              static_cast<ElementwiseT>(lhs_literal_data[lhs_linear_index]) *
-              static_cast<ElementwiseT>(rhs_literal_data[rhs_linear_index]);
-        }
-      cnt : {}
-      } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index));
-
-      return static_cast<ReturnT>(result_val);
-    };
-
-    auto result = Literal::CreateFromShape(result_shape);
-    TF_RETURN_IF_ERROR(result->PopulateParallel<ReturnT>(func));
-
-    parent_->evaluated_[conv] = std::move(result);
-    return Status::OK();
-  }
-
-  Status HandleDot(HloInstruction* dot) override {
-    auto lhs = dot->operand(0);
-    auto rhs = dot->operand(1);
-    CHECK(ShapeUtil::IsArray(dot->shape()));
-    CHECK(ShapeUtil::IsArray(lhs->shape()));
-    CHECK(ShapeUtil::IsArray(rhs->shape()));
-
-    const auto& dnums = dot->dot_dimension_numbers();
-
-    const auto lhs_rank = ShapeUtil::Rank(lhs->shape());
-    const auto rhs_rank = ShapeUtil::Rank(rhs->shape());
-
-    CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape()));
-    CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape()));
-
-    // There must be 1 and only 1 Contracting dimension for lhs and rhs.
-    CHECK_EQ(dnums.lhs_contracting_dimensions_size(), 1);
-    CHECK_EQ(dnums.rhs_contracting_dimensions_size(), 1);
-    const int64 lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0);
-    const int64 rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0);
-    // Contracted dimension sizes must be the same.
-    CHECK_EQ(lhs->shape().dimensions(lhs_contracting_dimension),
-             rhs->shape().dimensions(rhs_contracting_dimension))
-        << "lhs contracted dimension: "
-        << lhs->shape().dimensions(lhs_contracting_dimension)
-        << " rhs contracted dimension: "
-        << rhs->shape().dimensions(rhs_contracting_dimension);
-    const int64 contracted_dimension_size =
-        lhs->shape().dimensions(lhs_contracting_dimension);
-
-    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
-    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
-
-    auto result = Literal::CreateFromShape(dot->shape());
-
-    CHECK_EQ(dnums.lhs_batch_dimensions_size(),
-             dnums.rhs_batch_dimensions_size());
-
-    std::vector<int64> lhs_non_contracting_dims;
-    for (int64 i = 0; i < lhs_rank; i++) {
-      if (i != lhs_contracting_dimension) {
-        lhs_non_contracting_dims.push_back(i);
-      }
-    }
-
-    std::vector<int64> rhs_non_batch_non_contracting_dims;
-    FlatSet<int64> batch_dims_set(dnums.rhs_batch_dimensions().begin(),
-                                  dnums.rhs_batch_dimensions().end());
-    for (int64 i = 0; i < rhs_rank; i++) {
-      if (i != rhs_contracting_dimension && batch_dims_set.count(i) == 0) {
-        rhs_non_batch_non_contracting_dims.push_back(i);
-      }
-    }
-
-    const int64 batch_dim_size = dnums.lhs_batch_dimensions_size();
-    const int64 lhs_non_contracting_size = lhs_non_contracting_dims.size();
-
-    DimensionVector lhs_index(lhs_rank);
-    DimensionVector rhs_index(rhs_rank);
-    TF_RETURN_IF_ERROR(
-        result->Populate<ReturnT>([&](ArraySlice<int64> result_index) {
-          ElementwiseT result_val = static_cast<ElementwiseT>(0);
-
-          // Find the corresponding non-contracting indices for lhs and rhs.
-          //
-          // For `result_index`, its batch dimension, if exists, will be at the
-          // same dimension as the batch dimension of lhs and rhs. More
-          // specifically:
-          // - For lhs, the non-contracting dimensions, including the batch
-          // dimension have the same index as the `result_index`.
-          // - For rhs, the batch dimension is set separately from other
-          // non-contracting dimensions, since these other non-contracting
-          // dimensions in rhs follow the non-contracting dimensions of lhs in
-          // the resulting index.
-          //
-          // As an example, for a resulting index:
-          //  result_index [result_batch, result_x, result_y]
-          // the effecting lhs and rhs indices are:
-          //  lhs [result_batch, lhs_non_contracting_dim, contracting_dim
-          //  rhs [result_batch, contracting_dim, rhs_non_contracting_dim]
-          // `result_x` is only affected by the lhs_non_contracting_dim and
-          // likewise `result_y` only depends on rhs_non_contracting_dim.
-          //
-          // so we can look up the lhs and rhs indices by:
-          //
-          // lhs:
-          //  batch index is the same as `result_batch`.
-          //    non-contracting dimension is the same as
-          //    result_index[lhs_non_contracting_dim]
-          // rhs:
-          //  batch index: the same as `result_batch`.
-          //  non-contracting dimension index: *not* the same as
-          //    result_index[rhs_non_contractng_dim], since the
-          //    non-contracting dimensions of lhs are included in the
-          //    result_index first. Instead, the non_contracting_dim of rhs must
-          //    be calculated as following:
-          //      lhs_non_contracting_dimensions_size +
-          //      (rhs_non_batch_non_contracting_dim - batch_dim_size) - 1
-          //
-          //    Note that (rhs_non_batch_contracting_dim - batch_dim_size) is
-          //    the index offset to the result_index that only depends on
-          //    the non_batch and non-contracting dimensions of rhs. -1 at the
-          //    end translates size to index.
-          for (auto i : lhs_non_contracting_dims) {
-            lhs_index[i] = result_index[i];
-          }
-          for (auto i : dnums.rhs_batch_dimensions()) {
-            rhs_index[i] = result_index[i];
-          }
-          for (auto i : rhs_non_batch_non_contracting_dims) {
-            const int64 rhs_non_batch_non_contracting_dim =
-                lhs_non_contracting_size + (i - batch_dim_size) - 1;
-            rhs_index[i] = result_index[rhs_non_batch_non_contracting_dim];
-          }
-
-          // Accumulates resulting product along the contracted dimension.
-          for (int64 i = 0; i < contracted_dimension_size; ++i) {
-            lhs_index[lhs_contracting_dimension] = i;
-            rhs_index[rhs_contracting_dimension] = i;
-
-            result_val +=
-                static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
-                static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
-          }
-
-          return static_cast<ReturnT>(result_val);
-        }));
-
-    parent_->evaluated_[dot] = std::move(result);
-    return Status::OK();
-  }
-
-  Status HandlePad(HloInstruction* pad) override {
-    CHECK(!ShapeUtil::IsTuple(pad->operand(0)->shape()));
-    // Padding value must be scalar.
-    CHECK(ShapeUtil::IsScalar(pad->operand(1)->shape()));
-    CHECK_EQ(ShapeUtil::Rank(pad->operand(0)->shape()),
-             pad->padding_config().dimensions_size());
-
-    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
-                        ShapeInference::InferPadShape(
-                            /*operand_shape=*/pad->operand(0)->shape(),
-                            /*padding_value_shape=*/pad->operand(1)->shape(),
-                            /*padding_config=*/pad->padding_config()));
-    CHECK(ShapeUtil::Compatible(pad->shape(), inferred_return_shape))
-        << "return shape is set to: " << ShapeUtil::HumanString(pad->shape())
-        << "but is inferred to be: "
-        << ShapeUtil::HumanString(inferred_return_shape);
-
-    // Create new HLO of padded shape with padding value.
-    ReturnT scalar =
-        parent_->GetEvaluatedLiteralFor(pad->operand(1)).Get<ReturnT>({});
-    auto result = Literal::CreateFromShape(pad->shape());
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&scalar](ArraySlice<int64> multi_index) { return scalar; }));
-
-    const Literal& evaluated_operand =
-        parent_->GetEvaluatedLiteralFor(pad->operand(0));
-
-    std::vector<int64> input_index(ShapeUtil::Rank(evaluated_operand.shape()),
-                                   0);
-    std::vector<int64> target_index(ShapeUtil::Rank(result->shape()), 0);
-
-    // Loop through each element of the operand, assign them to the
-    // corresponding index of the resulting padded literal.
-    const PaddingConfig& pad_config = pad->padding_config();
-
-    auto func = [&](ArraySlice<int64> input_index) {
-      for (auto i = 0; i < input_index.size(); ++i) {
-        // Interior padding occurs logically before edge padding, so in the case
-        // of negative edge padding elements are removed from the
-        // interior-padded operand.
-        target_index[i] =
-            pad_config.dimensions(i).edge_padding_low() +
-            input_index[i] * (pad_config.dimensions(i).interior_padding() + 1);
-
-        // Account for negative low and high padding: skip assignment if the
-        // any target index is out of range.
-        if (!(target_index[i] >= 0 &&
-              target_index[i] < pad->shape().dimensions(i))) {
-          return true;
-        }
-      }
-      result->Set<ReturnT>(target_index,
-                           evaluated_operand.Get<ReturnT>(input_index));
-      return true;
-    };
-
-    std::vector<int64> zero_base(evaluated_operand.shape().dimensions_size(),
-                                 0);
-    std::vector<int64> step(evaluated_operand.shape().dimensions_size(), 1);
-
-    ShapeUtil::ForEachIndex(
-        evaluated_operand.shape(), zero_base,
-        AsInt64Slice(evaluated_operand.shape().dimensions()), step, func);
-
-    parent_->evaluated_[pad] = std::move(result);
-    return Status::OK();
-  }
-
-  Status HandleDynamicSlice(HloInstruction* dynamic_slice) override {
-    auto operand = dynamic_slice->operand(0);
-    auto start_indices = dynamic_slice->operand(1);
-    auto result_shape = dynamic_slice->shape();
-    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
-                        ShapeInference::InferDynamicSliceShape(
-                            operand->shape(), start_indices->shape(),
-                            dynamic_slice->dynamic_slice_sizes()));
-    TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
-        << "return shape is set to: " << ShapeUtil::HumanString(result_shape)
-        << "but is inferred to be: "
-        << ShapeUtil::HumanString(inferred_return_shape);
-    TF_RET_CHECK(
-        primitive_util::IsIntegralType(start_indices->shape().element_type()));
-
-    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
-    const Literal& start_indices_literal =
-        parent_->GetEvaluatedLiteralFor(start_indices);
-
-    switch (start_indices->shape().element_type()) {
-      case S32: {
-        TF_ASSIGN_OR_RETURN(
-            parent_->evaluated_[dynamic_slice],
-            DynamicSlice<int32>(operand_literal, start_indices_literal,
-                                result_shape));
-      } break;
-      case S64: {
-        TF_ASSIGN_OR_RETURN(
-            parent_->evaluated_[dynamic_slice],
-            DynamicSlice<int64>(operand_literal, start_indices_literal,
-                                result_shape));
-      } break;
-      case U32: {
-        TF_ASSIGN_OR_RETURN(
-            parent_->evaluated_[dynamic_slice],
-            DynamicSlice<uint32>(operand_literal, start_indices_literal,
-                                 result_shape));
-      } break;
-      case U64: {
-        TF_ASSIGN_OR_RETURN(
-            parent_->evaluated_[dynamic_slice],
-            DynamicSlice<uint64>(operand_literal, start_indices_literal,
-                                 result_shape));
-      } break;
-      default:
-        LOG(FATAL) << "HandleDynamicSlice: unhandled primitive type for "
-                      "start_indices: "
-                   << PrimitiveType_Name(start_indices->shape().element_type());
-    }
-
-    return Status::OK();
-  }
-
-  Status HandleDynamicUpdateSlice(
-      HloInstruction* dynamic_update_slice) override {
-    auto operand = dynamic_update_slice->operand(0);
-    auto update = dynamic_update_slice->operand(1);
-    auto start_indices = dynamic_update_slice->operand(2);
-    auto result_shape = dynamic_update_slice->shape();
-    TF_ASSIGN_OR_RETURN(
-        auto inferred_return_shape,
-        ShapeInference::InferDynamicUpdateSliceShape(
-            operand->shape(), update->shape(), start_indices->shape()));
-    TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
-        << "return shape is set to: " << ShapeUtil::HumanString(result_shape)
-        << "but is inferred to be: "
-        << ShapeUtil::HumanString(inferred_return_shape);
-    TF_RET_CHECK(
-        primitive_util::IsIntegralType(start_indices->shape().element_type()));
-    TF_RET_CHECK(ShapeUtil::Compatible(result_shape, operand->shape()));
-
-    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
-    const Literal& update_literal = parent_->GetEvaluatedLiteralFor(update);
-    const Literal& start_indices_literal =
-        parent_->GetEvaluatedLiteralFor(start_indices);
-
-    switch (start_indices->shape().element_type()) {
-      case S32: {
-        TF_ASSIGN_OR_RETURN(
-            parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<int32>(operand_literal, update_literal,
-                                      start_indices_literal));
-      } break;
-      case S64: {
-        TF_ASSIGN_OR_RETURN(
-            parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<int64>(operand_literal, update_literal,
-                                      start_indices_literal));
-      } break;
-      case U32: {
-        TF_ASSIGN_OR_RETURN(
-            parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<uint32>(operand_literal, update_literal,
-                                       start_indices_literal));
-      } break;
-      case U64: {
-        TF_ASSIGN_OR_RETURN(
-            parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<uint64>(operand_literal, update_literal,
-                                       start_indices_literal));
-      } break;
-      default:
-        LOG(FATAL) << "HandleDynamicUpdateSlice: unhandled primitive type for "
-                      "start_indices: "
-                   << PrimitiveType_Name(start_indices->shape().element_type());
-    }
-
-    return Status::OK();
-  }
-
-  template <typename NativeT>
-  StatusOr<std::unique_ptr<Literal>> MapImpl(HloInstruction* map) {
-    auto operands = map->operands();
-    HloComputation* computation = map->to_apply();
-
-    auto result = Literal::CreateFromShape(map->shape());
-
-    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
-    TF_RETURN_IF_ERROR(
-        result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
-          std::vector<std::unique_ptr<Literal>> arg_literals;
-          arg_literals.reserve(operands.size());
-
-          // Construct scalar literal parameters to be passed to the map
-          // computation.
-          for (auto operand : operands) {
-            const Literal& arg_literal =
-                parent_->GetEvaluatedLiteralFor(operand);
-
-            auto curr_val = arg_literal.Get<NativeT>(multi_index);
-            auto curr_val_literal = Literal::CreateR0<NativeT>(curr_val);
-
-            arg_literals.push_back(std::move(curr_val_literal));
-          }
-
-          std::unique_ptr<Literal> computed_result =
-              embedded_evaluator
-                  .Evaluate<std::unique_ptr<Literal>>(*computation,
-                                                      arg_literals)
-                  .ConsumeValueOrDie();
-          // Clear visit states so that the we can use the evaluate again on
-          // the same computation.
-          embedded_evaluator.ResetVisitStates();
-
-          return computed_result->Get<ReturnT>({});
-        }));
-    return std::move(result);
-  }
-
-  Status HandleMap(HloInstruction* map) override {
-    switch (map->operand(0)->shape().element_type()) {
-      case PRED: {
-        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<bool>(map));
-        break;
-      }
-      case U8: {
-        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint8>(map));
-        break;
-      }
-      case U32: {
-        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint32>(map));
-        break;
-      }
-      case U64: {
-        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint64>(map));
-        break;
-      }
-      case S8: {
-        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int8>(map));
-        break;
-      }
-      case S32: {
-        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int32>(map));
-        break;
-      }
-      case S64: {
-        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int64>(map));
-        break;
-      }
-      case F16: {
-        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map],
-                            MapImpl<Eigen::half>(map));
-        break;
-      }
-      case F32: {
-        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<float>(map));
-        break;
-      }
-      case F64: {
-        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<double>(map));
-        break;
-      }
-      case C64: {
-        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<complex64>(map));
-        break;
-      }
-      default:
-        LOG(FATAL) << "HandleMap: unhandled primitive type for "
-                      "input operand: "
-                   << PrimitiveType_Name(
-                          map->operand(0)->shape().element_type());
-    }
-
-    return Status::OK();
-  }
-
-  Status HandleReduce(HloInstruction* reduce) override {
-    auto arg = reduce->operand(0);
-    auto init_value = reduce->operand(1);
-    ArraySlice<int64> dimensions(reduce->dimensions());
-    HloComputation* function = reduce->to_apply();
-    TF_RET_CHECK(ShapeUtil::Rank(reduce->shape()) ==
-                 ShapeUtil::Rank(arg->shape()) - dimensions.size());
-    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
-                        ShapeInference::InferReduceShape(
-                            /*arg=*/arg->shape(),
-                            /*init_value=*/init_value->shape(),
-                            /*dimensions_to_reduce=*/dimensions,
-                            /*to_apply=*/function->ComputeProgramShape()));
-    TF_RET_CHECK(ShapeUtil::Compatible(reduce->shape(), inferred_return_shape))
-        << "return shape is set to: " << ShapeUtil::HumanString(reduce->shape())
-        << "but is inferred to be: "
-        << ShapeUtil::HumanString(inferred_return_shape);
-
-    const Literal& arg_literal = parent_->GetEvaluatedLiteralFor(arg);
-    VLOG(3) << "HandleReduce arg_literal: " << arg_literal.ToString();
-    const Literal& init_literal = parent_->GetEvaluatedLiteralFor(init_value);
-    VLOG(3) << "HandleReduce init_literal: " << init_literal.ToString();
-    TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape()));
-    auto init_scalar = init_literal.Get<ReturnT>({});
-
-    auto result = Literal::CreateFromShape(reduce->shape());
-
-    const auto arg_dimensions = AsInt64Slice(arg_literal.shape().dimensions());
-    std::vector<int64> arg_dim_steps(arg_dimensions.size());
-    std::vector<int64> arg_dim_counts(arg_dimensions.size());
-    for (const int64 dim : dimensions) {
-      arg_dim_steps[dim] = 1;
-      arg_dim_counts[dim] = arg_dimensions[dim];
-    }
-
-    // Map each dimension in the result to a dimension in arg that isn't
-    // being reduced.
-    std::vector<int64> result_to_arg_index;
-    for (int64 i = 0; i < arg_dimensions.size(); ++i) {
-      if (arg_dim_steps[i] == 0) {
-        result_to_arg_index.push_back(i);
-      }
-    }
-
-    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
-    // For each resulting dimension, calculate and assign computed value.
-    TF_RETURN_IF_ERROR(
-        result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
-          ReturnT result_val = init_scalar;
-
-          std::vector<int64> base(arg_dimensions.size());
-          for (int64 i = 0; i < multi_index.size(); ++i) {
-            base[result_to_arg_index[i]] = multi_index[i];
-          }
-
-          // When the reduction is addition of floats, accumulate in a double
-          // for better precision. Also, avoid creating Literals for the
-          // intermediate results; it's much faster.
-          if (ShapeUtil::ElementIsFloating(init_literal.shape()) &&
-              IsScalarAdd(function)) {
-            double computed_result = 0;
-            auto func = [&](ArraySlice<int64> input_index) {
-              computed_result += arg_literal.Get<float>(input_index);
-              return true;
-            };
-            ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
-                                    arg_dim_steps, func);
-            return static_cast<ReturnT>(computed_result);
-          }
-          auto func = [&](ArraySlice<int64> input_index) {
-            auto curr_val = arg_literal.Get<ReturnT>(input_index);
-
-            // Evaluate computation with specified literal operands.
-            auto curr_val_literal = Literal::CreateR0<ReturnT>(curr_val);
-            auto result_val_literal = Literal::CreateR0<ReturnT>(result_val);
-            std::vector<const Literal*> args = {result_val_literal.get(),
-                                                curr_val_literal.get()};
-
-            std::unique_ptr<Literal> computed_result =
-                embedded_evaluator.Evaluate<const Literal*>(*function, args)
-                    .ConsumeValueOrDie();
-            // Clear visit states so that we can use the evaluator again on
-            // the same computation.
-            embedded_evaluator.ResetVisitStates();
-            // Assign computed result to result_val.
-            result_val = computed_result->Get<ReturnT>({});
-            return true;
-          };
-          // Computes one element of the result, reducing all dimensions that
-          // contribute to that element.
-          ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
-                                  arg_dim_steps, func);
-          return result_val;
-        }));
-
-    parent_->evaluated_[reduce] = std::move(result);
-    return Status::OK();
-  }
-
-  bool IsScalarAdd(HloComputation* computation) {
-    HloInstruction* instruction = computation->root_instruction();
-    if (instruction->opcode() == HloOpcode::kAdd &&
-        computation->num_parameters() == 2) {
-      const HloInstruction* lhs = instruction->operand(0);
-      const HloInstruction* rhs = instruction->operand(1);
-      return lhs->opcode() == HloOpcode::kParameter &&
-             ShapeUtil::IsScalar(lhs->shape()) &&
-             rhs->opcode() == HloOpcode::kParameter &&
-             ShapeUtil::IsScalar(rhs->shape()) && lhs != rhs;
-    }
-    return false;
-  }
-
-  Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override {
-    auto operand = select_and_scatter->operand(0);
-    auto source = select_and_scatter->operand(1);
-    const Window& window = select_and_scatter->window();
-
-    const Literal& init_literal =
-        parent_->GetEvaluatedLiteralFor(select_and_scatter->operand(2));
-    TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape()));
-    auto init_scalar = init_literal.Get<ReturnT>({});
-
-    auto result = Literal::CreateFromShape(select_and_scatter->shape());
-
-    // Initialize result array with the init value.
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](ArraySlice<int64> output_index) { return init_scalar; }));
-
-    std::vector<int64> window_dimension_sizes;
-    for (const auto& window_dimension : window.dimensions()) {
-      window_dimension_sizes.push_back(window_dimension.size());
-    }
-    const Shape window_shape = ShapeUtil::MakeShape(
-        operand->shape().element_type(), window_dimension_sizes);
-
-    HloComputation* select = select_and_scatter->select();
-    HloComputation* scatter = select_and_scatter->scatter();
-
-    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
-    const Literal& source_literal = parent_->GetEvaluatedLiteralFor(source);
-
-    int64 rank = ShapeUtil::Rank(operand_literal.shape());
-
-    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
-    DimensionVector source_index(rank);
-
-    std::fill(source_index.begin(), source_index.end(), 0);
-    do {
-      // For each element in `source`, we place a window in `operand`. For each
-      // window placement, we iterate inside the window twice:
-      //
-      // 1. Find the selected index by applying `select` function to all
-      // elements. E.g., If the `select` function is GreaterEqual, the first
-      // iteration through the window finds the biggest value and returns its
-      // index.
-      //
-      // 2. Using the selected index, scatter value from `source` to result. We
-      // do this by iterating through the window, and compare each index with
-      // the selected index.
-      optional<ReturnT> selected_val;
-      optional<std::vector<int64>> selected_index;
-
-      IterateThroughWindow(
-          window_shape, window, operand_literal.shape(), source_index,
-          [&](const std::vector<int64>& operand_index) {
-            auto curr_val = operand_literal.Get<ReturnT>(operand_index);
-            if (!selected_val) {
-              selected_val = curr_val;
-              selected_index = operand_index;
-            }
-            const auto curr_val_literal = Literal::CreateR0<ReturnT>(curr_val);
-            const auto selected_val_literal =
-                Literal::CreateR0<ReturnT>(*selected_val);
-
-            const std::vector<const Literal*> args = {
-                selected_val_literal.get(), curr_val_literal.get()};
-            std::unique_ptr<Literal> computed_result =
-                embedded_evaluator.Evaluate<const Literal*>(*select, args)
-                    .ConsumeValueOrDie();
-            bool selected = !computed_result->Get<bool>({});
-            if (selected) {
-              selected_val = curr_val;
-              selected_index = operand_index;
-            }
-            embedded_evaluator.ResetVisitStates();
-          });
-
-      IterateThroughWindow(
-          window_shape, window, operand_literal.shape(), source_index,
-          [&](const std::vector<int64>& operand_index) {
-            if (std::equal(operand_index.begin(), operand_index.end(),
-                           selected_index->begin())) {
-              auto source = source_literal.Get<ReturnT>(source_index);
-              auto scattered = result->Get<ReturnT>(operand_index);
-              const auto source_literal = Literal::CreateR0<ReturnT>(source);
-              const auto scattered_literal =
-                  Literal::CreateR0<ReturnT>(scattered);
-
-              const std::vector<const Literal*> args = {
-                  source_literal.get(), scattered_literal.get()};
-              std::unique_ptr<Literal> computed_result =
-                  embedded_evaluator.Evaluate<const Literal*>(*scatter, args)
-                      .ConsumeValueOrDie();
-              result->Set(operand_index, computed_result->Get<ReturnT>({}));
-              // Clear visit states so that the we can use the evaluator again
-              // on the same computation.
-              embedded_evaluator.ResetVisitStates();
-            }
-          });
-    } while (IndexUtil::BumpIndices(source->shape(), &source_index));
-
-    parent_->evaluated_[select_and_scatter] = std::move(result);
-    return Status::OK();
-  }
-
-  Status HandleReduceWindow(HloInstruction* reduce_window) override {
-    auto operand = reduce_window->operand(0);
-    const Window& window = reduce_window->window();
-    HloComputation* function = reduce_window->to_apply();
-    TF_ASSIGN_OR_RETURN(
-        auto inferred_return_shape,
-        ShapeInference::InferReduceWindowShape(
-            /*operand_shape=*/reduce_window->operand(0)->shape(),
-            /*init_value=*/reduce_window->operand(1)->shape(), window,
-            /*to_apply_shape=*/function->ComputeProgramShape()));
-    TF_RET_CHECK(
-        ShapeUtil::Compatible(reduce_window->shape(), inferred_return_shape))
-        << "return shape is set to: "
-        << ShapeUtil::HumanStringWithLayout(reduce_window->shape())
-        << "but is inferred to be: "
-        << ShapeUtil::HumanStringWithLayout(inferred_return_shape);
-
-    const Literal& operand_literal =
-        parent_->GetEvaluatedLiteralFor(reduce_window->operand(0));
-    VLOG(3) << "HandleReduceWindow arg_literal: " << operand_literal.ToString();
-    const Literal& init_literal =
-        parent_->GetEvaluatedLiteralFor(reduce_window->operand(1));
-    VLOG(3) << "HandleReduceWindow init_literal: " << init_literal.ToString();
-    TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape()));
-    auto init_scalar = init_literal.Get<ReturnT>({});
-
-    auto result = Literal::CreateFromShape(reduce_window->shape());
-
-    // Creates a Shape object from window, for iteration below.
-    std::vector<int64> window_dimension_sizes;
-    for (const auto& window_dimension : window.dimensions()) {
-      window_dimension_sizes.push_back(window_dimension.size());
-    }
-    const Shape window_shape = ShapeUtil::MakeShape(
-        operand->shape().element_type(), window_dimension_sizes);
-
-    DimensionVector window_index(window.dimensions_size());
-    DimensionVector operand_index(ShapeUtil::Rank(operand_literal.shape()));
-
-    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
-    // For each resulting dimension, calculate and assign computed value.
-    TF_RETURN_IF_ERROR(
-        result->Populate<ReturnT>([&](ArraySlice<int64> output_index) {
-          ReturnT result_val = init_scalar;
-
-          std::fill(window_index.begin(), window_index.end(), 0);
-          std::fill(operand_index.begin(), operand_index.end(), 0);
-
-          IterateThroughWindow(
-              window_shape, window, operand_literal.shape(), output_index,
-              [&](const std::vector<int64>& operand_index) {
-                auto curr_val = operand_literal.Get<ReturnT>(operand_index);
-
-                // Evaluate computation with specified literal operands.
-                const auto curr_val_literal =
-                    Literal::CreateR0<ReturnT>(curr_val);
-                const auto result_val_literal =
-                    Literal::CreateR0<ReturnT>(result_val);
-                const std::vector<const Literal*> args = {
-                    result_val_literal.get(), curr_val_literal.get()};
-                std::unique_ptr<Literal> computed_result =
-                    embedded_evaluator.Evaluate<const Literal*>(*function, args)
-                        .ConsumeValueOrDie();
-
-                // Clear visit states so that the we can use the evaluate again
-                // on the same computation.
-                embedded_evaluator.ResetVisitStates();
-
-                result_val = computed_result->Get<ReturnT>({});
-              });
-
-          return result_val;
-        }));
-
-    parent_->evaluated_[reduce_window] = std::move(result);
-    return Status::OK();
-  }
-
-  Status HandleSlice(HloInstruction* slice) override {
-    auto operand = slice->operand(0);
-    const Shape& shape = slice->shape();
-    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
-                        ShapeInference::InferSliceShape(
-                            operand->shape(), slice->slice_starts(),
-                            slice->slice_limits(), slice->slice_strides()));
-    TF_RET_CHECK(ShapeUtil::Compatible(shape, inferred_return_shape))
-        << "return shape set to: " << ShapeUtil::HumanString(shape)
-        << " but is inferred to be: "
-        << ShapeUtil::HumanString(inferred_return_shape);
-
-    const int64 rank = ShapeUtil::Rank(operand->shape());
-    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
-    auto func = [&](ArraySlice<int64> out_index) {
-      DimensionVector operand_index(rank);
-      for (int64 i = 0; i < rank; ++i) {
-        operand_index[i] =
-            slice->slice_starts(i) + out_index[i] * slice->slice_strides(i);
-      }
-      return operand_literal.Get<ReturnT>(operand_index);
-    };
-
-    auto result = Literal::CreateFromDimensions(
-        shape.element_type(), AsInt64Slice(shape.dimensions()));
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(func));
-    parent_->evaluated_[slice] = std::move(result);
-    return Status::OK();
-  }
-
-  // Enable CLZ only for int32 and uint32.
-  template <
-      typename NativeT,
-      typename std::enable_if<
-          (std::is_floating_point<NativeT>::value ||
-           std::is_integral<NativeT>::value || is_complex_t<NativeT>::value) &&
-          !(std::is_same<NativeT, uint32>::value ||
-            std::is_same<NativeT, int32>::value)>::type* = nullptr>
-  Status HandleClz(HloInstruction* clz) {
-    return InvalidArgument("Unsupported type for Clz");
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_same<NativeT, uint32>::value ||
-                std::is_same<NativeT, int32>::value>::type* = nullptr>
-  Status HandleClz(HloInstruction* clz) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz],
-                        ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) {
-                          return 31 - tensorflow::Log2Floor(elem_operand);
-                        }));
-    return Status::OK();
-  }
-
-  Status HandleClz(HloInstruction* clz) override {
-    return HandleClz<ElementwiseT>(clz);
-  }
-
-  template <typename NativeT, typename std::enable_if<std::is_floating_point<
-                                  NativeT>::value>::type* = nullptr>
-  Status HandleSin(HloInstruction* sin) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sin],
-                        ElementWiseUnaryOp(sin, [](ElementwiseT elem_operand) {
-                          return std::sin(elem_operand);
-                        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<std::is_integral<NativeT>::value ||
-                              is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleSin(HloInstruction* sin) {
-    return InvalidArgument("Unsupported type for Sin");
-  }
-
-  Status HandleSin(HloInstruction* sin) override {
-    return HandleSin<ElementwiseT>(sin);
-  }
-
-  template <typename NativeT, typename std::enable_if<std::is_floating_point<
-                                  NativeT>::value>::type* = nullptr>
-  Status HandleCos(HloInstruction* cos) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[cos],
-                        ElementWiseUnaryOp(cos, [](ElementwiseT elem_operand) {
-                          return std::cos(elem_operand);
-                        }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<std::is_integral<NativeT>::value ||
-                              is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleCos(HloInstruction* cos) {
-    return InvalidArgument("Unsupported type for Cos");
-  }
-
-  Status HandleCos(HloInstruction* cos) override {
-    return HandleCos<ElementwiseT>(cos);
-  }
-
-  template <typename NativeT, typename std::enable_if<std::is_same<
-                                  float, NativeT>::value>::type* = nullptr>
-  Status HandleReducePrecision(HloInstruction* reduce_precision) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[reduce_precision],
-        ElementWiseUnaryOp(reduce_precision, [reduce_precision](
-                                                 ElementwiseT elem) {
-          uint32_t value_as_int = tensorflow::bit_cast<uint32_t>(elem);
-          const uint32_t mantissa_bits = reduce_precision->mantissa_bits();
-          const uint32_t exponent_bits = reduce_precision->exponent_bits();
-
-          // Code is based on the CPU/GPU implementation in LLVM-emitting code.
-          //
-          // Bits in float type:
-          //   mantissa : bits [0:22]
-          //   exponent : bits [23:30]
-          //   sign     : bits [31]
-          if (mantissa_bits < 23) {
-            const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits);
-
-            // Compute rounding bias for round-to-nearest with ties to even.
-            // This is equal to a base value of 0111... plus one bit if the last
-            // remaining mantissa bit is 1.
-            const uint32_t base_rounding_bias =
-                (last_mantissa_bit_mask >> 1) - 1;
-            const uint32_t x_last_mantissa_bit =
-                (value_as_int & last_mantissa_bit_mask) >> (23 - mantissa_bits);
-            const uint32_t x_rounding_bias =
-                x_last_mantissa_bit + base_rounding_bias;
-
-            // Add rounding bias, and mask out truncated bits.  Note that the
-            // case where adding the rounding bias overflows into the exponent
-            // bits is correct; the non-masked mantissa bits will all be zero,
-            // and the exponent will be incremented by one.
-            const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
-            value_as_int = value_as_int + x_rounding_bias;
-            value_as_int = value_as_int & truncation_mask;
-          }
-          if (exponent_bits < 8) {
-            // Masks for f32 values.
-            const uint32_t f32_sign_bit_mask = 1u << 31;
-            const uint32_t f32_exp_bits_mask = 0xffu << 23;
-
-            // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the
-            // most- significant bit -- is equal to 1.0f for all exponent sizes.
-            // Adding 2^(n-1)-1 to this gives us the highest non-infinite
-            // exponent for a bit- size of n, and subtracting 2^(n-1)-1 from
-            // this gives us the lowest' exponent (corresponding to 0.0f).
-            //
-            // Thus, the f32 exponent corresponding to the highest non-infinite
-            // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
-            // exponent corresponding to the lowest exponent for a bit size of n
-            // is (2^7-1) - 2^(n-1)-1.
-            //
-            // Note that we have already checked that exponents_bits >= 1.
-            const uint32_t f32_exponent_bias = (1 << 7) - 1;
-            const uint32_t reduced_exponent_bias =
-                (1 << (exponent_bits - 1)) - 1;
-            const uint32_t reduced_max_exponent =
-                f32_exponent_bias + reduced_exponent_bias;
-            const uint32_t reduced_min_exponent =
-                f32_exponent_bias - reduced_exponent_bias;
-
-            // Do we overflow or underflow?
-            const uint32_t x_exponent = value_as_int & f32_exp_bits_mask;
-            const bool x_overflows = x_exponent > (reduced_max_exponent << 23);
-            const bool x_underflows =
-                x_exponent <= (reduced_min_exponent << 23);
-
-            // Compute appropriately-signed values of zero and infinity.
-            const uint32_t x_signed_zero = value_as_int & f32_sign_bit_mask;
-            const uint32_t x_signed_inf = x_signed_zero | f32_exp_bits_mask;
-
-            // Force to zero or infinity if overflow or underflow.  (Note that
-            // this truncates all denormal values to zero, rather than rounding
-            // them.)
-            value_as_int = x_overflows ? x_signed_inf : value_as_int;
-            value_as_int = x_underflows ? x_signed_zero : value_as_int;
-          }
-
-          float reduced_result = tensorflow::bit_cast<float>(value_as_int);
-          if (std::isnan(elem)) {
-            reduced_result = mantissa_bits > 0
-                                 ? elem
-                                 : std::numeric_limits<float>::infinity();
-          }
-          return reduced_result;
-        }));
-    return Status::OK();
-  }
-
-  template <typename NativeT, typename std::enable_if<std::is_same<
-                                  double, NativeT>::value>::type* = nullptr>
-  Status HandleReducePrecision(HloInstruction* reduce_precision) {
-    return InvalidArgument("Double not supported for reduce precision");
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<std::is_integral<NativeT>::value ||
-                              is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleReducePrecision(HloInstruction* reduce_precision) {
-    return InvalidArgument("Unsupported type for reduce precision");
-  }
-
-  Status HandleReducePrecision(HloInstruction* reduce_precision) override {
-    return HandleReducePrecision<ElementwiseT>(reduce_precision);
-  }
-
- private:
-  template <typename IndexT>
-  StatusOr<std::unique_ptr<Literal>> DynamicSlice(
-      const Literal& operand_literal, const Literal& start_indices_literal,
-      const Shape& result_shape) {
-    auto start_indices_typed = start_indices_literal.data<IndexT>();
-    std::vector<int64> start(start_indices_typed.begin(),
-                             start_indices_typed.end());
-
-    std::vector<int64> operand_indices(start.size());
-
-    auto result = Literal::CreateFromShape(result_shape);
-    TF_RETURN_IF_ERROR(
-        result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
-          for (int64 i = 0; i < operand_indices.size(); ++i) {
-            CHECK_GE(multi_index[i] + start[i], 0);
-            // Mod is only used here to be consistent with the existing
-            // backends' behavior.
-            operand_indices[i] = (multi_index[i] + start[i]) %
-                                 operand_literal.shape().dimensions(i);
-          }
-
-          auto result = operand_literal.Get<ReturnT>(operand_indices);
-          return result;
-        }));
-
-    return std::move(result);
-  }
-
-  template <typename IndexT>
-  StatusOr<std::unique_ptr<Literal>> DynamicUpdateSlice(
-      const Literal& operand_literal, const Literal& update_literal,
-      const Literal& start_indices_literal) {
-    auto result = operand_literal.CloneToUnique();
-    auto start_indices_typed = start_indices_literal.data<IndexT>();
-    const auto rank = ShapeUtil::Rank(result->shape());
-    std::vector<int64> start(rank, 0);
-    for (int64 i = 0; i < rank; ++i) {
-      // All other implementations currently wrap-around the index, so this
-      // should do so as well.
-      start[i] = (start_indices_typed[i] % result->shape().dimensions(i));
-      start[i] += (start[i] < 0) * result->shape().dimensions(i);
-    }
-    std::vector<int64> result_index(rank, 0);
-
-    auto func = [&](ArraySlice<int64> update_index) {
-      std::transform(update_index.begin(), update_index.end(), start.begin(),
-                     result_index.begin(), std::plus<int64>());
-      // Same as above, wrap-around only to match other implementations'
-      // semantics.
-      std::transform(result_index.begin(), result_index.end(),
-                     result->shape().dimensions().begin(), result_index.begin(),
-                     std::modulus<int64>());
-      result->Set<ReturnT>(result_index,
-                           update_literal.Get<ReturnT>(update_index));
-      return true;
-    };
-
-    std::vector<int64> base(update_literal.shape().dimensions_size(), 0);
-    std::vector<int64> step(update_literal.shape().dimensions_size(), 1);
-    ShapeUtil::ForEachIndex(update_literal.shape(), base,
-                            AsInt64Slice(update_literal.shape().dimensions()),
-                            step, func);
-
-    return std::move(result);
-  }
-
-  StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOp(
-      HloInstruction* instruction,
-      const std::function<ElementwiseT(ElementwiseT)>& unary_op) {
-    const Literal& operand_literal =
-        parent_->GetEvaluatedLiteralFor(instruction->operand(0));
-    TF_ASSIGN_OR_RETURN(
-        auto result_literal,
-        (ElementWiseUnaryOpImpl<ReturnT, ReturnT>(
-            instruction, ConvertUnaryFunction(unary_op), operand_literal)));
-
-    return std::move(result_literal);
-  }
-
-  StatusOr<std::unique_ptr<Literal>> ElementWiseBinaryOp(
-      HloInstruction* instruction,
-      const std::function<ElementwiseT(ElementwiseT, ElementwiseT)>&
-          binary_op) {
-    const auto shape = instruction->shape();
-    const auto* lhs = instruction->operand(0);
-    const auto* rhs = instruction->operand(1);
-
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast
-    // is removed.
-    if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) &&
-          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
-      return Unimplemented(
-          "Implicit broadcasting is currently unsupported in HLO evaluator "
-          "Shape Mismatch: %s vs %s vs %s: ",
-          ShapeUtil::HumanString(shape).c_str(),
-          ShapeUtil::HumanString(lhs->shape()).c_str(),
-          ShapeUtil::HumanString(rhs->shape()).c_str());
-    }
-
-    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
-    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
-
-    auto result = Literal::CreateFromShape(shape);
-
-    TF_RETURN_IF_ERROR(
-        result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
-          return ConvertBinaryFunction(binary_op)(
-              lhs_literal.Get<ReturnT>(multi_index),
-              rhs_literal.Get<ReturnT>(multi_index));
-        }));
-    return std::move(result);
-  }
-
-  template <typename LhsType, typename RhsType, typename EhsType>
-  StatusOr<std::unique_ptr<Literal>> ElementwiseTernaryOp(
-      HloInstruction* instruction,
-      const std::function<ReturnT(LhsType, RhsType, EhsType)>& ternary_op) {
-    const auto shape = instruction->shape();
-    const auto* lhs = instruction->operand(0);
-    const auto* rhs = instruction->operand(1);
-    const auto* ehs = instruction->operand(2);
-
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit
-    // broadcast is removed.
-    if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) &&
-          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) &&
-          ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) {
-      return Unimplemented(
-          "Implicit broadcasting is currently unsupported in HLO evaluator "
-          "Shape Mismatch: %s vs %s vs %s vs %s: ",
-          ShapeUtil::HumanString(shape).c_str(),
-          ShapeUtil::HumanString(lhs->shape()).c_str(),
-          ShapeUtil::HumanString(rhs->shape()).c_str(),
-          ShapeUtil::HumanString(ehs->shape()).c_str());
-    }
-
-    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
-    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
-    const Literal& ehs_literal = parent_->GetEvaluatedLiteralFor(ehs);
-
-    auto result = Literal::CreateFromShape(shape);
-
-    TF_RETURN_IF_ERROR(
-        result->Populate<ReturnT>([&](ArraySlice<int64> multi_index) {
-          return ternary_op(lhs_literal.Get<LhsType>(multi_index),
-                            rhs_literal.Get<RhsType>(multi_index),
-                            ehs_literal.Get<EhsType>(multi_index));
-        }));
-
-    return std::move(result);
-  }
-
-  template <typename NativeT>
-  static bool IsShiftOutOfBounds(NativeT rhs) {
-    typedef typename std::make_unsigned<NativeT>::type UnsignedT;
-    UnsignedT lhs_size_unsigned = sizeof(NativeT) * CHAR_BIT;
-    UnsignedT rhs_unsigned = static_cast<UnsignedT>(rhs);
-    return rhs_unsigned >= lhs_size_unsigned;
-  }
-
-  HloEvaluator* parent_;
-};  // class HloEvaluator::TypedVisitor
 
 HloEvaluator::HloEvaluator(int64 max_loop_iterations)
     : max_loop_iterations_(max_loop_iterations) {
-  typed_visitors_[PRED] = MakeUnique<TypedVisitor<bool>>(this);
-  typed_visitors_[U8] = MakeUnique<TypedVisitor<uint8>>(this);
+  typed_visitors_[PRED] = MakeUnique<HloEvaluatorTypedVisitor<bool>>(this);
+  typed_visitors_[U8] = MakeUnique<HloEvaluatorTypedVisitor<uint8>>(this);
   typed_visitors_[U16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
     return Unimplemented(
-        "HloEvaluator::TypedVisitor: unhandled primitive type: U16.");
+        "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: "
+        "U16.");
   });
-  typed_visitors_[U32] = MakeUnique<TypedVisitor<uint32>>(this);
-  typed_visitors_[U64] = MakeUnique<TypedVisitor<uint64>>(this);
-  typed_visitors_[S8] = MakeUnique<TypedVisitor<int8>>(this);
+  typed_visitors_[U32] = MakeUnique<HloEvaluatorTypedVisitor<uint32>>(this);
+  typed_visitors_[U64] = MakeUnique<HloEvaluatorTypedVisitor<uint64>>(this);
+  typed_visitors_[S8] = MakeUnique<HloEvaluatorTypedVisitor<int8>>(this);
   typed_visitors_[S16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
     return Unimplemented(
-        "HloEvaluator::TypedVisitor: unhandled primitive type: S16.");
+        "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: "
+        "S16.");
   });
-  typed_visitors_[S32] = MakeUnique<TypedVisitor<int32>>(this);
-  typed_visitors_[S64] = MakeUnique<TypedVisitor<int64>>(this);
-  typed_visitors_[F16] = MakeUnique<TypedVisitor<Eigen::half, float>>(this);
-  typed_visitors_[F32] = MakeUnique<TypedVisitor<float>>(this);
-  typed_visitors_[F64] = MakeUnique<TypedVisitor<double>>(this);
-  typed_visitors_[C64] = MakeUnique<TypedVisitor<complex64>>(this);
+  typed_visitors_[S32] = MakeUnique<HloEvaluatorTypedVisitor<int32>>(this);
+  typed_visitors_[S64] = MakeUnique<HloEvaluatorTypedVisitor<int64>>(this);
+  typed_visitors_[F16] =
+      MakeUnique<HloEvaluatorTypedVisitor<Eigen::half, float>>(this);
+  typed_visitors_[F32] = MakeUnique<HloEvaluatorTypedVisitor<float>>(this);
+  typed_visitors_[F64] = MakeUnique<HloEvaluatorTypedVisitor<double>>(this);
+  typed_visitors_[C64] = MakeUnique<HloEvaluatorTypedVisitor<complex64>>(this);
 
   // Most of the evaluator computations we use don't support BF16 (e.g.,
   // std::ceil, std::tanh). To make evaluator work with BF16, we set all
   // elementwise computations to be done in F32 and do BF16<->F32 conversion
   // around the input and the output of the computations.
-  typed_visitors_[BF16] = MakeUnique<TypedVisitor<bfloat16, float>>(this);
+  typed_visitors_[BF16] =
+      MakeUnique<HloEvaluatorTypedVisitor<bfloat16, float>>(this);
 
   typed_visitors_[TUPLE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
     return Unimplemented(
-        "HloEvaluator::TypedVistor: unhandled primitive type: TUPLE.");
+        "HloEvaluatorTypedVisitor: unhandled primitive type: TUPLE.");
   });
   typed_visitors_[OPAQUE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
     return Unimplemented(
-        "HloEvaluator::TypedVisitor: unhandled primitive type: OPAQUE.");
+        "HloEvaluatorTypedVisitor: unhandled primitive type: OPAQUE.");
   });
 }
 
@@ -3009,8 +951,8 @@ Status HloEvaluator::HandleConditional(HloInstruction* conditional) {
   auto* true_computation = conditional->true_computation();
   auto* false_computation = conditional->false_computation();
 
-  auto result = Literal::CreateFromShape(conditional->shape());
   HloEvaluator embedded_evaluator;
+  std::unique_ptr<Literal> result;
   if (pred.Get<bool>({})) {
     result = embedded_evaluator
                  .Evaluate<const Literal*>(*true_computation,
@@ -3034,7 +976,7 @@ Status HloEvaluator::HandleSelect(HloInstruction* select) {
 
   // If predicate is of scalar type, no element-wise selection would be needed.
   // This would also handle output array of tuple types as the DefaultAction
-  // would go through the TypedVisitor which doesn't handle tuples.
+  // would go through the HloEvaluatorTypedVisitor which doesn't handle tuples.
   if (ShapeUtil::IsScalar(pred.shape())) {
     if (pred.Get<bool>({})) {
       evaluated_[select] = on_true.CloneToUnique();
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index c0dcee0c3e382f74de72a2b89f39e06f042e2b80..566d53a41427119ea3d429a60a4430068bc953b1 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -109,19 +110,16 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
           substitutions);
 
  protected:
-  // Templated DfsHloVisitor. Typically ReturnT here indicates the resulting
-  // literal type of each evaluated Handle* method of a TypedVisitor.
-  // There are however a few notable exceptions to this rule, notably:
-  // - HandleCompare and HandleIsFinite: where the resulting literal type is
-  // always boolean.
-  // These operations are handled outside of the parent HloEvaluator handlers
-  // instead of from within TypedVisitor.
+  // Make HloEvaluatorTypedVisitor a friend because it is logically part of this
+  // class.
   //
-  // Type params:
-  //   - ReturnT: The type of input and output of each operation.
-  //   - ElementwiseT: The type in which internal computation are done.
-  template <typename ReturnT, typename ElementwiseT = ReturnT>
-  class TypedVisitor;
+  // A straightforward implementation would be to make it a nested class
+  // declared and defined in hlo_evaluator.cc.  Instead HloEvaluatorTypedVisitor
+  // lives as a separate class with its own header because its template gets
+  // instantiated many times and we want to use extern templates to shard out
+  // the compilation of those instantiations across multiple cc files.
+  template <typename ReturnT, typename ElementwiseT>
+  friend class HloEvaluatorTypedVisitor;
 
   // Wraps around instruction handling to infer types before dispatching to
   // the corresponding typed Visitor.
@@ -168,7 +166,6 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
 
   Status HandleSelect(HloInstruction* select) override;
 
- private:
   // Returns the already-evaluated literal result for the instruction.
   // A Constant instruction is considered evaluated and its literal will be
   // returned directly without looking up the cache.
@@ -183,14 +180,6 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
     return *(it->second);
   }
 
-  // Map from a primitive type to its associated (templated) DfsHloVisitor.
-  // Note: the hash function here is only needed because current gcc std::hash
-  // does not specialize for enum types. This should however be fixed in the
-  // future: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60970#c5
-  tensorflow::gtl::FlatMap<PrimitiveType, std::unique_ptr<DfsHloVisitor>,
-                           std::hash<int>>
-      typed_visitors_;
-
   // Tracks the HLO instruction and its evaluated literal result.
   // TODO(b/35950897): have better memory management here to free instructions
   // that are no longer a parent for any other subsequent instruction in
@@ -199,6 +188,41 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   tensorflow::gtl::FlatMap<const HloInstruction*, std::unique_ptr<Literal>>
       evaluated_;
 
+ private:
+  template <typename ReturnT, typename NativeT>
+  static StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOpImpl(
+      HloInstruction* instruction,
+      const std::function<ReturnT(NativeT)>& unary_op,
+      const Literal& operand_literal) {
+    const auto shape = instruction->shape();
+    const auto* operand = instruction->operand(0);
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+    // removed.
+    if (!ShapeUtil::SameDimensions(shape, operand->shape())) {
+      return Unimplemented(
+          "Implicit broadcasting is currently unsupported in HLO evaluator "
+          "Shape Mismatch: %s vs %s",
+          ShapeUtil::HumanString(shape).c_str(),
+          ShapeUtil::HumanString(operand->shape()).c_str());
+    }
+
+    auto result = MakeUnique<Literal>(shape);
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          return unary_op(operand_literal.Get<NativeT>(multi_index));
+        }));
+    return std::move(result);
+  }
+
+  // Map from a primitive type to its associated (templated) DfsHloVisitor.
+  // Note: the hash function here is only needed because current gcc std::hash
+  // does not specialize for enum types. This should however be fixed in the
+  // future: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60970#c5
+  tensorflow::gtl::FlatMap<PrimitiveType, std::unique_ptr<DfsHloVisitor>,
+                           std::hash<int>>
+      typed_visitors_;
+
   // Caches pointers to input literals, assuming they are in post-order.
   // Literals are not owned by this class, and they must outlive the lifetime of
   // each invocation to the Evaluate* method.
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index cc16446778cbeac5ec4bed110adc9be8620084fe..ae5b5e0412ef99db9b72d645a954759ca0b9eb8b 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -82,9 +82,9 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
     auto element_type = expected->shape().element_type();
     if (element_type == F32 || element_type == F64) {
       ErrorSpec error(aabs);
-      LiteralTestUtil::ExpectNear(*expected, *result, error);
+      EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, error));
     } else {
-      LiteralTestUtil::ExpectEqual(*expected, *result);
+      EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
     }
   }
 
@@ -100,7 +100,7 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
 
     std::unique_ptr<Literal> result = Evaluate();
 
-    LiteralTestUtil::ExpectEqual(*expected, *result);
+    EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
   }
 
   bool use_bfloat16_;
@@ -129,7 +129,7 @@ TEST_P(HloEvaluatorTest, DoesClamp) {
 
   auto expected = Literal::CreateR2<float>({{0, 4}, {2, 4}});
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
@@ -150,7 +150,7 @@ TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
 
   auto expected = Literal::CreateR2<float>({{0, 0}, {1, 1}});
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs select
@@ -175,7 +175,7 @@ TEST_P(HloEvaluatorTest, DoesSelect) {
 
   auto expected = Literal::CreateR2<float>({{2, 5}, {0, 4}});
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
@@ -307,7 +307,7 @@ TEST_P(HloEvaluatorTest, DoesTraverseInstructions) {
 
   auto expected = Literal::CreateR2<int64>({{4, -16}, {-196, 12}});
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 // Verifies Reshape operation is correctly evaluated.
@@ -315,7 +315,7 @@ TEST_P(HloEvaluatorTest, DoesReshape) {
   HloComputation::Builder b(TestName());
   const int64 dimensions[] = {11, 8, 7, 5, 9};
   TF_ASSERT_OK_AND_ASSIGN(auto literal,
-                          LiteralTestUtil::CreateRandomLiteral<F32>(
+                          Literal::CreateRandomLiteral<F32>(
                               ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
   auto literal_clone = literal->CloneToUnique();
   HloInstruction* literal_instruction =
@@ -351,7 +351,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcast) {
 
   std::unique_ptr<Literal> result = Evaluate({});
 
-  LiteralTestUtil::ExpectEqual(*result, *output_literal);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*result, *output_literal));
 }
 
 TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
@@ -370,7 +370,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
 
   std::unique_ptr<Literal> result = Evaluate({});
 
-  LiteralTestUtil::ExpectEqual(*result, *output_literal);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*result, *output_literal));
 }
 
 TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
@@ -392,7 +392,7 @@ TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
 
   auto expected =
       Literal::CreateR2<int64>({{-1, -2}, {100, 200}, {-2, -3}, {-100, -200}});
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
@@ -413,7 +413,7 @@ TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR1<int64>({100, 200});
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
@@ -432,7 +432,7 @@ TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
 
   std::unique_ptr<Literal> result = Evaluate();
 
-  LiteralTestUtil::ExpectEqual(*result, *expected);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected));
 }
 
 TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) {
@@ -452,7 +452,7 @@ TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) {
 
   std::unique_ptr<Literal> result = Evaluate();
 
-  LiteralTestUtil::ExpectEqual(*result, *expected);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected));
 }
 
 PaddingConfig CreatePaddingConfig(
@@ -490,7 +490,7 @@ TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   auto expected = Literal::CreateR2<int32>(
       {{10, 10}, {10, 10}, {10, 10}, {10, 10}, {10, 10}});
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
@@ -525,7 +525,7 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
 
   auto expected = Literal::CreateR4FromArray4D<float>(*expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, NegativePadding2D) {
@@ -567,7 +567,7 @@ TEST_P(HloEvaluatorTest, NegativePadding2D) {
   (*expected_array)(0, 4) = 2.718f;
   auto expected = Literal::CreateR2FromArray2D<float>(*expected_array);
 
-  LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(0x1.0P-5));
+  EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(0x1.0P-5)));
 }
 
 TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
@@ -606,7 +606,7 @@ TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
   auto expected_array = MakeUnique<Array2D<float>>(0, 9);
   auto expected = Literal::CreateR2FromArray2D<float>(*expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
@@ -651,7 +651,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
   // clang-format on
   auto expected = Literal::CreateR2FromArray2D<float>(expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
@@ -688,7 +688,7 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
 
   auto expected = Literal::CreateR1<float>({22.f, 28.f});
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
@@ -737,7 +737,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
   });
   auto expected = Literal::CreateR2FromArray2D<float>(expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, SimpleConv1D) {
@@ -785,7 +785,7 @@ TEST_P(HloEvaluatorTest, SimpleConv1D) {
   Array3D<float> expected_array = {{{11.f, 18.f, 9.f}}};
   auto expected = Literal::CreateR3FromArray3D<float>(expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
@@ -847,7 +847,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   // clang-format on
   auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
@@ -927,7 +927,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
   auto expected = Literal::CreateR4FromArray4D<float>(
       use_bfloat16_ ? expected_array_bf16 : expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
@@ -1004,7 +1004,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
   auto expected = Literal::CreateR4FromArray4D<float>(
       use_bfloat16_ ? expected_array_bf16 : expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
@@ -1067,7 +1067,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   }));
   auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
@@ -1131,7 +1131,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   }));
   auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest,
@@ -1203,7 +1203,7 @@ TEST_P(HloEvaluatorTest,
   }));
   auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 class HloEvaluatorPreciseReduceTest : public HloVerifiedTestBase {};
@@ -1319,7 +1319,7 @@ TEST_P(HloEvaluatorTest, ReduceAdd) {
 
   auto expected = Literal::CreateR1<float>({6, 18});
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, ReduceWindowMax) {
@@ -1370,7 +1370,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) {
   std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({{6, 7}});
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
@@ -1427,7 +1427,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
   std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({{1, 3, 5}, {5, 11, 13}});
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
@@ -1490,7 +1490,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
   std::vector<int64> output_dims = {4, 3, 3, 3, 4, 4};
   std::unique_ptr<Literal> result_literal =
       Literal::CreateFullWithDescendingLayout<float>(output_dims, 8.0f);
-  LiteralTestUtil::ExpectEqual(*result_literal, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*result_literal, *result));
 }
 
 TEST_P(HloEvaluatorTest, StridedSlice) {
@@ -1523,7 +1523,7 @@ TEST_P(HloEvaluatorTest, StridedSlice) {
       {19},
   });
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, DynamicSlice) {
@@ -1556,7 +1556,7 @@ TEST_P(HloEvaluatorTest, DynamicSlice) {
       {6, 7, 8},
   });
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 // Verifies that the HloEvaluator's implementation goes along with existing
@@ -1591,7 +1591,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
       {6, 7, 8},
   });
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
@@ -1627,7 +1627,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
       {5, -6, -7},
   });
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, SetAndGetTuples) {
@@ -1662,7 +1662,7 @@ TEST_P(HloEvaluatorTest, SetAndGetTuples) {
       {5, 6, 7},
   });
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
@@ -1703,7 +1703,7 @@ TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
       result_inner_literal.get(),
   });
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, Reverse) {
@@ -1756,7 +1756,7 @@ TEST_P(HloEvaluatorTest, Reverse) {
   });
   // clang-format on
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) {
@@ -1776,8 +1776,8 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) {
       add, {{param0, Literal::CreateR1<float>({1, 2, 3, 4}).get()},
             {square, Literal::CreateR1<float>({10, 20, 30, 40}).get()}});
   TF_ASSERT_OK(result.status());
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR1<float>({11, 22, 33, 44}),
-                               *result.ValueOrDie());
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *Literal::CreateR1<float>({11, 22, 33, 44}), *result.ValueOrDie()));
 }
 
 // Check that EvaluateWithSubstitutions works if one of the operands to the op
@@ -1800,8 +1800,8 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
   auto result = evaluator.EvaluateWithSubstitutions(
       add, {{square, Literal::CreateR1<float>({10, 20, 30, 40}).get()}});
   TF_ASSERT_OK(result.status());
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR1<float>({11, 22, 33, 44}),
-                               *result.ValueOrDie());
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *Literal::CreateR1<float>({11, 22, 33, 44}), *result.ValueOrDie()));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV1) {
@@ -1823,9 +1823,9 @@ ENTRY main {
   std::unique_ptr<Literal> operand =
       Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({0, 2});
-  LiteralTestUtil::ExpectEqual(
-      *Literal::CreateR2<int32>({{1, 2, 3}, {7, 8, 9}}),
-      *Evaluate({operand.get(), gather_indices.get()}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR2<int32>({{1, 2, 3}, {7, 8, 9}}),
+                             *Evaluate({operand.get(), gather_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) {
@@ -1847,9 +1847,9 @@ ENTRY main {
   std::unique_ptr<Literal> operand =
       Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({0, 2});
-  LiteralTestUtil::ExpectEqual(
+  EXPECT_TRUE(LiteralTestUtil::Equal(
       *Literal::CreateR2<int32>({{1, 3}, {4, 6}, {7, 9}}),
-      *Evaluate({operand.get(), gather_indices.get()}));
+      *Evaluate({operand.get(), gather_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) {
@@ -1872,10 +1872,10 @@ ENTRY main {
       Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   std::unique_ptr<Literal> gather_indices =
       Literal::CreateR2<int32>({{0, 2}, {2, 1}});
-  LiteralTestUtil::ExpectEqual(
+  EXPECT_TRUE(LiteralTestUtil::Equal(
       *Literal::CreateR3<int32>(
           {{{1, 3}, {4, 6}, {7, 9}}, {{3, 2}, {6, 5}, {9, 8}}}),
-      *Evaluate({operand.get(), gather_indices.get()}));
+      *Evaluate({operand.get(), gather_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) {
@@ -1900,9 +1900,9 @@ ENTRY main {
                                 {{-7, 7}, {-8, 8}, {-9, 9}}});
   std::unique_ptr<Literal> gather_indices =
       Literal::CreateR2<int32>({{0, 0}, {1, 0}});
-  LiteralTestUtil::ExpectEqual(
-      *Literal::CreateR2<int32>({{-1, 1}, {-4, 4}}),
-      *Evaluate({operand.get(), gather_indices.get()}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR2<int32>({{-1, 1}, {-4, 4}}),
+                             *Evaluate({operand.get(), gather_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest,
@@ -1928,9 +1928,9 @@ ENTRY main {
                                 {{-7, 7}, {-8, 8}, {-9, 9}}});
   std::unique_ptr<Literal> gather_indices =
       Literal::CreateR2<int32>({{0, 0}, {1, 0}});
-  LiteralTestUtil::ExpectEqual(
-      *Literal::CreateR2<int32>({{-2, 2}, {-1, 1}}),
-      *Evaluate({operand.get(), gather_indices.get()}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR2<int32>({{-2, 2}, {-1, 1}}),
+                             *Evaluate({operand.get(), gather_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_DynamicSlice) {
@@ -1952,9 +1952,9 @@ ENTRY main {
   std::unique_ptr<Literal> operand =
       Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({1, 1});
-  LiteralTestUtil::ExpectEqual(
-      *Literal::CreateR2<int32>({{5}}),
-      *Evaluate({operand.get(), gather_indices.get()}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR2<int32>({{5}}),
+                             *Evaluate({operand.get(), gather_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) {
@@ -1977,9 +1977,9 @@ ENTRY main {
       Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   std::unique_ptr<Literal> gather_indices =
       Literal::CreateR2<int32>({{2, 1}, {1, 1}});
-  LiteralTestUtil::ExpectEqual(
-      *Literal::CreateR3<int32>({{{8}}, {{5}}}),
-      *Evaluate({operand.get(), gather_indices.get()}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR3<int32>({{{8}}, {{5}}}),
+                             *Evaluate({operand.get(), gather_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) {
@@ -2000,9 +2000,34 @@ ENTRY main {
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand = Literal::CreateR2<int32>({{}, {}, {}});
   std::unique_ptr<Literal> gather_indices = Literal::CreateR1<int32>({0, 2});
-  LiteralTestUtil::ExpectEqual(
-      *Literal::CreateR2<int32>({{}, {}}),
-      *Evaluate({operand.get(), gather_indices.get()}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR2<int32>({{}, {}}),
+                             *Evaluate({operand.get(), gather_indices.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) {
+  const string hlo_text = R"(
+HloModule GatherXd
+
+ENTRY main {
+  operand = s32[3] parameter(0)
+  indices = s32[2,2,1] parameter(1)
+  ROOT gather = s32[2,2] gather(operand, indices),
+      output_window_dims={},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=2,
+      window_bounds={1}
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+
+  std::unique_ptr<Literal> operand = Literal::CreateR1<int32>({0, 1, 2});
+  std::unique_ptr<Literal> gather_indices =
+      Literal::CreateR3<int32>({{{0}, {1}}, {{2}, {1}}});
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR2<int32>({{0, 1}, {2, 1}}),
+                             *Evaluate({operand.get(), gather_indices.get()})));
 }
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
new file mode 100644
index 0000000000000000000000000000000000000000..024e8751f79b8b73cb868f6cbd4603f3e94ca7ea
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -0,0 +1,2170 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/core/lib/core/casts.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+
+namespace xla {
+
+// TODO(b/79274244): We'd like these type traits to live inside of
+// HloEvaluatorTypedVisitor so they don't pollute namespace xla, but that
+// crashes clang in the frontend.
+//
+// Anyway this is relatively safe as-is because hlo_evaluator_typed_visitor.h is
+// a "private" header that's not exposed outside of hlo_evaluator.cc.
+template <typename T>
+using is_complex_t = std::is_same<T, complex64>;
+template <typename T>
+using is_complex64_t = std::is_same<T, complex64>;
+
+// Templated DfsHloVisitor for use by HloEvaluator.
+//
+// Typically ReturnT here indicates the resulting literal type of each evaluated
+// Handle* method of a TypedVisitor.  There are however a few notable exceptions
+// to this rule, notably:
+// - HandleCompare and HandleIsFinite: where the resulting literal type is
+//   always boolean.
+// These operations are handled outside of the parent HloEvaluator handlers
+// instead of from within TypedVisitor.
+//
+// Type params:
+//   - ReturnT: The type of input and output of each operation.
+//   - ElementwiseT: The type in which internal computation are done.
+//
+// This a logically a private part of HloEvaluator.  It lives in this header
+// file rather than in hlo_evaluator.cc because we use extern templates and a
+// bunch of independent cc files to speed up compiling the many instantiations
+// of this class.
+template <typename ReturnT, typename ElementwiseT = ReturnT>
+class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
+ public:
+  explicit HloEvaluatorTypedVisitor(HloEvaluator* p) : parent_(p) {}
+
+  // The following higher-order functions convert a function with ElementwiseT
+  // to a function with ReturnT.
+  std::function<ReturnT(ReturnT)> ConvertUnaryFunction(
+      const std::function<ElementwiseT(ElementwiseT)>& unary_op) {
+    return [&unary_op](ReturnT arg) {
+      return static_cast<ReturnT>(unary_op(static_cast<ElementwiseT>(arg)));
+    };
+  }
+  std::function<ReturnT(ReturnT, ReturnT)> ConvertBinaryFunction(
+      const std::function<ElementwiseT(ElementwiseT, ElementwiseT)>&
+          binary_op) {
+    return [&binary_op](ReturnT arg1, ReturnT arg2) {
+      return static_cast<ReturnT>(binary_op(static_cast<ElementwiseT>(arg1),
+                                            static_cast<ElementwiseT>(arg2)));
+    };
+  }
+  std::function<ReturnT(ReturnT, ReturnT, ReturnT)> ConvertTernaryFunction(
+      const std::function<ElementwiseT(ElementwiseT, ElementwiseT,
+                                       ElementwiseT)>& ternary_op) {
+    return [&ternary_op](ReturnT arg1, ReturnT arg2, ReturnT arg3) {
+      return static_cast<ReturnT>(ternary_op(static_cast<ElementwiseT>(arg1),
+                                             static_cast<ElementwiseT>(arg2),
+                                             static_cast<ElementwiseT>(arg3)));
+    };
+  }
+
+  Status DefaultAction(HloInstruction* hlo_instruction) override {
+    return Unimplemented("unhandled HLO ops for HloEvaluator: %s.",
+                         HloOpcodeString(hlo_instruction->opcode()).c_str());
+  }
+
+  // TODO(b/35950897): many of the stl functions used in the handlers are not
+  // overloaded for every XLA primitive type.
+
+  template <typename NativeT,
+            typename std::enable_if<std::is_unsigned<NativeT>::value>::type* =
+                nullptr>
+  Status HandleAbs(HloInstruction* abs) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
+                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
+                          return elem_operand;
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_signed<NativeT>::value>::type* = nullptr>
+  Status HandleAbs(HloInstruction* abs) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
+                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
+                          return std::abs(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex64_t<NativeT>::value>::type* = nullptr>
+  Status HandleAbs(HloInstruction* abs) {
+    const Literal& operand_literal =
+        parent_->GetEvaluatedLiteralFor(abs->operand(0));
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[abs],
+        (HloEvaluator::ElementWiseUnaryOpImpl<float, NativeT>(
+            abs, [](NativeT elem_operand) { return std::abs(elem_operand); },
+            operand_literal)));
+
+    return Status::OK();
+  }
+
+  Status HandleAbs(HloInstruction* abs) override {
+    // If the operand is of C64 type, the return type of abs will be F32.
+    // However, ElementwiseT would still be the return type, F32, and thus
+    // specifying the ElementwiseT explicitly as C64 is needed below.
+    if (abs->operand(0)->shape().element_type() == C64) {
+      return HandleAbs<complex64>(abs);
+    }
+    return HandleAbs<ElementwiseT>(abs);
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleRound(HloInstruction* round) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[round],
+        ElementWiseUnaryOp(round, [](ElementwiseT elem_operand) {
+          return std::round(elem_operand);
+        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleRound(HloInstruction* round) {
+    return InvalidArgument("Unsupported type for Round");
+  }
+
+  Status HandleRound(HloInstruction* round) override {
+    return HandleRound<ReturnT>(round);
+  }
+
+  Status HandleBroadcast(HloInstruction* broadcast) override {
+    const Literal& operand_to_broadcast =
+        parent_->GetEvaluatedLiteralFor(broadcast->operand(0));
+    std::vector<int64> broadcast_indices(
+        ShapeUtil::Rank(broadcast->operand(0)->shape()), 0);
+
+    TF_RET_CHECK(broadcast->dimensions().size() ==
+                 ShapeUtil::Rank(operand_to_broadcast.shape()))
+        << "broadcast dimensions is of size: " << broadcast->dimensions().size()
+        << " and rank of operand_to_broadcast is: "
+        << ShapeUtil::Rank(operand_to_broadcast.shape());
+    // Checks that operand's dimensions are the same as the broadcast's
+    // dimensions along the dimensions to be broadcasted.
+    for (int64 i = 0; i < broadcast->dimensions().size(); ++i) {
+      TF_RET_CHECK(broadcast->shape().dimensions(broadcast->dimensions(i)) ==
+                   operand_to_broadcast.shape().dimensions(i));
+    }
+
+    auto output = MakeUnique<Literal>(broadcast->shape());
+    TF_RETURN_IF_ERROR(output->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          for (int64 i = 0; i < broadcast->dimensions().size(); ++i) {
+            broadcast_indices[i] = multi_index[broadcast->dimensions(i)];
+          }
+          return operand_to_broadcast.Get<ReturnT>(broadcast_indices);
+        }));
+    parent_->evaluated_[broadcast] = std::move(output);
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleCeil(HloInstruction* ceil) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[ceil],
+                        ElementWiseUnaryOp(ceil, [](ElementwiseT elem_operand) {
+                          return std::ceil(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleCeil(HloInstruction* ceil) {
+    return InvalidArgument("Unsupported type for Ceil");
+  }
+
+  Status HandleCeil(HloInstruction* ceil) override {
+    return HandleCeil<ReturnT>(ceil);
+  }
+
+  Status HandleConvert(HloInstruction* convert) override {
+    const HloInstruction* operand = convert->operand(0);
+    TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape()));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> result,
+                        parent_->GetEvaluatedLiteralFor(operand).Convert(
+                            convert->shape().element_type()));
+
+    if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) {
+      parent_->evaluated_[convert] = std::move(result);
+    } else {
+      parent_->evaluated_[convert] =
+          result->Relayout(convert->shape().layout());
+    }
+    return Status::OK();
+  }
+
+  Status HandleBitcastConvert(HloInstruction* convert) override {
+    const HloInstruction* operand = convert->operand(0);
+    TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape()));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> result,
+                        parent_->GetEvaluatedLiteralFor(operand).BitcastConvert(
+                            convert->shape().element_type()));
+
+    if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) {
+      parent_->evaluated_[convert] = std::move(result);
+    } else {
+      parent_->evaluated_[convert] =
+          result->Relayout(convert->shape().layout());
+    }
+    return Status::OK();
+  }
+
+  Status HandleExp(HloInstruction* exp) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp],
+                        ElementWiseUnaryOp(exp, [](ElementwiseT elem_operand) {
+                          return std::exp(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleExpm1(HloInstruction* expm1) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[expm1],
+        ElementWiseUnaryOp(expm1, [](ElementwiseT elem_operand) {
+          return std::expm1(elem_operand);
+        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleExpm1(HloInstruction* floor) {
+    return InvalidArgument("Unsupported type for Expm1");
+  }
+
+  Status HandleExpm1(HloInstruction* floor) override {
+    return HandleExpm1<ReturnT>(floor);
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleFloor(HloInstruction* floor) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[floor],
+        ElementWiseUnaryOp(floor, [](ElementwiseT elem_operand) {
+          return std::floor(elem_operand);
+        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleFloor(HloInstruction* floor) {
+    return InvalidArgument("Unsupported type for Floor");
+  }
+
+  Status HandleFloor(HloInstruction* floor) override {
+    return HandleFloor<ReturnT>(floor);
+  }
+
+  Status HandleLog(HloInstruction* log) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[log],
+                        ElementWiseUnaryOp(log, [](ElementwiseT elem_operand) {
+                          return std::log(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleLog1p(HloInstruction* expm1) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[expm1],
+        ElementWiseUnaryOp(expm1, [](ElementwiseT elem_operand) {
+          return std::log1p(elem_operand);
+        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleLog1p(HloInstruction* floor) {
+    return InvalidArgument("Unsupported type for Log1p");
+  }
+
+  Status HandleLog1p(HloInstruction* floor) override {
+    return HandleLog1p<ReturnT>(floor);
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_integral<NativeT>::value &&
+                !std::is_same<NativeT, bool>::value>::type* = nullptr>
+  Status HandleNot(HloInstruction* not_) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
+                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
+                          return ~elem_operand;
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleNot(HloInstruction* not_) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
+                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
+                          return !elem_operand;
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<std::is_same<NativeT, bool>::value>::type* =
+                nullptr>
+  Status HandleNot(HloInstruction* not_) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
+                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
+                          return !elem_operand;
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleNot(HloInstruction* not_) {
+    return InvalidArgument("Unsupported type for Not");
+  }
+
+  Status HandleNot(HloInstruction* not_) override {
+    return HandleNot<ElementwiseT>(not_);
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_signed<NativeT>::value &&
+                !std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleNegate(HloInstruction* negate) {
+    using type = typename std::make_unsigned<NativeT>::type;
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[negate],
+        ElementWiseUnaryOp(negate, [](ElementwiseT elem_operand) {
+          return NativeT(-type(elem_operand));
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                !std::is_signed<NativeT>::value ||
+                std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleNegate(HloInstruction* negate) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[negate],
+        ElementWiseUnaryOp(
+            negate, [](ElementwiseT elem_operand) { return -elem_operand; }));
+    return Status::OK();
+  }
+
+  Status HandleNegate(HloInstruction* negate) override {
+    return HandleNegate<ReturnT>(negate);
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleSign(HloInstruction* sign) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
+                        ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
+                          return (ElementwiseT(0) < elem_operand) -
+                                 (elem_operand < ElementwiseT(0));
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleSign(HloInstruction* sign) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
+                        ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
+                          auto abs_val = std::abs(elem_operand);
+                          return 0 == abs_val ? ElementwiseT(0)
+                                              : elem_operand / abs_val;
+                        }));
+    return Status::OK();
+  }
+
+  Status HandleSign(HloInstruction* sign) override {
+    return HandleSign<ReturnT>(sign);
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleAtan2(HloInstruction* atan2) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[atan2],
+                        ElementWiseBinaryOp(atan2, [](ElementwiseT lhs_elem,
+                                                      ElementwiseT rhs_elem) {
+                          return std::atan2(lhs_elem, rhs_elem);
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<!std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleAtan2(HloInstruction* atan2) {
+    return InvalidArgument("Unsupported type for Atan2");
+  }
+
+  Status HandleAtan2(HloInstruction* atan2) override {
+    return HandleAtan2<ElementwiseT>(atan2);
+  }
+
+  Status HandleTanh(HloInstruction* tanh) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[tanh],
+                        ElementWiseUnaryOp(tanh, [](ElementwiseT elem_operand) {
+                          return std::tanh(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_signed<NativeT>::value &&
+                !std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleMultiply(HloInstruction* multiply) {
+    using type = typename std::make_unsigned<NativeT>::type;
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[multiply],
+        ElementWiseBinaryOp(multiply,
+                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+                              return NativeT(type(lhs_elem) * type(rhs_elem));
+                            }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_unsigned<NativeT>::value ||
+                              std::is_floating_point<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleMultiply(HloInstruction* multiply) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[multiply],
+        ElementWiseBinaryOp(multiply,
+                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+                              return lhs_elem * rhs_elem;
+                            }));
+    return Status::OK();
+  }
+
+  Status HandleMultiply(HloInstruction* multiply) override {
+    return HandleMultiply<ElementwiseT>(multiply);
+  }
+
+  Status HandleSubtract(HloInstruction* subtract) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[subtract],
+        ElementWiseBinaryOp(subtract,
+                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+                              return lhs_elem - rhs_elem;
+                            }));
+    return Status::OK();
+  }
+
+  Status HandleAdd(HloInstruction* add) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[add],
+                        ElementWiseBinaryOp(add, [](ElementwiseT lhs_elem,
+                                                    ElementwiseT rhs_elem) {
+                          return lhs_elem + rhs_elem;
+                        }));
+    return Status::OK();
+  }
+
+  Status HandleDivide(HloInstruction* divide) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[divide],
+                        ElementWiseBinaryOp(divide, [](ElementwiseT lhs_elem,
+                                                       ElementwiseT rhs_elem) {
+                          return lhs_elem / rhs_elem;
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleMaximum(HloInstruction* maximum) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[maximum],
+        ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) {
+          return std::max(lhs, rhs);
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleMaximum(HloInstruction* maximum) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[maximum],
+        ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) {
+          return ((lhs >= rhs) || std::isnan(lhs)) ? lhs : rhs;
+        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleMaximum(HloInstruction* maximum) {
+    return InvalidArgument("Unsupported type for Maximum");
+  }
+
+  Status HandleMaximum(HloInstruction* maximum) override {
+    return HandleMaximum<ElementwiseT>(maximum);
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleMinimum(HloInstruction* minimum) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[minimum],
+                        ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el,
+                                                        ElementwiseT rhs_el) {
+                          return std::min(lhs_el, rhs_el);
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleMinimum(HloInstruction* minimum) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[minimum],
+        ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el,
+                                        ElementwiseT rhs_el) {
+          return ((lhs_el <= rhs_el) || std::isnan(lhs_el)) ? lhs_el : rhs_el;
+        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleMinimum(HloInstruction* minimum) {
+    return InvalidArgument("Unsupported type for Minimum");
+  }
+
+  Status HandleMinimum(HloInstruction* minimum) override {
+    return HandleMinimum<ElementwiseT>(minimum);
+  }
+
+  Status HandlePower(HloInstruction* power) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[power],
+                        ElementWiseBinaryOp(power, [](ElementwiseT lhs_el,
+                                                      ElementwiseT rhs_el) {
+                          return std::pow(lhs_el, rhs_el);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleRemainder(HloInstruction* remainder) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[remainder],
+                        ElementWiseBinaryOp(remainder, [](ElementwiseT lhs_el,
+                                                          ElementwiseT rhs_el) {
+                          return std::fmod(lhs_el, rhs_el);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleRemainder(HloInstruction* remainder) {
+    return InvalidArgument("Unsupported type for Remainder");
+  }
+
+  Status HandleRemainder(HloInstruction* remainder) override {
+    return HandleRemainder<ElementwiseT>(remainder);
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleAnd(HloInstruction* and_) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[and_],
+        ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
+          return lhs_el & rhs_el;
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleAnd(HloInstruction* and_) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[and_],
+        ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
+          return lhs_el && rhs_el;
+        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleAnd(HloInstruction* and_) {
+    return InvalidArgument("Unsupported type for And");
+  }
+
+  Status HandleAnd(HloInstruction* and_) override {
+    return HandleAnd<ElementwiseT>(and_);
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleOr(HloInstruction* or_) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[or_],
+        ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
+          return lhs_el | rhs_el;
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleOr(HloInstruction* or_) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[or_],
+        ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
+          return lhs_el || rhs_el;
+        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleOr(HloInstruction* or_) {
+    return InvalidArgument("Unsupported type for Or");
+  }
+
+  Status HandleOr(HloInstruction* or_) override {
+    return HandleOr<ElementwiseT>(or_);
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_integral<NativeT>::value &&
+                !std::is_same<NativeT, bool>::value>::type* = nullptr>
+  Status HandleShiftLeft(HloInstruction* shl) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[shl],
+        ElementWiseBinaryOp(shl, [](NativeT lhs_elem, NativeT rhs_elem) {
+          return IsShiftOutOfBounds<NativeT>(rhs_elem) ? 0
+                                                       : (lhs_elem << rhs_elem);
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<!std::is_integral<NativeT>::value ||
+                                    std::is_same<NativeT, bool>::value>::type* =
+                nullptr>
+  Status HandleShiftLeft(HloInstruction*) {
+    return InvalidArgument("Unsupported type for ShiftLeft");
+  }
+
+  Status HandleShiftLeft(HloInstruction* shl) override {
+    return HandleShiftLeft<ElementwiseT>(shl);
+  }
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_integral<NativeT>::value &&
+                !std::is_same<NativeT, bool>::value>::type* = nullptr>
+  Status HandleShiftRightArithmetic(HloInstruction* shr) {
+    typedef typename std::make_signed<NativeT>::type SignedT;
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[shr],
+        ElementWiseBinaryOp(shr, [](NativeT lhs_elem, NativeT rhs_elem) {
+          SignedT lhs_signed = static_cast<SignedT>(lhs_elem);
+          if (IsShiftOutOfBounds<NativeT>(rhs_elem)) {
+            return lhs_signed < 0 ? static_cast<SignedT>(-1) : 0;
+          } else {
+            return lhs_signed >> rhs_elem;
+          }
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<!std::is_integral<NativeT>::value ||
+                                    std::is_same<NativeT, bool>::value>::type* =
+                nullptr>
+  Status HandleShiftRightArithmetic(HloInstruction*) {
+    return InvalidArgument("Unsupported type for ShiftRightArithmetic");
+  }
+
+  Status HandleShiftRightArithmetic(HloInstruction* shra) override {
+    return HandleShiftRightArithmetic<ElementwiseT>(shra);
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_integral<NativeT>::value &&
+                !std::is_same<NativeT, bool>::value>::type* = nullptr>
+  Status HandleShiftRightLogical(HloInstruction* shr) {
+    typedef typename std::make_unsigned<NativeT>::type UnsignedT;
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[shr],
+        ElementWiseBinaryOp(shr, [](NativeT lhs_elem, NativeT rhs_elem) {
+          // If shift amount is greater than the number of bits, then return 0.
+          if (IsShiftOutOfBounds<NativeT>(rhs_elem)) {
+            return static_cast<NativeT>(0);
+          }
+          return static_cast<NativeT>(static_cast<UnsignedT>(lhs_elem) >>
+                                      rhs_elem);
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<!std::is_integral<NativeT>::value ||
+                                    std::is_same<NativeT, bool>::value>::type* =
+                nullptr>
+  Status HandleShiftRightLogical(HloInstruction*) {
+    return InvalidArgument("Unsupported type for ShiftRightLogical");
+  }
+
+  Status HandleShiftRightLogical(HloInstruction* shrl) override {
+    return HandleShiftRightLogical<ElementwiseT>(shrl);
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleClamp(HloInstruction* clamp) {
+    std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
+        clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
+          return std::fmin(high, std::fmax(value, low));
+        };
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[clamp],
+        ElementwiseTernaryOp(clamp,
+                             std::move(ConvertTernaryFunction(clamp_op))));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleClamp(HloInstruction*) {
+    return InvalidArgument("Unsupported type for Clamp");
+  }
+
+  Status HandleClamp(HloInstruction* clamp) override {
+    return HandleClamp<ElementwiseT>(clamp);
+  }
+
+  Status HandleSelect(HloInstruction* select) override {
+    CHECK(!ShapeUtil::IsScalar(select->operand(0)->shape()));
+    CHECK(!ShapeUtil::IsTuple(select->shape()));
+    std::function<ReturnT(bool, ReturnT, ReturnT)> select_op =
+        [](bool pred, ReturnT on_true, ReturnT on_false) {
+          if (pred) {
+            return on_true;
+          }
+          return on_false;
+        };
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[select],
+                        ElementwiseTernaryOp(select, std::move(select_op)));
+    return Status::OK();
+  }
+
+  Status HandleReverse(HloInstruction* reverse) override {
+    const auto result_shape = reverse->shape();
+    const auto reverse_dimensions = reverse->dimensions();
+
+    auto operand = reverse->operand(0);
+    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
+                        ShapeInference::InferReverseShape(operand->shape(),
+                                                          reverse_dimensions));
+
+    TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
+        << "return shape set to: " << ShapeUtil::HumanString(result_shape)
+        << " but is inferred to be: "
+        << ShapeUtil::HumanString(inferred_return_shape);
+
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+    auto result = MakeUnique<Literal>(result_shape);
+
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> out_index) {
+          std::vector<int64> from_index(out_index.begin(), out_index.end());
+          for (const int64 dim : reverse_dimensions) {
+            from_index[dim] = result_shape.dimensions(dim) - 1 - out_index[dim];
+          }
+          return operand_literal.Get<ReturnT>(from_index);
+        }));
+
+    parent_->evaluated_[reverse] = std::move(result);
+    return Status::OK();
+  }
+
+  Status HandleConvolution(HloInstruction* conv) override {
+    auto lhs = conv->operand(0);
+    auto rhs = conv->operand(1);
+    const auto& window = conv->window();
+    const Shape& result_shape = conv->shape();
+    const Shape& lhs_shape = lhs->shape();
+    const Shape& rhs_shape = rhs->shape();
+
+    TF_CHECK_OK(ShapeUtil::ValidateShape(lhs_shape));
+    TF_CHECK_OK(ShapeUtil::ValidateShape(rhs_shape));
+    CHECK(ShapeUtil::IsArray(lhs_shape));
+    CHECK(ShapeUtil::IsArray(rhs_shape));
+    CHECK(ShapeUtil::SameElementType(lhs_shape, rhs_shape));
+    CHECK(ShapeUtil::SameElementType(lhs_shape, result_shape));
+
+    const auto& dnums = conv->convolution_dimension_numbers();
+    const int64 num_spatial_dims = dnums.output_spatial_dimensions_size();
+    CHECK_EQ(num_spatial_dims, dnums.input_spatial_dimensions_size());
+    CHECK_EQ(num_spatial_dims, dnums.kernel_spatial_dimensions_size());
+    CHECK_GE(num_spatial_dims, 0);
+    CHECK_EQ(window.dimensions_size(), num_spatial_dims);
+
+    const auto lhs_rank = ShapeUtil::Rank(lhs_shape);
+    const auto rhs_rank = ShapeUtil::Rank(rhs_shape);
+
+    CHECK_EQ(num_spatial_dims + 2, lhs_rank);
+    CHECK_EQ(num_spatial_dims + 2, rhs_rank);
+
+    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
+                        ShapeInference::InferConvolveShape(lhs_shape, rhs_shape,
+                                                           window, dnums));
+    CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
+        << "return shape set to: " << ShapeUtil::HumanString(result_shape)
+        << " but is inferred to be: "
+        << ShapeUtil::HumanString(inferred_return_shape);
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+
+    std::vector<int64> window_dimension_sizes;
+    for (auto i : dnums.kernel_spatial_dimensions()) {
+      window_dimension_sizes.push_back(ShapeUtil::GetDimension(rhs_shape, i));
+    }
+
+    const Shape& window_shape =
+        ShapeUtil::MakeShape(rhs_shape.element_type(), window_dimension_sizes);
+
+    DimensionVector lhs_dim_multipliers = MakeDimMultipliers(lhs_shape);
+    DimensionVector rhs_dim_multipliers = MakeDimMultipliers(rhs_shape);
+
+    auto lhs_literal_data = lhs_literal.data<ReturnT>();
+    auto rhs_literal_data = rhs_literal.data<ReturnT>();
+
+    auto func = [&window_shape, &dnums, &lhs_shape, &rhs_shape, &window,
+                 &lhs_dim_multipliers, &rhs_dim_multipliers, lhs_literal_data,
+                 rhs_literal_data](
+                    tensorflow::gtl::ArraySlice<int64> out_index) {
+      // Dimension number applicable for input (lhs).
+      const int64 input_batch_dim = dnums.input_batch_dimension();
+      const int64 input_z_dim = dnums.input_feature_dimension();
+      // Dimension number applicable for kernel (rhs).
+      const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension();
+      const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension();
+      // Dimension number applicable for output.
+      const int64 output_batch_dim = dnums.output_batch_dimension();
+      const int64 output_z_dim = dnums.output_feature_dimension();
+
+      const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim);
+
+      ElementwiseT result_val = static_cast<ElementwiseT>(0);
+      DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(),
+                                        0);
+
+      // Convolve input feature with kernel.
+      do {
+        for (int64 iz = 0; iz < z_size; ++iz) {
+          int64 lhs_linear_index = 0;
+          lhs_linear_index += out_index[output_batch_dim] *
+                              lhs_dim_multipliers[input_batch_dim];
+          lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim];
+
+          int64 rhs_linear_index = 0;
+          rhs_linear_index += out_index[output_z_dim] *
+                              rhs_dim_multipliers[kernel_output_z_dim];
+          rhs_linear_index += iz * rhs_dim_multipliers[kernel_input_z_dim];
+
+          // Find corresponding spatial dimension index for input (lhs).
+          for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
+            // Spatial dimension number for input (lhs) and output.
+            const int64 input_spatial_dim = dnums.input_spatial_dimensions(ki);
+            const int64 output_spatial_dim =
+                dnums.output_spatial_dimensions(ki);
+
+            // Calculate lhs (input) index without taking base dilation into
+            // account.
+            const auto& window_dim = window.dimensions(ki);
+            const int64 undilated_index =
+                out_index[output_spatial_dim] * window_dim.stride() -
+                window_dim.padding_low() +
+                rhs_spatial_index[ki] * window_dim.window_dilation();
+            // Skip if the lhs (input) index is to be dilated.  As an
+            // optimization, skip this mod if there's no dilation.
+            if (window_dim.base_dilation() > 1 &&
+                undilated_index % window_dim.base_dilation() != 0) {
+              goto cnt;
+            }
+
+            // Calculate the actual lhs (input) index after dilation.  As an
+            // optimization, skip this integer divide if there's no dilation.
+            int64 lhs_spatial_index;
+            if (window_dim.base_dilation() > 1) {
+              lhs_spatial_index = undilated_index / window_dim.base_dilation();
+            } else {
+              lhs_spatial_index = undilated_index;
+            }
+            lhs_linear_index +=
+                lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim];
+
+            // Skip if input index is not in bounds.
+            if (!(lhs_spatial_index >= 0 &&
+                  lhs_spatial_index <
+                      lhs_shape.dimensions(input_spatial_dim))) {
+              goto cnt;
+            }
+
+            rhs_linear_index +=
+                (window_dim.window_reversal()
+                     ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
+                     : rhs_spatial_index[ki]) *
+                rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)];
+          }
+
+          result_val +=
+              static_cast<ElementwiseT>(lhs_literal_data[lhs_linear_index]) *
+              static_cast<ElementwiseT>(rhs_literal_data[rhs_linear_index]);
+        }
+      cnt : {}
+      } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index));
+
+      return static_cast<ReturnT>(result_val);
+    };
+
+    auto result = MakeUnique<Literal>(result_shape);
+    TF_RETURN_IF_ERROR(result->PopulateParallel<ReturnT>(func));
+
+    parent_->evaluated_[conv] = std::move(result);
+    return Status::OK();
+  }
+
+  Status HandleDot(HloInstruction* dot) override {
+    auto lhs = dot->operand(0);
+    auto rhs = dot->operand(1);
+    CHECK(ShapeUtil::IsArray(dot->shape()));
+    CHECK(ShapeUtil::IsArray(lhs->shape()));
+    CHECK(ShapeUtil::IsArray(rhs->shape()));
+
+    const auto& dnums = dot->dot_dimension_numbers();
+
+    const auto lhs_rank = ShapeUtil::Rank(lhs->shape());
+    const auto rhs_rank = ShapeUtil::Rank(rhs->shape());
+
+    CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape()));
+    CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape()));
+
+    // There must be 1 and only 1 Contracting dimension for lhs and rhs.
+    CHECK_EQ(dnums.lhs_contracting_dimensions_size(), 1);
+    CHECK_EQ(dnums.rhs_contracting_dimensions_size(), 1);
+    const int64 lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0);
+    const int64 rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0);
+    // Contracted dimension sizes must be the same.
+    CHECK_EQ(lhs->shape().dimensions(lhs_contracting_dimension),
+             rhs->shape().dimensions(rhs_contracting_dimension))
+        << "lhs contracted dimension: "
+        << lhs->shape().dimensions(lhs_contracting_dimension)
+        << " rhs contracted dimension: "
+        << rhs->shape().dimensions(rhs_contracting_dimension);
+    const int64 contracted_dimension_size =
+        lhs->shape().dimensions(lhs_contracting_dimension);
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+
+    CHECK_EQ(dnums.lhs_batch_dimensions_size(),
+             dnums.rhs_batch_dimensions_size());
+
+    std::vector<int64> lhs_non_contracting_dims;
+    for (int64 i = 0; i < lhs_rank; i++) {
+      if (i != lhs_contracting_dimension) {
+        lhs_non_contracting_dims.push_back(i);
+      }
+    }
+
+    std::vector<int64> rhs_non_batch_non_contracting_dims;
+    tensorflow::gtl::FlatSet<int64> batch_dims_set(
+        dnums.rhs_batch_dimensions().begin(),
+        dnums.rhs_batch_dimensions().end());
+    for (int64 i = 0; i < rhs_rank; i++) {
+      if (i != rhs_contracting_dimension && batch_dims_set.count(i) == 0) {
+        rhs_non_batch_non_contracting_dims.push_back(i);
+      }
+    }
+
+    const int64 batch_dim_size = dnums.lhs_batch_dimensions_size();
+    const int64 lhs_non_contracting_size = lhs_non_contracting_dims.size();
+
+    DimensionVector lhs_index(lhs_rank);
+    DimensionVector rhs_index(rhs_rank);
+    auto result = MakeUnique<Literal>(dot->shape());
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> result_index) {
+          ElementwiseT result_val = static_cast<ElementwiseT>(0);
+
+          // Find the corresponding non-contracting indices for lhs and rhs.
+          //
+          // For `result_index`, its batch dimension, if exists, will be at the
+          // same dimension as the batch dimension of lhs and rhs. More
+          // specifically:
+          // - For lhs, the non-contracting dimensions, including the batch
+          // dimension have the same index as the `result_index`.
+          // - For rhs, the batch dimension is set seperately from other
+          // non-contracting dimensions, since these other non-contracting
+          // dimensions in rhs follow the non-contracting dimensions of lhs in
+          // the resulting index.
+          //
+          // As an example, for a resulting index:
+          //  result_index [result_batch, result_x, result_y]
+          // the effecting lhs and rhs indices are:
+          //  lhs [result_batch, lhs_non_contracting_dim, contracting_dim
+          //  rhs [result_batch, contracting_dim, rhs_non_contracting_dim]
+          // `result_x` is only affected by the lhs_non_contracting_dim and
+          // likewise `result_y` only depends on rhs_non_contracting_dim.
+          //
+          // so we can look up the lhs and rhs indices by:
+          //
+          // lhs:
+          //  batch index is the same as `result_batch`.
+          //    non-contracting dimension is the same as
+          //    result_index[lhs_non_contracting_dim]
+          // rhs:
+          //  batch index: the same as `result_batch`.
+          //  non-contracting dimension index: *not* the same as
+          //    result_index[rhs_non_contractng_dim], since the
+          //    non-contracting dimensions of lhs are included in the
+          //    result_index first. Instead, the non_contracting_dim of rhs must
+          //    be calculated as following:
+          //      lhs_non_contracting_dimensions_size +
+          //      (rhs_non_batch_non_contracting_dim - batch_dim_size) - 1
+          //
+          //    Note that (rhs_non_batch_contracting_dim - batch_dim_size) is
+          //    the index offset to the result_index that only depends on
+          //    the non_batch and non-contracting dimensions of rhs. -1 at the
+          //    end translates size to index.
+          for (auto i : lhs_non_contracting_dims) {
+            lhs_index[i] = result_index[i];
+          }
+          for (auto i : dnums.rhs_batch_dimensions()) {
+            rhs_index[i] = result_index[i];
+          }
+          for (auto i : rhs_non_batch_non_contracting_dims) {
+            const int64 rhs_non_batch_non_contracting_dim =
+                lhs_non_contracting_size + (i - batch_dim_size) - 1;
+            rhs_index[i] = result_index[rhs_non_batch_non_contracting_dim];
+          }
+
+          // Accumulates resulting product along the contracted dimension.
+          for (int64 i = 0; i < contracted_dimension_size; ++i) {
+            lhs_index[lhs_contracting_dimension] = i;
+            rhs_index[rhs_contracting_dimension] = i;
+
+            result_val +=
+                static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
+                static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
+          }
+
+          return static_cast<ReturnT>(result_val);
+        }));
+
+    parent_->evaluated_[dot] = std::move(result);
+    return Status::OK();
+  }
+
+  Status HandlePad(HloInstruction* pad) override {
+    CHECK(!ShapeUtil::IsTuple(pad->operand(0)->shape()));
+    // Padding value must be scalar.
+    CHECK(ShapeUtil::IsScalar(pad->operand(1)->shape()));
+    CHECK_EQ(ShapeUtil::Rank(pad->operand(0)->shape()),
+             pad->padding_config().dimensions_size());
+
+    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
+                        ShapeInference::InferPadShape(
+                            /*operand_shape=*/pad->operand(0)->shape(),
+                            /*padding_value_shape=*/pad->operand(1)->shape(),
+                            /*padding_config=*/pad->padding_config()));
+    CHECK(ShapeUtil::Compatible(pad->shape(), inferred_return_shape))
+        << "return shape is set to: " << ShapeUtil::HumanString(pad->shape())
+        << "but is inferred to be: "
+        << ShapeUtil::HumanString(inferred_return_shape);
+
+    // Create new HLO of padded shape with padding value.
+    ReturnT scalar =
+        parent_->GetEvaluatedLiteralFor(pad->operand(1)).Get<ReturnT>({});
+    auto result = MakeUnique<Literal>(pad->shape());
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&scalar](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          return scalar;
+        }));
+
+    const Literal& evaluated_operand =
+        parent_->GetEvaluatedLiteralFor(pad->operand(0));
+
+    std::vector<int64> input_index(ShapeUtil::Rank(evaluated_operand.shape()),
+                                   0);
+    std::vector<int64> target_index(ShapeUtil::Rank(result->shape()), 0);
+
+    // Loop through each element of the operand, assign them to the
+    // corresponding index of the resulting padded literal.
+    const PaddingConfig& pad_config = pad->padding_config();
+
+    auto func = [&](tensorflow::gtl::ArraySlice<int64> input_index) {
+      for (auto i = 0; i < input_index.size(); ++i) {
+        // Interior padding occurs logically before edge padding, so in the case
+        // of negative edge padding elements are removed from the
+        // interior-padded operand.
+        target_index[i] =
+            pad_config.dimensions(i).edge_padding_low() +
+            input_index[i] * (pad_config.dimensions(i).interior_padding() + 1);
+
+        // Account for negative low and high padding: skip assignment if the
+        // any target index is out of range.
+        if (!(target_index[i] >= 0 &&
+              target_index[i] < pad->shape().dimensions(i))) {
+          return true;
+        }
+      }
+      result->Set<ReturnT>(target_index,
+                           evaluated_operand.Get<ReturnT>(input_index));
+      return true;
+    };
+
+    std::vector<int64> zero_base(evaluated_operand.shape().dimensions_size(),
+                                 0);
+    std::vector<int64> step(evaluated_operand.shape().dimensions_size(), 1);
+
+    ShapeUtil::ForEachIndex(
+        evaluated_operand.shape(), zero_base,
+        AsInt64Slice(evaluated_operand.shape().dimensions()), step, func);
+
+    parent_->evaluated_[pad] = std::move(result);
+    return Status::OK();
+  }
+
+  Status HandleDynamicSlice(HloInstruction* dynamic_slice) override {
+    auto operand = dynamic_slice->operand(0);
+    auto start_indices = dynamic_slice->operand(1);
+    auto result_shape = dynamic_slice->shape();
+    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
+                        ShapeInference::InferDynamicSliceShape(
+                            operand->shape(), start_indices->shape(),
+                            dynamic_slice->dynamic_slice_sizes()));
+    TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
+        << "return shape is set to: " << ShapeUtil::HumanString(result_shape)
+        << "but is inferred to be: "
+        << ShapeUtil::HumanString(inferred_return_shape);
+    TF_RET_CHECK(
+        primitive_util::IsIntegralType(start_indices->shape().element_type()));
+
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+    const Literal& start_indices_literal =
+        parent_->GetEvaluatedLiteralFor(start_indices);
+
+    switch (start_indices->shape().element_type()) {
+      case S32: {
+        TF_ASSIGN_OR_RETURN(
+            parent_->evaluated_[dynamic_slice],
+            DynamicSlice<int32>(operand_literal, start_indices_literal,
+                                result_shape));
+      } break;
+      case S64: {
+        TF_ASSIGN_OR_RETURN(
+            parent_->evaluated_[dynamic_slice],
+            DynamicSlice<int64>(operand_literal, start_indices_literal,
+                                result_shape));
+      } break;
+      case U32: {
+        TF_ASSIGN_OR_RETURN(
+            parent_->evaluated_[dynamic_slice],
+            DynamicSlice<uint32>(operand_literal, start_indices_literal,
+                                 result_shape));
+      } break;
+      case U64: {
+        TF_ASSIGN_OR_RETURN(
+            parent_->evaluated_[dynamic_slice],
+            DynamicSlice<uint64>(operand_literal, start_indices_literal,
+                                 result_shape));
+      } break;
+      default:
+        LOG(FATAL) << "HandleDynamicSlice: unhandled primitive type for "
+                      "start_indices: "
+                   << PrimitiveType_Name(start_indices->shape().element_type());
+    }
+
+    return Status::OK();
+  }
+
+  Status HandleDynamicUpdateSlice(
+      HloInstruction* dynamic_update_slice) override {
+    auto operand = dynamic_update_slice->operand(0);
+    auto update = dynamic_update_slice->operand(1);
+    auto start_indices = dynamic_update_slice->operand(2);
+    auto result_shape = dynamic_update_slice->shape();
+    TF_ASSIGN_OR_RETURN(
+        auto inferred_return_shape,
+        ShapeInference::InferDynamicUpdateSliceShape(
+            operand->shape(), update->shape(), start_indices->shape()));
+    TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
+        << "return shape is set to: " << ShapeUtil::HumanString(result_shape)
+        << "but is inferred to be: "
+        << ShapeUtil::HumanString(inferred_return_shape);
+    TF_RET_CHECK(
+        primitive_util::IsIntegralType(start_indices->shape().element_type()));
+    TF_RET_CHECK(ShapeUtil::Compatible(result_shape, operand->shape()));
+
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+    const Literal& update_literal = parent_->GetEvaluatedLiteralFor(update);
+    const Literal& start_indices_literal =
+        parent_->GetEvaluatedLiteralFor(start_indices);
+
+    switch (start_indices->shape().element_type()) {
+      case S32: {
+        TF_ASSIGN_OR_RETURN(
+            parent_->evaluated_[dynamic_update_slice],
+            DynamicUpdateSlice<int32>(operand_literal, update_literal,
+                                      start_indices_literal));
+      } break;
+      case S64: {
+        TF_ASSIGN_OR_RETURN(
+            parent_->evaluated_[dynamic_update_slice],
+            DynamicUpdateSlice<int64>(operand_literal, update_literal,
+                                      start_indices_literal));
+      } break;
+      case U32: {
+        TF_ASSIGN_OR_RETURN(
+            parent_->evaluated_[dynamic_update_slice],
+            DynamicUpdateSlice<uint32>(operand_literal, update_literal,
+                                       start_indices_literal));
+      } break;
+      case U64: {
+        TF_ASSIGN_OR_RETURN(
+            parent_->evaluated_[dynamic_update_slice],
+            DynamicUpdateSlice<uint64>(operand_literal, update_literal,
+                                       start_indices_literal));
+      } break;
+      default:
+        LOG(FATAL) << "HandleDynamicUpdateSlice: unhandled primitive type for "
+                      "start_indices: "
+                   << PrimitiveType_Name(start_indices->shape().element_type());
+    }
+
+    return Status::OK();
+  }
+
+  template <typename NativeT>
+  StatusOr<std::unique_ptr<Literal>> MapImpl(HloInstruction* map) {
+    auto operands = map->operands();
+    HloComputation* computation = map->to_apply();
+
+    auto result = MakeUnique<Literal>(map->shape());
+
+    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          std::vector<std::unique_ptr<Literal>> arg_literals;
+          arg_literals.reserve(operands.size());
+
+          // Construct scalar literal parameters to be passed to the map
+          // computation.
+          for (auto operand : operands) {
+            const Literal& arg_literal =
+                parent_->GetEvaluatedLiteralFor(operand);
+
+            auto curr_val = arg_literal.Get<NativeT>(multi_index);
+            auto curr_val_literal = Literal::CreateR0<NativeT>(curr_val);
+
+            arg_literals.push_back(std::move(curr_val_literal));
+          }
+
+          std::unique_ptr<Literal> computed_result =
+              embedded_evaluator
+                  .Evaluate<std::unique_ptr<Literal>>(*computation,
+                                                      arg_literals)
+                  .ConsumeValueOrDie();
+          // Clear visit states so that the we can use the evaluate again on
+          // the same computation.
+          embedded_evaluator.ResetVisitStates();
+
+          return computed_result->Get<ReturnT>({});
+        }));
+    return std::move(result);
+  }
+
+  Status HandleMap(HloInstruction* map) override {
+    switch (map->operand(0)->shape().element_type()) {
+      case PRED: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<bool>(map));
+        break;
+      }
+      case U8: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint8>(map));
+        break;
+      }
+      case U32: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint32>(map));
+        break;
+      }
+      case U64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint64>(map));
+        break;
+      }
+      case S8: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int8>(map));
+        break;
+      }
+      case S32: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int32>(map));
+        break;
+      }
+      case S64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int64>(map));
+        break;
+      }
+      case F16: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map],
+                            MapImpl<Eigen::half>(map));
+        break;
+      }
+      case F32: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<float>(map));
+        break;
+      }
+      case F64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<double>(map));
+        break;
+      }
+      case C64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<complex64>(map));
+        break;
+      }
+      default:
+        LOG(FATAL) << "HandleMap: unhandled primitive type for "
+                      "input operand: "
+                   << PrimitiveType_Name(
+                          map->operand(0)->shape().element_type());
+    }
+
+    return Status::OK();
+  }
+
+  Status HandleReduce(HloInstruction* reduce) override {
+    auto arg = reduce->operand(0);
+    auto init_value = reduce->operand(1);
+    tensorflow::gtl::ArraySlice<int64> dimensions(reduce->dimensions());
+    HloComputation* function = reduce->to_apply();
+    TF_RET_CHECK(ShapeUtil::Rank(reduce->shape()) ==
+                 ShapeUtil::Rank(arg->shape()) - dimensions.size());
+    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
+                        ShapeInference::InferReduceShape(
+                            /*arg=*/arg->shape(),
+                            /*init_value=*/init_value->shape(),
+                            /*dimensions_to_reduce=*/dimensions,
+                            /*to_apply=*/function->ComputeProgramShape()));
+    TF_RET_CHECK(ShapeUtil::Compatible(reduce->shape(), inferred_return_shape))
+        << "return shape is set to: " << ShapeUtil::HumanString(reduce->shape())
+        << "but is inferred to be: "
+        << ShapeUtil::HumanString(inferred_return_shape);
+
+    const Literal& arg_literal = parent_->GetEvaluatedLiteralFor(arg);
+    VLOG(3) << "HandleReduce arg_literal: " << arg_literal.ToString();
+    const Literal& init_literal = parent_->GetEvaluatedLiteralFor(init_value);
+    VLOG(3) << "HandleReduce init_literal: " << init_literal.ToString();
+    TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape()));
+    auto init_scalar = init_literal.Get<ReturnT>({});
+
+    const auto arg_dimensions = AsInt64Slice(arg_literal.shape().dimensions());
+    std::vector<int64> arg_dim_steps(arg_dimensions.size());
+    std::vector<int64> arg_dim_counts(arg_dimensions.size());
+    for (const int64 dim : dimensions) {
+      arg_dim_steps[dim] = 1;
+      arg_dim_counts[dim] = arg_dimensions[dim];
+    }
+
+    // Map each dimension in the result to a dimension in arg that isn't
+    // being reduced.
+    std::vector<int64> result_to_arg_index;
+    for (int64 i = 0; i < arg_dimensions.size(); ++i) {
+      if (arg_dim_steps[i] == 0) {
+        result_to_arg_index.push_back(i);
+      }
+    }
+
+    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
+    auto result = MakeUnique<Literal>(reduce->shape());
+    // For each resulting dimension, calculate and assign computed value.
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          ReturnT result_val = init_scalar;
+
+          std::vector<int64> base(arg_dimensions.size());
+          for (int64 i = 0; i < multi_index.size(); ++i) {
+            base[result_to_arg_index[i]] = multi_index[i];
+          }
+
+          // When the reduction is addition of floats, accumulate in a double
+          // for better precision. Also, avoid creating Literals for the
+          // intermediate results; it's much faster.
+          if (ShapeUtil::ElementIsFloating(init_literal.shape()) &&
+              IsScalarAdd(function)) {
+            double computed_result = 0;
+            auto func = [&](tensorflow::gtl::ArraySlice<int64> input_index) {
+              computed_result += arg_literal.Get<float>(input_index);
+              return true;
+            };
+            ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
+                                    arg_dim_steps, func);
+            return static_cast<ReturnT>(computed_result);
+          }
+          auto func = [&](tensorflow::gtl::ArraySlice<int64> input_index) {
+            auto curr_val = arg_literal.Get<ReturnT>(input_index);
+
+            // Evaluate computation with specified literal operands.
+            auto curr_val_literal = Literal::CreateR0<ReturnT>(curr_val);
+            auto result_val_literal = Literal::CreateR0<ReturnT>(result_val);
+            std::vector<const Literal*> args = {result_val_literal.get(),
+                                                curr_val_literal.get()};
+
+            std::unique_ptr<Literal> computed_result =
+                embedded_evaluator.Evaluate<const Literal*>(*function, args)
+                    .ConsumeValueOrDie();
+            // Clear visit states so that we can use the evaluator again on
+            // the same computation.
+            embedded_evaluator.ResetVisitStates();
+            // Assign computed result to result_val.
+            result_val = computed_result->Get<ReturnT>({});
+            return true;
+          };
+          // Computes one element of the result, reducing all dimensions that
+          // contribute to that element.
+          ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
+                                  arg_dim_steps, func);
+          return result_val;
+        }));
+
+    parent_->evaluated_[reduce] = std::move(result);
+    return Status::OK();
+  }
+
+  bool IsScalarAdd(HloComputation* computation) {
+    HloInstruction* instruction = computation->root_instruction();
+    if (instruction->opcode() == HloOpcode::kAdd &&
+        computation->num_parameters() == 2) {
+      const HloInstruction* lhs = instruction->operand(0);
+      const HloInstruction* rhs = instruction->operand(1);
+      return lhs->opcode() == HloOpcode::kParameter &&
+             ShapeUtil::IsScalar(lhs->shape()) &&
+             rhs->opcode() == HloOpcode::kParameter &&
+             ShapeUtil::IsScalar(rhs->shape()) && lhs != rhs;
+    }
+    return false;
+  }
+
+  Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override {
+    auto operand = select_and_scatter->operand(0);
+    auto source = select_and_scatter->operand(1);
+    const Window& window = select_and_scatter->window();
+
+    const Literal& init_literal =
+        parent_->GetEvaluatedLiteralFor(select_and_scatter->operand(2));
+    TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape()));
+    auto init_scalar = init_literal.Get<ReturnT>({});
+
+    auto result = MakeUnique<Literal>(select_and_scatter->shape());
+
+    // Initialize result array with the init value.
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> output_index) {
+          return init_scalar;
+        }));
+
+    std::vector<int64> window_dimension_sizes;
+    for (const auto& window_dimension : window.dimensions()) {
+      window_dimension_sizes.push_back(window_dimension.size());
+    }
+    const Shape window_shape = ShapeUtil::MakeShape(
+        operand->shape().element_type(), window_dimension_sizes);
+
+    HloComputation* select = select_and_scatter->select();
+    HloComputation* scatter = select_and_scatter->scatter();
+
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+    const Literal& source_literal = parent_->GetEvaluatedLiteralFor(source);
+
+    int64 rank = ShapeUtil::Rank(operand_literal.shape());
+
+    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
+    DimensionVector source_index(rank, 0);
+
+    // Used in the dual IterateThroughWindow lambdas below. Hoisted to avoid
+    // dynamic memory allocations.
+    auto curr_val_literal = Literal::CreateR0<ReturnT>(ReturnT());
+    auto selected_val_literal = Literal::CreateR0<ReturnT>(ReturnT());
+    auto source_literal_scatter = Literal::CreateR0<ReturnT>(ReturnT());
+    auto scattered_literal = Literal::CreateR0<ReturnT>(ReturnT());
+    do {
+      // For each element in `source`, we place a window in `operand`. For each
+      // window placement, we iterate inside the window twice:
+      //
+      // 1. Find the selected index by applying `select` function to all
+      // elements. E.g., If the `select` function is GreaterEqual, the first
+      // iteration through the window finds the biggest value and returns its
+      // index.
+      //
+      // 2. Using the selected index, scatter value from `source` to result. We
+      // do this by iterating through the window, and compare each index with
+      // the selected index.
+      tensorflow::gtl::optional<ReturnT> selected_val;
+      tensorflow::gtl::optional<std::vector<int64>> selected_index;
+
+      IterateThroughWindow(
+          window_shape, window, operand_literal.shape(), source_index,
+          [&](const std::vector<int64>& operand_index) {
+            auto curr_val = operand_literal.Get<ReturnT>(operand_index);
+            if (!selected_val) {
+              selected_val = curr_val;
+              selected_index = operand_index;
+            }
+            curr_val_literal->Set({}, curr_val);
+            selected_val_literal->Set({}, *selected_val);
+            std::unique_ptr<Literal> computed_result =
+                embedded_evaluator
+                    .Evaluate<const Literal*>(
+                        *select,
+                        {selected_val_literal.get(), curr_val_literal.get()})
+                    .ConsumeValueOrDie();
+            bool selected = !computed_result->Get<bool>({});
+            if (selected) {
+              selected_val = curr_val;
+              selected_index = operand_index;
+            }
+            embedded_evaluator.ResetVisitStates();
+          });
+
+      IterateThroughWindow(
+          window_shape, window, operand_literal.shape(), source_index,
+          [&](const std::vector<int64>& operand_index) {
+            if (std::equal(operand_index.begin(), operand_index.end(),
+                           selected_index->begin())) {
+              auto source = source_literal.Get<ReturnT>(source_index);
+              auto scattered = result->Get<ReturnT>(operand_index);
+              source_literal_scatter->Set({}, source);
+              scattered_literal->Set({}, scattered);
+              std::unique_ptr<Literal> computed_result =
+                  embedded_evaluator
+                      .Evaluate<const Literal*>(*scatter,
+                                                {source_literal_scatter.get(),
+                                                 scattered_literal.get()})
+                      .ConsumeValueOrDie();
+              result->Set(operand_index, computed_result->Get<ReturnT>({}));
+              // Clear visit states so that the we can use the evaluator again
+              // on the same computation.
+              embedded_evaluator.ResetVisitStates();
+            }
+          });
+    } while (IndexUtil::BumpIndices(source->shape(), &source_index));
+
+    parent_->evaluated_[select_and_scatter] = std::move(result);
+    return Status::OK();
+  }
+
+  Status HandleReduceWindow(HloInstruction* reduce_window) override {
+    auto operand = reduce_window->operand(0);
+    const Window& window = reduce_window->window();
+    HloComputation* function = reduce_window->to_apply();
+    TF_ASSIGN_OR_RETURN(
+        auto inferred_return_shape,
+        ShapeInference::InferReduceWindowShape(
+            /*operand_shape=*/reduce_window->operand(0)->shape(),
+            /*init_value=*/reduce_window->operand(1)->shape(), window,
+            /*to_apply_shape=*/function->ComputeProgramShape()));
+    TF_RET_CHECK(
+        ShapeUtil::Compatible(reduce_window->shape(), inferred_return_shape))
+        << "return shape is set to: "
+        << ShapeUtil::HumanStringWithLayout(reduce_window->shape())
+        << "but is inferred to be: "
+        << ShapeUtil::HumanStringWithLayout(inferred_return_shape);
+
+    const Literal& operand_literal =
+        parent_->GetEvaluatedLiteralFor(reduce_window->operand(0));
+    VLOG(3) << "HandleReduceWindow arg_literal: " << operand_literal.ToString();
+    const Literal& init_literal =
+        parent_->GetEvaluatedLiteralFor(reduce_window->operand(1));
+    VLOG(3) << "HandleReduceWindow init_literal: " << init_literal.ToString();
+    TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape()));
+    auto init_scalar = init_literal.Get<ReturnT>({});
+
+    // Creates a Shape object from window, for iteration below.
+    std::vector<int64> window_dimension_sizes;
+    for (const auto& window_dimension : window.dimensions()) {
+      window_dimension_sizes.push_back(window_dimension.size());
+    }
+    const Shape window_shape = ShapeUtil::MakeShape(
+        operand->shape().element_type(), window_dimension_sizes);
+
+    DimensionVector window_index(window.dimensions_size());
+    DimensionVector operand_index(ShapeUtil::Rank(operand_literal.shape()));
+
+    HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
+    auto result = MakeUnique<Literal>(reduce_window->shape());
+    // For each resulting dimension, calculate and assign computed value.
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> output_index) {
+          ReturnT result_val = init_scalar;
+
+          std::fill(window_index.begin(), window_index.end(), 0);
+          std::fill(operand_index.begin(), operand_index.end(), 0);
+
+          IterateThroughWindow(
+              window_shape, window, operand_literal.shape(), output_index,
+              [&](const std::vector<int64>& operand_index) {
+                auto curr_val = operand_literal.Get<ReturnT>(operand_index);
+
+                // Evaluate computation with specified literal operands.
+                const auto curr_val_literal =
+                    Literal::CreateR0<ReturnT>(curr_val);
+                const auto result_val_literal =
+                    Literal::CreateR0<ReturnT>(result_val);
+                const std::vector<const Literal*> args = {
+                    result_val_literal.get(), curr_val_literal.get()};
+                std::unique_ptr<Literal> computed_result =
+                    embedded_evaluator.Evaluate<const Literal*>(*function, args)
+                        .ConsumeValueOrDie();
+
+                // Clear visit states so that the we can use the evaluate again
+                // on the same computation.
+                embedded_evaluator.ResetVisitStates();
+
+                result_val = computed_result->Get<ReturnT>({});
+              });
+
+          return result_val;
+        }));
+
+    parent_->evaluated_[reduce_window] = std::move(result);
+    return Status::OK();
+  }
+
+  Status HandleSlice(HloInstruction* slice) override {
+    auto operand = slice->operand(0);
+    const Shape& shape = slice->shape();
+    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
+                        ShapeInference::InferSliceShape(
+                            operand->shape(), slice->slice_starts(),
+                            slice->slice_limits(), slice->slice_strides()));
+    TF_RET_CHECK(ShapeUtil::Compatible(shape, inferred_return_shape))
+        << "return shape set to: " << ShapeUtil::HumanString(shape)
+        << " but is inferred to be: "
+        << ShapeUtil::HumanString(inferred_return_shape);
+
+    const int64 rank = ShapeUtil::Rank(operand->shape());
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+    auto func = [&](tensorflow::gtl::ArraySlice<int64> out_index) {
+      DimensionVector operand_index(rank);
+      for (int64 i = 0; i < rank; ++i) {
+        operand_index[i] =
+            slice->slice_starts(i) + out_index[i] * slice->slice_strides(i);
+      }
+      return operand_literal.Get<ReturnT>(operand_index);
+    };
+
+    auto result = Literal::CreateFromDimensions(
+        shape.element_type(), AsInt64Slice(shape.dimensions()));
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(func));
+    parent_->evaluated_[slice] = std::move(result);
+    return Status::OK();
+  }
+
+  // Enable CLZ only for int32, uint32, int64 and uint64.
+  template <
+      typename NativeT,
+      typename std::enable_if<
+          (std::is_floating_point<NativeT>::value ||
+           std::is_integral<NativeT>::value || is_complex_t<NativeT>::value) &&
+          !(std::is_same<NativeT, uint32>::value ||
+            std::is_same<NativeT, int32>::value ||
+            std::is_same<NativeT, int64>::value ||
+            std::is_same<NativeT, uint64>::value)>::type* = nullptr>
+  Status HandleClz(HloInstruction* clz) {
+    return InvalidArgument("Unsupported type for Clz");
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_same<NativeT, uint32>::value ||
+                std::is_same<NativeT, int32>::value>::type* = nullptr>
+  Status HandleClz(HloInstruction* clz) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz],
+                        ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) {
+                          return 31 - tensorflow::Log2Floor(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_same<NativeT, uint64>::value ||
+                std::is_same<NativeT, int64>::value>::type* = nullptr>
+  Status HandleClz(HloInstruction* clz) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz],
+                        ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) {
+                          return 63 - tensorflow::Log2Floor64(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  Status HandleClz(HloInstruction* clz) override {
+    return HandleClz<ElementwiseT>(clz);
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleSin(HloInstruction* sin) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sin],
+                        ElementWiseUnaryOp(sin, [](ElementwiseT elem_operand) {
+                          return std::sin(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_integral<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleSin(HloInstruction* sin) {
+    return InvalidArgument("Unsupported type for Sin");
+  }
+
+  Status HandleSin(HloInstruction* sin) override {
+    return HandleSin<ElementwiseT>(sin);
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleCos(HloInstruction* cos) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[cos],
+                        ElementWiseUnaryOp(cos, [](ElementwiseT elem_operand) {
+                          return std::cos(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_integral<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleCos(HloInstruction* cos) {
+    return InvalidArgument("Unsupported type for Cos");
+  }
+
+  Status HandleCos(HloInstruction* cos) override {
+    return HandleCos<ElementwiseT>(cos);
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_same<
+                                  float, NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[reduce_precision],
+        ElementWiseUnaryOp(reduce_precision, [reduce_precision](
+                                                 ElementwiseT elem) {
+          uint32_t value_as_int = tensorflow::bit_cast<uint32_t>(elem);
+          const uint32_t mantissa_bits = reduce_precision->mantissa_bits();
+          const uint32_t exponent_bits = reduce_precision->exponent_bits();
+
+          // Code is based on the CPU/GPU implementation in LLVM-emitting code.
+          //
+          // Bits in float type:
+          //   mantissa : bits [0:22]
+          //   exponent : bits [23:30]
+          //   sign     : bits [31]
+          if (mantissa_bits < 23) {
+            const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits);
+
+            // Compute rounding bias for round-to-nearest with ties to even.
+            // This is equal to a base value of 0111... plus one bit if the last
+            // remaining mantissa bit is 1.
+            const uint32_t base_rounding_bias =
+                (last_mantissa_bit_mask >> 1) - 1;
+            const uint32_t x_last_mantissa_bit =
+                (value_as_int & last_mantissa_bit_mask) >> (23 - mantissa_bits);
+            const uint32_t x_rounding_bias =
+                x_last_mantissa_bit + base_rounding_bias;
+
+            // Add rounding bias, and mask out truncated bits.  Note that the
+            // case where adding the rounding bias overflows into the exponent
+            // bits is correct; the non-masked mantissa bits will all be zero,
+            // and the exponent will be incremented by one.
+            const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
+            value_as_int = value_as_int + x_rounding_bias;
+            value_as_int = value_as_int & truncation_mask;
+          }
+          if (exponent_bits < 8) {
+            // Masks for f32 values.
+            const uint32_t f32_sign_bit_mask = 1u << 31;
+            const uint32_t f32_exp_bits_mask = 0xffu << 23;
+
+            // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the
+            // most- significant bit -- is equal to 1.0f for all exponent sizes.
+            // Adding 2^(n-1)-1 to this gives us the highest non-infinite
+            // exponent for a bit- size of n, and subtracting 2^(n-1)-1 from
+            // this gives us the lowest' exponent (corresponding to 0.0f).
+            //
+            // Thus, the f32 exponent corresponding to the highest non-infinite
+            // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
+            // exponent corresponding to the lowest exponent for a bit size of n
+            // is (2^7-1) - 2^(n-1)-1.
+            //
+            // Note that we have already checked that exponents_bits >= 1.
+            const uint32_t f32_exponent_bias = (1 << 7) - 1;
+            const uint32_t reduced_exponent_bias =
+                (1 << (exponent_bits - 1)) - 1;
+            const uint32_t reduced_max_exponent =
+                f32_exponent_bias + reduced_exponent_bias;
+            const uint32_t reduced_min_exponent =
+                f32_exponent_bias - reduced_exponent_bias;
+
+            // Do we overflow or underflow?
+            const uint32_t x_exponent = value_as_int & f32_exp_bits_mask;
+            const bool x_overflows = x_exponent > (reduced_max_exponent << 23);
+            const bool x_underflows =
+                x_exponent <= (reduced_min_exponent << 23);
+
+            // Compute appropriately-signed values of zero and infinity.
+            const uint32_t x_signed_zero = value_as_int & f32_sign_bit_mask;
+            const uint32_t x_signed_inf = x_signed_zero | f32_exp_bits_mask;
+
+            // Force to zero or infinity if overflow or underflow.  (Note that
+            // this truncates all denormal values to zero, rather than rounding
+            // them.)
+            value_as_int = x_overflows ? x_signed_inf : value_as_int;
+            value_as_int = x_underflows ? x_signed_zero : value_as_int;
+          }
+
+          float reduced_result = tensorflow::bit_cast<float>(value_as_int);
+          if (std::isnan(elem)) {
+            reduced_result = mantissa_bits > 0
+                                 ? elem
+                                 : std::numeric_limits<float>::infinity();
+          }
+          return reduced_result;
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_same<
+                                  double, NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    return InvalidArgument("Double not supported for reduce precision");
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_integral<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    return InvalidArgument("Unsupported type for reduce precision");
+  }
+
+  Status HandleReducePrecision(HloInstruction* reduce_precision) override {
+    return HandleReducePrecision<ElementwiseT>(reduce_precision);
+  }
+
+ private:
+  // Creates a vector of multipliers which can be used to create a linear index
+  // into shape.
+  //
+  // Given the multidimensional index {i1, ..., iN} and
+  // M = MakeDimMultipliers(shape), the corresponding linear index LI is simply
+  //
+  //   LI = i1 * M[1] + i2 * M[2] + ... + iN * M[N].
+  //
+  // This lets you calculate LI given the multidimensional indices in any order.
+  static DimensionVector MakeDimMultipliers(const Shape& shape) {
+    DimensionVector v(ShapeUtil::Rank(shape));
+    int64 scale = 1;
+    for (auto dim : LayoutUtil::MinorToMajor(shape)) {
+      v[dim] = scale;
+      scale *= shape.dimensions(dim);
+    }
+    return v;
+  }
+
+  // For one particular placement of a window in a base shape (the placement is
+  // represented as `window_count_index`), iterates inside the window.
+  // Translates the window index into base index. If the base index is within
+  // bound, call `f` with the base index.
+  static void IterateThroughWindow(
+      const Shape& window_shape, const Window& window, const Shape& base_shape,
+      const tensorflow::gtl::ArraySlice<int64>& window_count_index,
+      const std::function<void(const std::vector<int64>&)>& f) {
+    const int64 rank = ShapeUtil::Rank(base_shape);
+    DimensionVector window_index(rank);
+    std::fill(window_index.begin(), window_index.end(), 0);
+    do {
+      std::vector<int64> base_index(rank);
+      bool out_of_bound = false;
+      for (int64 i = 0; i < rank; ++i) {
+        base_index[i] = window_count_index[i] * window.dimensions(i).stride() +
+                        window_index[i] - window.dimensions(i).padding_low();
+        if (base_index[i] < 0 || base_index[i] >= base_shape.dimensions(i)) {
+          out_of_bound = true;
+          break;
+        }
+      }
+      if (!out_of_bound) {
+        f(base_index);
+      }
+    } while (IndexUtil::BumpIndices(window_shape, &window_index));
+  }
+
+  template <typename IndexT>
+  StatusOr<std::unique_ptr<Literal>> DynamicSlice(
+      const Literal& operand_literal, const Literal& start_indices_literal,
+      const Shape& result_shape) {
+    auto start_indices_typed = start_indices_literal.data<IndexT>();
+    std::vector<int64> start(start_indices_typed.begin(),
+                             start_indices_typed.end());
+
+    // Clamp the start indices so the slice is in-bounds w.r.t the operand.
+
+    // TODO(b/74360564): This is implementation defined behavior, but is
+    // currently respected by all implementations. Change this if we ever decide
+    // to oficially document different behavior.
+    for (int64 i = 0; i < start.size(); ++i) {
+      start[i] = std::min<int64>(
+          std::max(0LL, start[i]),
+          operand_literal.shape().dimensions(i) - result_shape.dimensions(i));
+    }
+
+    std::vector<int64> operand_indices(start.size());
+    auto result = MakeUnique<Literal>(result_shape);
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          for (int64 i = 0; i < operand_indices.size(); ++i) {
+            CHECK_GE(multi_index[i] + start[i], 0);
+            operand_indices[i] = multi_index[i] + start[i];
+          }
+
+          auto result = operand_literal.Get<ReturnT>(operand_indices);
+          return result;
+        }));
+
+    return std::move(result);
+  }
+
+  template <typename IndexT>
+  StatusOr<std::unique_ptr<Literal>> DynamicUpdateSlice(
+      const Literal& operand_literal, const Literal& update_literal,
+      const Literal& start_indices_literal) {
+    auto result = operand_literal.CloneToUnique();
+    auto start_indices_typed = start_indices_literal.data<IndexT>();
+    const auto rank = ShapeUtil::Rank(result->shape());
+    std::vector<int64> start(start_indices_typed.begin(),
+                             start_indices_typed.end());
+    // Clamp the update start indices so the slice is in-bounds w.r.t the
+    // operand.
+
+    // TODO(b/74360564): This is implementation defined behavior, but is
+    // currently respected by all implementations. Change this if we ever decide
+    // to oficially document different behavior.
+    for (int64 i = 0; i < rank; ++i) {
+      start[i] = std::min<int64>(
+          std::max<int64>(0, start[i]),
+          result->shape().dimensions(i) - update_literal.shape().dimensions(i));
+    }
+    std::vector<int64> result_index(rank, 0);
+
+    auto func = [&](tensorflow::gtl::ArraySlice<int64> update_index) {
+      std::transform(update_index.begin(), update_index.end(), start.begin(),
+                     result_index.begin(), std::plus<int64>());
+      result->Set<ReturnT>(result_index,
+                           update_literal.Get<ReturnT>(update_index));
+      return true;
+    };
+
+    std::vector<int64> base(update_literal.shape().dimensions_size(), 0);
+    std::vector<int64> step(update_literal.shape().dimensions_size(), 1);
+    ShapeUtil::ForEachIndex(update_literal.shape(), base,
+                            AsInt64Slice(update_literal.shape().dimensions()),
+                            step, func);
+
+    return std::move(result);
+  }
+
+  StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOp(
+      HloInstruction* instruction,
+      const std::function<ElementwiseT(ElementwiseT)>& unary_op) {
+    const Literal& operand_literal =
+        parent_->GetEvaluatedLiteralFor(instruction->operand(0));
+    TF_ASSIGN_OR_RETURN(
+        auto result_literal,
+        (HloEvaluator::ElementWiseUnaryOpImpl<ReturnT, ReturnT>(
+            instruction, ConvertUnaryFunction(unary_op), operand_literal)));
+
+    return std::move(result_literal);
+  }
+
+  StatusOr<std::unique_ptr<Literal>> ElementWiseBinaryOp(
+      HloInstruction* instruction,
+      const std::function<ElementwiseT(ElementwiseT, ElementwiseT)>&
+          binary_op) {
+    const auto shape = instruction->shape();
+    const auto* lhs = instruction->operand(0);
+    const auto* rhs = instruction->operand(1);
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast
+    // is removed.
+    if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) &&
+          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
+      return Unimplemented(
+          "Implicit broadcasting is currently unsupported in HLO evaluator "
+          "Shape Mismatch: %s vs %s vs %s: ",
+          ShapeUtil::HumanString(shape).c_str(),
+          ShapeUtil::HumanString(lhs->shape()).c_str(),
+          ShapeUtil::HumanString(rhs->shape()).c_str());
+    }
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+
+    auto result = MakeUnique<Literal>(shape);
+
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          return ConvertBinaryFunction(binary_op)(
+              lhs_literal.Get<ReturnT>(multi_index),
+              rhs_literal.Get<ReturnT>(multi_index));
+        }));
+    return std::move(result);
+  }
+
+  template <typename LhsType, typename RhsType, typename EhsType>
+  StatusOr<std::unique_ptr<Literal>> ElementwiseTernaryOp(
+      HloInstruction* instruction,
+      const std::function<ReturnT(LhsType, RhsType, EhsType)>& ternary_op) {
+    const auto shape = instruction->shape();
+    const auto* lhs = instruction->operand(0);
+    const auto* rhs = instruction->operand(1);
+    const auto* ehs = instruction->operand(2);
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit
+    // broadcast is removed.
+    if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) &&
+          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) &&
+          ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) {
+      return Unimplemented(
+          "Implicit broadcasting is currently unsupported in HLO evaluator "
+          "Shape Mismatch: %s vs %s vs %s vs %s: ",
+          ShapeUtil::HumanString(shape).c_str(),
+          ShapeUtil::HumanString(lhs->shape()).c_str(),
+          ShapeUtil::HumanString(rhs->shape()).c_str(),
+          ShapeUtil::HumanString(ehs->shape()).c_str());
+    }
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+    const Literal& ehs_literal = parent_->GetEvaluatedLiteralFor(ehs);
+
+    auto result = MakeUnique<Literal>(shape);
+
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          return ternary_op(lhs_literal.Get<LhsType>(multi_index),
+                            rhs_literal.Get<RhsType>(multi_index),
+                            ehs_literal.Get<EhsType>(multi_index));
+        }));
+
+    return std::move(result);
+  }
+
+  template <typename NativeT>
+  static bool IsShiftOutOfBounds(NativeT rhs) {
+    typedef typename std::make_unsigned<NativeT>::type UnsignedT;
+    UnsignedT lhs_size_unsigned = sizeof(NativeT) * CHAR_BIT;
+    UnsignedT rhs_unsigned = static_cast<UnsignedT>(rhs);
+    return rhs_unsigned >= lhs_size_unsigned;
+  }
+
+  HloEvaluator* parent_;
+};
+
+// These extern templates prevent users of this class from implicitly
+// instantiating it.  We explicitly instantiate this class in the various
+// hlo_evaluator_typed_visitor*.cc files.
+extern template class HloEvaluatorTypedVisitor<bool>;
+extern template class HloEvaluatorTypedVisitor<uint8>;
+extern template class HloEvaluatorTypedVisitor<uint32>;
+extern template class HloEvaluatorTypedVisitor<uint64>;
+extern template class HloEvaluatorTypedVisitor<int8>;
+extern template class HloEvaluatorTypedVisitor<int32>;
+extern template class HloEvaluatorTypedVisitor<int64>;
+extern template class HloEvaluatorTypedVisitor<Eigen::half, float>;
+extern template class HloEvaluatorTypedVisitor<float>;
+extern template class HloEvaluatorTypedVisitor<double>;
+extern template class HloEvaluatorTypedVisitor<complex64>;
+extern template class HloEvaluatorTypedVisitor<bfloat16, float>;
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bfloat16.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bfloat16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..39c352dfb966af4ad9f1874d078b92dd2a321783
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bfloat16.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<bfloat16, float>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bool.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..289b40fa06d37b8f5b2705e7de2f479c4a30e89d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bool.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<bool>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex64.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex64.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9cb4eb921fd3af566de5998a097423c90f0cb860
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex64.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<complex64>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_double.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_double.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e6252fbf8c24a7b79c7e656040a6be7be8d777f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_double.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<double>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_float.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_float.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee793ae77b1b432daece31697ad436de1683bc08
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_float.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<float>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_half.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_half.cc
new file mode 100644
index 0000000000000000000000000000000000000000..038d9d39e4a5881b9f0fb1d98732132aab3aaa2c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_half.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<Eigen::half, float>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int32.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int32.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b1952ca6193958eec49fd15297f73a6c6ac22b83
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int32.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<int32>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int64.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int64.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0cbaffb40b7128fb6e99308fbc2b48e63a3d6fac
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int64.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<int64>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int8.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int8.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6f4bf2a392b51abc4d37db4beab6d1ea2b0c4e3a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int8.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<int8>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint32.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint32.cc
new file mode 100644
index 0000000000000000000000000000000000000000..10235447e0d266a6071097e38913c3856939509b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint32.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<uint32>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint64.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint64.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8abeaa6ffca4409d2664de6f55850622e95bbc9d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint64.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<uint64>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint8.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint8.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6dabd1c176eabcf6656d6de9683bbf0131456d96
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint8.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<uint8>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index a0cb28246d3be541e798e85552436f64a3521f22..4900c813fdf037e65c6b42d027f1cbefb6ee9830 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -16,52 +16,32 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 namespace {
 
-class HloExecutionProfileTest : public HloTestBase {
- protected:
-  static constexpr int64 kInstructionCyclesIndex = 0;
-  static constexpr int64 kInstructionNameIndex = 19;
-};
+using tensorflow::strings::StrCat;
+using ::testing::AllOf;
+using ::testing::ContainsRegex;
 
-// Splits `lines` into a sequence of lines delimited by newlines and then split
-// each of those lines into a sequence of words delimited by spaces.  Filter out
-// empty words.
-std::vector<std::vector<string>> SplitIntoLinesAndWords(
-    tensorflow::StringPiece lines) {
-  std::vector<std::vector<string>> result;
-  for (const string& line : tensorflow::str_util::Split(lines, '\n')) {
-    std::vector<string> words;
-    for (const string& word : tensorflow::str_util::Split(line, ' ')) {
-      if (!word.empty()) {
-        words.push_back(word);
-      }
-    }
-    result.push_back(std::move(words));
-  }
-
-  return result;
-}
+class HloExecutionProfileTest : public HloTestBase {};
 
 TEST_F(HloExecutionProfileTest, Basic) {
-  std::unique_ptr<HloModule> hlo_module = CreateNewModule();
-
-  HloComputation::Builder builder(TestName());
+  auto hlo_module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY entry_computation {
+    lhs = f32[30,30]{1,0} parameter(0)
+    rhs = f32[30,30]{1,0} parameter(1)
+    add = f32[30,30]{1,0} add(lhs, rhs)
+    ROOT dot = f32[30,30]{1,0} dot(lhs, add), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  })")
+                        .ValueOrDie();
+  const HloInstruction* dot_instruction =
+      hlo_module->entry_computation()->root_instruction();
+  const HloInstruction* add_instruction = dot_instruction->operand(1);
   Shape shape = ShapeUtil::MakeShape(F32, {30, 30});
-  HloInstruction* param_lhs =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "lhs"));
-  HloInstruction* param_rhs =
-      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "rhs"));
-  HloInstruction* add_instruction =
-      builder.AddInstruction(HloInstruction::CreateBinary(
-          shape, HloOpcode::kAdd, param_lhs, param_rhs));
-  HloInstruction* dot_instruction =
-      builder.AddInstruction(HloInstruction::CreateBinary(
-          shape, HloOpcode::kDot, param_lhs, add_instruction));
-
-  hlo_module->AddEntryComputation(builder.Build());
 
   auto shape_size_function = [&](const Shape& shape) {
     const int64 pointer_size = 8;
@@ -84,20 +64,12 @@ TEST_F(HloExecutionProfileTest, Basic) {
   execution_profile.SetCyclesTakenBy(add_instruction, add_cycles);
   execution_profile.SetCyclesTakenBy(dot_instruction, dot_cycles);
 
-  string rendered_profile = execution_profile.ToString(
-      backend().default_stream_executor()->GetDeviceDescription());
-  std::vector<std::vector<string>> lines_and_words =
-      SplitIntoLinesAndWords(rendered_profile);
-  ASSERT_EQ(lines_and_words.size(), 8);
-
-  const std::vector<string>& line_2 = lines_and_words[2];
-  const std::vector<string>& line_3 = lines_and_words[3];
-
-  EXPECT_EQ(line_2[kInstructionCyclesIndex], std::to_string(dot_cycles));
-  EXPECT_EQ(line_2[kInstructionNameIndex], '%' + dot_instruction->name());
-
-  EXPECT_EQ(line_3[kInstructionCyclesIndex], std::to_string(add_cycles));
-  EXPECT_EQ(line_3[kInstructionNameIndex], '%' + add_instruction->name());
+  EXPECT_THAT(execution_profile.ToString(
+                  backend().default_stream_executor()->GetDeviceDescription()),
+              AllOf(ContainsRegex(StrCat(dot_cycles, R"(\b.*%)",
+                                         dot_instruction->name())),
+                    ContainsRegex(StrCat(add_cycles, R"(\b.*%)",
+                                         add_instruction->name()))));
 }
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index bb4db89f0a242c66b40a5c6541a968cdeb5fb0be..17e3c405f1e5269ddf2f03c031a1137f9bb14fcc 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -322,11 +322,13 @@ class HloDotDumper {
  public:
   HloDotDumper(const HloComputation* computation, tensorflow::StringPiece label,
                const DebugOptions& debug_options, bool show_metadata,
-               const HloExecutionProfile* profile, NodeFilter filter)
+               bool show_backend_config, const HloExecutionProfile* profile,
+               NodeFilter filter)
       : computation_(computation),
-        label_(label.ToString()),
+        label_(std::string(label)),
         debug_options_(debug_options),
         show_metadata_(show_metadata),
+        show_backend_config_(show_backend_config),
         profile_(profile),
         filter_(std::move(filter)) {}
 
@@ -365,6 +367,7 @@ class HloDotDumper {
   string GetInstructionNodeShape(const HloInstruction* instr);
   string GetInstructionNodeLabel(const HloInstruction* instr);
   string GetInstructionNodeMetadata(const HloInstruction* instr);
+  string GetInstructionNodeBackendConfig(const HloInstruction* instr);
   string GetInstructionNodeExtraInfo(const HloInstruction* instr);
   string GetInstructionNodeInlinedOperands(const HloInstruction* instr);
   void AddInstructionIncomingEdges(const HloInstruction* instr);
@@ -393,6 +396,7 @@ class HloDotDumper {
   const string label_;                 // overall name for the graph
   const DebugOptions& debug_options_;
   const bool show_metadata_;
+  const bool show_backend_config_;
   const HloExecutionProfile* profile_;  // may be null
   const NodeFilter filter_;
 
@@ -611,6 +615,10 @@ tooltip = " ";
     if (!extra_info.empty()) {
       StrAppend(&subcomp_label, "<br/>", extra_info);
     }
+    string node_backend_config = GetInstructionNodeBackendConfig(parent_instr);
+    if (!node_backend_config.empty()) {
+      StrAppend(&subcomp_label, "<br/>", node_backend_config);
+    }
 
     bool highlight = filter_.Highlight(parent_instr);
     const char* fillcolor;
@@ -765,6 +773,7 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
   string node_shape = GetInstructionNodeShape(instr);
   string node_label = GetInstructionNodeLabel(instr);
   string node_metadata = GetInstructionNodeMetadata(instr);
+  string node_backend_config = GetInstructionNodeBackendConfig(instr);
   string extra_info = GetInstructionNodeExtraInfo(instr);
   string inlined_constants = GetInstructionNodeInlinedOperands(instr);
   string trivial_subcomputation = GetInstructionTrivialComputationStr(instr);
@@ -782,8 +791,8 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
   }
   // Build the text that will be displayed inside the node.
   string node_body = node_label;
-  for (const string& s :
-       {trivial_subcomputation, node_metadata, extra_info, inlined_constants}) {
+  for (const string& s : {trivial_subcomputation, node_metadata,
+                          node_backend_config, extra_info, inlined_constants}) {
     if (!s.empty()) {
       StrAppend(&node_body, "<br/>", s);
     }
@@ -816,7 +825,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
         *elem_count *= dim;
       }
     }
-    if (elem_count.has_value() && *elem_count <= 8) {
+    if (elem_count.has_value() && *elem_count <= 8 && constant->HasLiteral()) {
       return Printf("%s (%s)", constant->literal().ToString(),
                     ShapeUtil::HumanString(constant->shape()));
     }
@@ -916,6 +925,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kDivide:
     case HloOpcode::kEq:
     case HloOpcode::kExp:
+    case HloOpcode::kExpm1:
     case HloOpcode::kFloor:
     case HloOpcode::kGe:
     case HloOpcode::kGt:
@@ -923,6 +933,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kIsFinite:
     case HloOpcode::kLe:
     case HloOpcode::kLog:
+    case HloOpcode::kLog1p:
     case HloOpcode::kLt:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
@@ -1078,13 +1089,23 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) {
   return Join(lines, "<br/>");
 }
 
+string HloDotDumper::GetInstructionNodeBackendConfig(
+    const HloInstruction* instr) {
+  if (!show_backend_config_ || instr->backend_config().empty()) {
+    return "";
+  }
+
+  return StrCat("backend_config=\"", instr->backend_config(), "\"");
+}
+
 string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
   std::vector<string> lines;
 
   // Get the instruction's extra attributes excluding the names of its
   // subcomputations, since those are drawn explicitly in the graph.
   for (const auto& line : instr->ExtraAttributesToString(
-           HloPrintOptions().set_print_subcomputation_references(false))) {
+           HloPrintOptions().set_print_subcomputation_mode(
+               HloPrintOptions::PrintSubcomputationMode::kOff))) {
     lines.push_back(HtmlLikeStringSanitize(line));
   }
 
@@ -1404,7 +1425,7 @@ string ExportGraph(const string& graph,
 string DumpGraph(const HloComputation& computation, const string& label,
                  const DebugOptions& debug_options,
                  const HloExecutionProfile* hlo_execution_profile,
-                 bool show_metadata) {
+                 bool show_metadata, bool show_backend_config) {
   GraphRendererInterface::GraphKind graph_kind;
   string graph;
   if (debug_options.xla_hlo_dump_as_graphdef()) {
@@ -1414,9 +1435,10 @@ string DumpGraph(const HloComputation& computation, const string& label,
                                                           &graph));
     graph_kind = GraphRendererInterface::TF_GRAPHDEF;
   } else {
-    graph = HloDotDumper(&computation, label, debug_options, show_metadata,
-                         hlo_execution_profile, NodeFilter())
-                .Dump();
+    graph =
+        HloDotDumper(&computation, label, debug_options, show_metadata,
+                     show_backend_config, hlo_execution_profile, NodeFilter())
+            .Dump();
     graph_kind = GraphRendererInterface::DOT_GRAPH;
   }
 
@@ -1427,15 +1449,15 @@ string DumpGraph(const HloComputation& computation, const string& label,
 }
 
 string DumpNeighborhoodAround(const HloInstruction& node, int radius,
-                              bool show_metadata) {
+                              bool show_metadata, bool show_backend_config) {
   auto debug_options = node.GetModule()->config().debug_options();
   string label =
       StrCat("Neighborhood of ", radius, " nodes around ", node.name());
   NodeFilter filter = MakeNodeFilter(&node, radius);
-  string graph =
-      HloDotDumper(node.parent(), label, debug_options, show_metadata,
-                   /*profile=*/nullptr, filter)
-          .Dump();
+  string graph = HloDotDumper(node.parent(), label, debug_options,
+                              show_metadata, show_backend_config,
+                              /*profile=*/nullptr, filter)
+                     .Dump();
   return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
index 2704aae1e3ba7fb131bfcb1287d807d785fd9774..fc8e1468aca9c2edbc22c30a41a1be8b32a1feca 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -56,7 +56,7 @@ string MaybeDumpHloModule(const HloModule& module, const string& label,
 string DumpGraph(const HloComputation& computation, const string& label,
                  const DebugOptions& debug_options,
                  const HloExecutionProfile* hlo_execution_profile = nullptr,
-                 bool show_metadata = false);
+                 bool show_metadata = false, bool show_backend_config = false);
 
 // Like DumpGraph, but renders only nodes "near" the given node in the graph.
 //
@@ -64,7 +64,8 @@ string DumpGraph(const HloComputation& computation, const string& label,
 // (roughly) corresponds to the max distance a node may be from the primary node
 // before it's omitted from the graph.
 string DumpNeighborhoodAround(const HloInstruction& node, int radius,
-                              bool show_metadata = false);
+                              bool show_metadata = false,
+                              bool show_backend_config = false);
 
 // Dumps the HloModule::ToString() as a file into the provided directory path
 // suffixed with the provided label.
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index a714d0e114245021c28da26beae444dbd3d99bb5..db1c33e2f0dfa0599810ab2e8d32209e64c5c865 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -51,7 +51,7 @@ using ::tensorflow::strings::StrCat;
 
 /* static */
 StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
-    HloModule* module, const HloInstructionProto& proto,
+    const HloInstructionProto& proto,
     const tensorflow::gtl::FlatMap<int64, HloInstruction*>& instruction_map,
     const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map) {
   TF_RET_CHECK(!proto.opcode().empty());
@@ -109,6 +109,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   instruction->name_ = proto.name();
 
   instruction->metadata_ = proto.metadata();
+  instruction->set_backend_config(proto.backend_config());
   if (proto.has_literal()) {
     TF_ASSIGN_OR_RETURN(instruction->literal_,
                         Literal::CreateFromProto(proto.literal()));
@@ -256,10 +257,12 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     case HloOpcode::kCos:
     case HloOpcode::kClz:
     case HloOpcode::kExp:
+    case HloOpcode::kExpm1:
     case HloOpcode::kFloor:
     case HloOpcode::kImag:
     case HloOpcode::kIsFinite:
     case HloOpcode::kLog:
+    case HloOpcode::kLog1p:
     case HloOpcode::kNot:
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
@@ -437,7 +440,7 @@ HloInstruction::CreateCrossReplicaSum(
       << "Outfeed shape " << shape << " must be compatible with operand shape "
       << operand->shape();
   instruction->AppendOperand(operand);
-  instruction->outfeed_config_ = outfeed_config.ToString();
+  instruction->outfeed_config_ = std::string(outfeed_config);
   instruction->outfeed_shape_ = shape;
   return instruction;
 }
@@ -792,23 +795,11 @@ HloInstruction::CreateBroadcastSequence(
   return instruction;
 }
 
-// We put the fusion kind into the instruction's name for transpose-dot fusions,
-// since those fusions are really just describing a type of dot rather than
-// generating a novel computation.
-static string FusionNodeName(HloInstruction::FusionKind fusion_kind) {
-  switch (fusion_kind) {
-    case HloInstruction::FusionKind::kTransposeDot:
-      return "dot_fusion";
-    default:
-      return "fusion";
-  }
-}
-
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFusion(
     const Shape& shape, FusionKind fusion_kind, HloInstruction* fused_root) {
   auto instruction = WrapUnique(new HloInstruction(HloOpcode::kFusion, shape));
   instruction->fusion_kind_ = fusion_kind;
-  instruction->name_ = FusionNodeName(fusion_kind);
+  instruction->name_ = "fusion";
   instruction->set_parent(fused_root->parent());
   instruction->set_metadata(fused_root->metadata());
   instruction->CloneAndFuseInternal(fused_root);
@@ -824,7 +815,7 @@ static string FusionNodeName(HloInstruction::FusionKind fusion_kind) {
     instruction->AppendOperand(operand);
   }
   instruction->fusion_kind_ = fusion_kind;
-  instruction->name_ = FusionNodeName(fusion_kind);
+  instruction->name_ = "fusion";
   instruction->called_computations_.push_back(fusion_computation);
   fusion_computation->SetFusionInstruction(instruction.get());
   return instruction;
@@ -1123,7 +1114,7 @@ RandomDistribution HloInstruction::random_distribution() const {
   return distribution_;
 }
 
-bool HloInstruction::HasSideEffect() const {
+bool HloInstruction::HasSideEffectNoRecurse() const {
   switch (opcode_) {
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
@@ -1135,16 +1126,22 @@ bool HloInstruction::HasSideEffect() const {
     case HloOpcode::kTrace:
     case HloOpcode::kHostCompute:
       return true;
-    default: {
-      // Check if any of the called computations has a side effect.
-      for (const auto& computation : called_computations()) {
-        if (computation->HasSideEffect()) {
-          return true;
-        }
-      }
+    default:
       return false;
+  }
+}
+
+bool HloInstruction::HasSideEffect() const {
+  if (HasSideEffectNoRecurse()) {
+    return true;
+  }
+  // Check if any of the called computations has a side effect.
+  for (const auto& computation : called_computations()) {
+    if (computation->HasSideEffect()) {
+      return true;
     }
   }
+  return false;
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateCall(
@@ -1167,7 +1164,7 @@ bool HloInstruction::HasSideEffect() const {
   for (auto operand : operands) {
     instruction->AppendOperand(operand);
   }
-  instruction->custom_call_target_ = custom_call_target.ToString();
+  instruction->custom_call_target_ = std::string(custom_call_target);
   return instruction;
 }
 
@@ -1179,7 +1176,7 @@ bool HloInstruction::HasSideEffect() const {
   for (auto operand : operands) {
     instruction->AppendOperand(operand);
   }
-  instruction->channel_name_ = channel_name.ToString();
+  instruction->channel_name_ = std::string(channel_name);
   instruction->cost_estimate_ns_ = cost_estimate_ns;
   return instruction;
 }
@@ -1231,12 +1228,15 @@ bool HloInstruction::HasSideEffect() const {
 std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
-    HloModule* module) const {
+    HloModule* module, CloneMap* clone_map) const {
   VLOG(3) << "CloneWithNewOperands:\n  " << ToString();
   VLOG(3) << "  new operands:";
   for (const HloInstruction* new_operand : new_operands) {
     VLOG(3) << "    %" << new_operand->name();
   }
+  if (module == nullptr) {
+    module = GetModule();
+  }
 
   std::unique_ptr<HloInstruction> clone;
 
@@ -1253,10 +1253,12 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
+    case HloOpcode::kExpm1:
     case HloOpcode::kImag:
     case HloOpcode::kIsFinite:
     case HloOpcode::kFloor:
     case HloOpcode::kLog:
+    case HloOpcode::kLog1p:
     case HloOpcode::kNot:
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
@@ -1342,7 +1344,8 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       break;
     case HloOpcode::kFft:
       CHECK_EQ(new_operands.size(), 1);
-      return CreateFft(shape, new_operands[0], fft_type_, fft_length_);
+      clone = CreateFft(shape, new_operands[0], fft_type_, fft_length_);
+      break;
     case HloOpcode::kCrossReplicaSum:
       clone = CreateCrossReplicaSum(shape, new_operands);
       break;
@@ -1415,9 +1418,15 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kConstant:
       clone = CreateConstant(literal_->CloneToUnique());
       break;
-    case HloOpcode::kFusion:
-      clone = CloneFusionWithNewOperands(shape, new_operands, module);
+    case HloOpcode::kFusion: {
+      CHECK_NE(module, nullptr);
+      auto new_fused_computation = module->AddEmbeddedComputation(
+          fused_instructions_computation()->Clone("clone", module, clone_map));
+      clone = CreateFusion(/*shape=*/shape, /*fusion_kind=*/fusion_kind(),
+                           /*operands=*/new_operands,
+                           /*fusion_computation=*/new_fused_computation);
       break;
+    }
     case HloOpcode::kParameter:
       clone = CreateParameter(parameter_number_, shape, name_);
       break;
@@ -1481,15 +1490,19 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
   }
   SetupDerivedInstruction(clone.get());
   clone->set_parent(parent_);
+  clone->set_backend_config(backend_config());
+  if (clone_map != nullptr) {
+    InsertOrDie(clone_map, this, clone.get());
+  }
   return clone;
 }
 
 HloInstruction::~HloInstruction() {}
 
-std::unique_ptr<HloInstruction> HloInstruction::Clone(const string& suffix,
-                                                      HloModule* module) const {
+std::unique_ptr<HloInstruction> HloInstruction::Clone(
+    const string& suffix, HloModule* module, CloneMap* clone_map) const {
   std::unique_ptr<HloInstruction> clone =
-      CloneWithNewOperands(shape_, operands_, module);
+      CloneWithNewOperands(shape_, operands_, module, clone_map);
   if (suffix.empty()) {
     clone->name_ = name();
   } else {
@@ -1526,71 +1539,6 @@ std::unique_ptr<HloInstruction> HloInstruction::Clone(const string& suffix,
   return clone;
 }
 
-std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
-    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-    HloModule* module) const {
-  CHECK_EQ(opcode_, HloOpcode::kFusion);
-  CHECK(parent() != nullptr);
-
-  auto new_instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kFusion, shape));
-  // Add the operands to our new fusion instruction.
-  for (HloInstruction* new_operand : operands) {
-    new_instruction->AppendOperand(new_operand);
-  }
-  // Clone all the fused instructions for the new fusion instruction.
-  HloInstructionMap<HloInstruction*> old_to_new;
-  std::list<std::unique_ptr<HloInstruction>> new_fused_instructions;
-  // Create the list of fused parameters by mapping through the cloned,
-  // fused instructions.
-  for (HloInstruction* old_fused_parameter :
-       fused_instructions_computation()->parameter_instructions()) {
-    new_fused_instructions.push_back(
-        old_fused_parameter->Clone("clone", module));
-    HloInstruction* new_fusion_parameter = new_fused_instructions.back().get();
-    InsertOrDie(&old_to_new, old_fused_parameter, new_fusion_parameter);
-  }
-  for (auto old_fused_instruction :
-       fused_instructions_computation()->MakeInstructionPostOrder()) {
-    if (old_fused_instruction->opcode() == HloOpcode::kParameter) {
-      FindOrDie(old_to_new, old_fused_instruction);
-      continue;
-    }
-    std::vector<HloInstruction*> new_operands;
-    for (int64 operand_idx = 0;
-         operand_idx < old_fused_instruction->operand_count(); ++operand_idx) {
-      HloInstruction* old_operand =
-          old_fused_instruction->mutable_operand(operand_idx);
-      new_operands.push_back(FindOrDie(old_to_new, old_operand));
-    }
-    new_fused_instructions.push_back(
-        old_fused_instruction->CloneWithNewOperands(
-            old_fused_instruction->shape(), new_operands, module));
-    HloInstruction* new_fused_instruction = new_fused_instructions.back().get();
-    new_fused_instruction->set_parent(parent_);
-    InsertOrDie(&old_to_new, old_fused_instruction, new_fused_instruction);
-  }
-  new_instruction->fusion_kind_ = fusion_kind_;
-  auto computation_builder = HloComputation::Builder(
-      fused_instructions_computation()->name() + ".clone",
-      new_instruction.get());
-  // We iterated the fusion instructions in reverse post order which means
-  // that we must reverse our new list of fusion instructions.
-  for (auto new_fused_instruction_iter = new_fused_instructions.rbegin();
-       new_fused_instruction_iter != new_fused_instructions.rend();
-       ++new_fused_instruction_iter) {
-    computation_builder.AddInstruction(std::move(*new_fused_instruction_iter));
-  }
-  if (module == nullptr) {
-    module = GetModule();
-  }
-  auto fused_root_ = fused_expression_root();
-  new_instruction->called_computations_.push_back(
-      CHECK_NOTNULL(module)->AddEmbeddedComputation(
-          computation_builder.Build(FindOrDie(old_to_new, fused_root_))));
-  return new_instruction;
-}
-
 std::pair<const HloInstruction*, ShapeIndex>
 HloInstruction::LatestNonGteAncestorAndIndex() const {
   const HloInstruction* hlo = this;
@@ -1619,6 +1567,8 @@ const Literal& HloInstruction::literal() const {
   return *literal_;
 }
 
+bool HloInstruction::HasLiteral() const { return literal_ != nullptr; }
+
 bool HloInstruction::CanHaveDimensionsField() const {
   return (opcode() == HloOpcode::kReverse ||
           opcode() == HloOpcode::kConcatenate ||
@@ -1739,26 +1689,30 @@ bool HloInstruction::HasConstantOperand() const {
 bool HloInstruction::IdenticalSlowPath(
     const HloInstruction& other,
     const std::function<bool(const HloComputation*, const HloComputation*)>&
-        eq_computations,
-    const std::function<bool(const Shape&, const Shape&)>& eq_shapes) const {
+        eq_computations) const {
   // Perform opcode specific checks.
   switch (opcode()) {
     // The result of these instructions only depend upon their opcode and
     // operands.
     case HloOpcode::kAbs:
     case HloOpcode::kAtan2:
-    case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kAdd:
+    case HloOpcode::kBitcast:
+    case HloOpcode::kBitcastConvert:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kClz:
     case HloOpcode::kComplex:
+    case HloOpcode::kConvert:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kDivide:
+    case HloOpcode::kDynamicSlice:
+    case HloOpcode::kDynamicUpdateSlice:
     case HloOpcode::kEq:
     case HloOpcode::kExp:
+    case HloOpcode::kExpm1:
     case HloOpcode::kFloor:
     case HloOpcode::kGe:
     case HloOpcode::kGt:
@@ -1766,6 +1720,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kIsFinite:
     case HloOpcode::kLe:
     case HloOpcode::kLog:
+    case HloOpcode::kLog1p:
     case HloOpcode::kAnd:
     case HloOpcode::kNot:
     case HloOpcode::kOr:
@@ -1778,6 +1733,8 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kPower:
     case HloOpcode::kReal:
     case HloOpcode::kRemainder:
+    case HloOpcode::kReshape:
+    case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kSelect:
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
@@ -1789,6 +1746,12 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kTuple:
       return true;
 
+    // Broadcast, Concatenate, and Transpose need the same dimensions field.
+    case HloOpcode::kBroadcast:
+    case HloOpcode::kConcatenate:
+    case HloOpcode::kTranspose:
+      return dimensions() == other.dimensions();
+
     case HloOpcode::kFusion:
       return fusion_kind() == other.fusion_kind() &&
              eq_computations(fused_instructions_computation(),
@@ -1801,10 +1764,7 @@ bool HloInstruction::IdenticalSlowPath(
       return false;
 
     case HloOpcode::kParameter:
-      return parameter_number() == other.parameter_number() &&
-             // Check the shape too because `this` and `other` may be in
-             // different HloComputations.
-             eq_shapes(shape(), other.shape());
+      return parameter_number() == other.parameter_number();
 
     case HloOpcode::kBatchNormTraining:
     case HloOpcode::kBatchNormInference:
@@ -1816,12 +1776,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kConstant:
       return literal() == other.literal();
 
-    // A convert result is determined by the primitive type that the operand is
-    // converted into.
-    case HloOpcode::kConvert:
-    case HloOpcode::kBitcastConvert:
-      return shape().element_type() == other.shape().element_type();
-
     // A reduce-precision operation is determined by the bit sizes.
     case HloOpcode::kReducePrecision:
       return exponent_bits() == other.exponent_bits() &&
@@ -1864,22 +1818,8 @@ bool HloInstruction::IdenticalSlowPath(
              eq_computations(scatter(), other.scatter()) &&
              protobuf_util::ProtobufEquals(window(), other.window());
 
-    case HloOpcode::kReshape:
-      return eq_shapes(shape(), other.shape());
-
-    // Transpose result is determined by the final shape and the permutation.
-    case HloOpcode::kTranspose:
-      return eq_shapes(shape(), other.shape()) &&
-             dimensions() == other.dimensions();
 
     // Remaining instructions with special values.
-    case HloOpcode::kBitcast:
-      return eq_shapes(shape(), other.shape());
-    case HloOpcode::kBroadcast:
-      return eq_shapes(shape(), other.shape()) &&
-             dimensions() == other.dimensions();
-    case HloOpcode::kConcatenate:
-      return dimensions() == other.dimensions();
     case HloOpcode::kGetTupleElement:
       return tuple_index() == other.tuple_index();
     case HloOpcode::kPad:
@@ -1889,11 +1829,6 @@ bool HloInstruction::IdenticalSlowPath(
       return slice_starts_ == other.slice_starts_ &&
              slice_limits_ == other.slice_limits_ &&
              slice_strides_ == other.slice_strides_;
-    case HloOpcode::kDynamicSlice:
-      return eq_shapes(shape(), other.shape()) &&
-             dynamic_slice_sizes_ == other.dynamic_slice_sizes_;
-    case HloOpcode::kDynamicUpdateSlice:
-      return eq_shapes(shape(), other.shape());
     case HloOpcode::kCall:
     case HloOpcode::kMap:
       return eq_computations(to_apply(), other.to_apply());
@@ -2160,28 +2095,68 @@ string PrintName(const string& name, const HloPrintOptions& options) {
 }  // namespace
 
 string HloInstruction::ToString(const HloPrintOptions& options) const {
-  string result =
-      StrCat(PrintName(name(), options), " = ",
-             ShapeUtil::HumanStringWithLayout(shape()), " ",
-             HloOpcodeString(opcode()), "(", OperandsToString(options), ")");
+  CanonicalNameMap new_map;
+  return ToStringWithCanonicalNameMap(options, &new_map);
+}
+
+string HloInstruction::ToStringWithCanonicalNameMap(
+    const HloPrintOptions& options,
+    CanonicalNameMap* canonical_name_map) const {
+  string result = "";
+
+  // Logic to print the instruction name (e.g. "%foo = ").
+  if (options.canonicalize_instruction_names()) {
+    if (options.is_in_nested_computation()) {
+      // If we are canonicalizing instruction names and this is a top-level
+      // HloInstruction::ToString() call, don't print an instruction name.
+      StrAppend(&result,
+                PrintName(canonical_name_map->LookupOrInsert(name()), options),
+                " = ");
+    }
+  } else {
+    StrAppend(&result, PrintName(name(), options), " = ");
+  }
+
+  // Print opcode, operand(s) and shape.
+  StrAppend(&result, ShapeUtil::HumanStringWithLayout(shape()), " ",
+            HloOpcodeString(opcode()), "(",
+            OperandsToStringWithCanonicalNameMap(options, canonical_name_map),
+            ")");
+
+  // Print additional attributes. If an instruction contains a subcomputation,
+  // the subcomputation is also printed here.
   for (const string& extra : ExtraAttributesToString(options)) {
     StrAppend(&result, ", ", extra);
   }
+
   if (options.print_metadata() &&
       (!metadata_.op_type().empty() || !metadata_.op_name().empty() ||
        !metadata_.source_file().empty())) {
     StrAppend(&result, ", metadata={", xla::OpMetadataToString(metadata_), "}");
   }
+  if (options.print_backend_config() && !backend_config().empty()) {
+    StrAppend(&result, ", backend_config=\"", CEscape(backend_config()), "\"");
+  }
   return result;
 }
 
 string HloInstruction::OperandsToString(const HloPrintOptions& options) const {
+  CanonicalNameMap new_map;
+  return OperandsToStringWithCanonicalNameMap(options, &new_map);
+}
+
+string HloInstruction::OperandsToStringWithCanonicalNameMap(
+    const HloPrintOptions& options,
+    CanonicalNameMap* canonical_name_map) const {
   string operands;
   if (opcode() == HloOpcode::kConstant) {
     // For constants, show the actual value in place of an empty operand list.
-    if ((!ShapeUtil::IsTuple(shape()) &&
-         ShapeUtil::ElementsIn(shape()) <= 10) ||
-        options.print_large_constants()) {
+    //
+    // In HloInstruction, sometimes a constant literal is not constructed due
+    // to its size. Skip the printing in this case.
+    if (HasLiteral() && ((!ShapeUtil::IsTuple(shape()) &&
+                          ShapeUtil::ElementsIn(shape()) <= 10) ||
+                         options.print_large_constants())) {
       // Literal::ToString emits multidimensional arrays over multiple
       // lines. Compact this into one line by stripping out white space.
       string tmp = literal().ToString();
@@ -2215,7 +2190,14 @@ string HloInstruction::OperandsToString(const HloPrintOptions& options) const {
       if (options.print_operand_shape()) {
         str.push_back(ShapeUtil::HumanStringWithLayout(operand->shape()));
       }
-      if (!options.compact_operands()) {
+
+      // In a top-level HloInstruction::ToString() call, the operand name is not
+      // part of the canonical string.
+      if (options.canonicalize_instruction_names() &&
+          options.is_in_nested_computation()) {
+        str.push_back(PrintName(
+            canonical_name_map->LookupOrInsert(operand->name()), options));
+      } else if (!options.compact_operands()) {
         str.push_back(PrintName(operand->name(), options));
       }
       StrAppend(out, Join(str, " "));
@@ -2284,7 +2266,8 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
     extra.push_back(StrCat("fft_length={", Join(fft_length(), ","), "}"));
   }
 
-  if (options.print_subcomputation_references()) {
+  if (options.print_subcomputation_mode() ==
+      HloPrintOptions::PrintSubcomputationMode::kNameOnly) {
     if (opcode() == HloOpcode::kWhile) {
       extra.push_back(
           StrCat("condition=", PrintName(while_condition()->name(), options)));
@@ -2312,8 +2295,45 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
                                      PrintName(computation->name(), options));
                          })));
     }
+  } else if (options.print_subcomputation_mode() ==
+             HloPrintOptions::PrintSubcomputationMode::kFullBodies) {
+    HloPrintOptions new_options = options;
+    new_options.set_is_in_nested_computation(true);
+    switch (opcode()) {
+      case HloOpcode::kWhile:
+        extra.push_back(
+            StrCat("condition=\n", while_condition()->ToString(new_options)));
+        extra.push_back(StrCat("body=\n", while_body()->ToString(new_options)));
+        break;
+      case HloOpcode::kSelectAndScatter:
+        extra.push_back(StrCat("select=\n", select()->ToString(new_options)));
+        extra.push_back(StrCat("scatter=\n", scatter()->ToString(new_options)));
+        break;
+      case HloOpcode::kConditional:
+        extra.push_back(StrCat("true_computation=\n",
+                               true_computation()->ToString(new_options)));
+        extra.push_back(StrCat("false_computation=\n",
+                               false_computation()->ToString(new_options)));
+        break;
+      case HloOpcode::kCall:
+      case HloOpcode::kMap:
+      case HloOpcode::kReduceWindow:
+      case HloOpcode::kReduce:
+        extra.push_back(
+            StrCat("to_apply=\n", to_apply()->ToString(new_options)));
+        break;
+      default:
+        if (!called_computations().empty()) {
+          extra.push_back(
+              StrCat("calls=\n",
+                     Join(called_computations(), ", ",
+                          [&](string* out, const HloComputation* computation) {
+                            StrAppend(out, computation->ToString(new_options));
+                          })));
+        }
+        break;
+    }
   }
-
   if (opcode() == HloOpcode::kSend || opcode() == HloOpcode::kRecv ||
       opcode() == HloOpcode::kSendDone || opcode() == HloOpcode::kRecvDone) {
     extra.push_back(StrCat("channel_id=", channel_id_));
@@ -2351,12 +2371,13 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
   }
 
   // By contract, we print the custom call target even if
-  // !options.print_subcomputation_references(), because the call target is not
+  // options.print_subcomputation_mode() == kOff, because the call target is not
   // an HloComputation.
   if (opcode() == HloOpcode::kCustomCall) {
     extra.push_back(
         StrCat("custom_call_target=\"", CEscape(custom_call_target_), "\""));
   }
+
   return extra;
 }
 
@@ -2386,6 +2407,7 @@ HloInstructionProto HloInstruction::ToProto() const {
   }
 
   *proto.mutable_metadata() = metadata_;
+  proto.set_backend_config(backend_config());
   if (literal_ != nullptr) {
     *proto.mutable_literal() = literal_->ToProto();
   }
@@ -2451,6 +2473,10 @@ HloInstructionProto HloInstruction::ToProto() const {
     proto.add_fft_length(fft_len);
   }
 
+  if (has_sharding()) {
+    *proto.mutable_sharding() = sharding().ToProto();
+  }
+
   proto.set_channel_name(channel_name_);
   proto.set_cost_estimate_ns(cost_estimate_ns_);
 
@@ -2487,8 +2513,6 @@ string HloInstruction::ToCategory() const {
         return "input fusion";
       case FusionKind::kOutput:
         return "output fusion";
-      case FusionKind::kTransposeDot:
-        return "dot";
       case FusionKind::kCustom:
         return "custom fusion";
     }
@@ -2673,6 +2697,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleNegate(this);
     case HloOpcode::kExp:
       return visitor->HandleExp(this);
+    case HloOpcode::kExpm1:
+      return visitor->HandleExpm1(this);
     case HloOpcode::kFloor:
       return visitor->HandleFloor(this);
     case HloOpcode::kCeil:
@@ -2681,6 +2707,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleClz(this);
     case HloOpcode::kLog:
       return visitor->HandleLog(this);
+    case HloOpcode::kLog1p:
+      return visitor->HandleLog1p(this);
     case HloOpcode::kTanh:
       return visitor->HandleTanh(this);
     case HloOpcode::kCos:
@@ -2971,6 +2999,7 @@ Status HloInstruction::AcceptOrdered(
       continue;
     }
 
+    // TODO(b/78350259): Eliminate const laundering.
     HloInstruction* instruction =
         const_cast<HloInstruction*>(const_instruction);
 
@@ -3026,10 +3055,12 @@ bool HloInstruction::IsElementwise() const {
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
+    case HloOpcode::kExpm1:
     case HloOpcode::kFloor:
     case HloOpcode::kImag:
     case HloOpcode::kIsFinite:
     case HloOpcode::kLog:
+    case HloOpcode::kLog1p:
     case HloOpcode::kNot:
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
@@ -3094,7 +3125,7 @@ bool HloInstruction::IsElementwise() const {
 
 bool HloInstruction::ImplicitlyBroadcastsOperand(int64 operand_idx) const {
   CHECK(IsElementwise());
-  return !ShapeUtil::Equal(shape(), operand(operand_idx)->shape());
+  return !ShapeUtil::SameDimensions(shape(), operand(operand_idx)->shape());
 }
 
 namespace {
@@ -3270,8 +3301,6 @@ string ToString(HloInstruction::FusionKind kind) {
       return "kInput";
     case HloInstruction::FusionKind::kOutput:
       return "kOutput";
-    case HloInstruction::FusionKind::kTransposeDot:
-      return "kTransposeDot";
     case HloInstruction::FusionKind::kCustom:
       return "kCustom";
   }
@@ -3288,9 +3317,6 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
   if (kind_name == "kOutput") {
     return HloInstruction::FusionKind::kOutput;
   }
-  if (kind_name == "kTransposeDot") {
-    return HloInstruction::FusionKind::kTransposeDot;
-  }
   if (kind_name == "kCustom") {
     return HloInstruction::FusionKind::kCustom;
   }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index a5e9aecb9e7f5204b53186abca78033215a75828..234dbc8399de2d88209dd8dd2be58dd152ddbe76 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -60,51 +60,75 @@ class HloModule;
 // A bunch of switches that control how the hlo text should be printed.
 class HloPrintOptions {
  public:
+  enum class PrintSubcomputationMode {
+    kOff,         // Do not print anything about subcomputations.
+    kNameOnly,    // Only print the name of subcomputations.
+    kFullBodies,  // Print the full bodies of subcomputations.
+  };
+
   // Constructs the default print options: don't print large constants, don't
   // compact operands, no indentation.
   HloPrintOptions()
       : print_large_constants_(false),
-        print_subcomputation_references_(true),
+        print_subcomputation_mode_(PrintSubcomputationMode::kNameOnly),
         print_metadata_(true),
+        print_backend_config_(true),
         compact_operands_(false),
         print_operand_shape_(true),
         print_program_shape_(true),
         print_percent_(true),
-        indent_amount_(0) {}
+        canonicalize_instruction_names_(false),
+        indent_amount_(0),
+        is_in_nested_computation_(false) {}
 
   static HloPrintOptions ShortParsable() {
     return HloPrintOptions()
         .set_print_large_constants(true)
-        .set_print_subcomputation_references(true)
+        .set_print_subcomputation_mode(PrintSubcomputationMode::kNameOnly)
         .set_print_metadata(false)
+        .set_print_backend_config(false)
         .set_print_operand_shape(false)
         .set_print_program_shape(false)
         .set_print_percent(false);
   }
 
+  // Options to produce the canonical string representing an isomorphic
+  // computation graph.
+  static HloPrintOptions Canonical() {
+    return HloPrintOptions()
+        .set_print_subcomputation_mode(PrintSubcomputationMode::kFullBodies)
+        .set_print_metadata(false)
+        .set_compact_operands(true)
+        .set_print_operand_shape(true)
+        .set_print_program_shape(false)
+        .set_print_percent(false)
+        .set_canonicalize_instruction_names(true);
+  }
+
   // If true, large constants will be printed out.
   HloPrintOptions& set_print_large_constants(bool value) {
     print_large_constants_ = value;
     return *this;
   }
 
-  // If true, the names of subcomputations (e.g. a fusion node's fused
-  // computation) won't be printed.  This makes the resulting text not parsable.
-  //
-  // A CustomCall's call target is printed even if
-  // print_subcomputation_references is false, because the call target isn't an
-  // HloComputation.
-  HloPrintOptions& set_print_subcomputation_references(bool value) {
-    print_subcomputation_references_ = value;
+  HloPrintOptions& set_print_subcomputation_mode(
+      PrintSubcomputationMode value) {
+    print_subcomputation_mode_ = value;
     return *this;
   }
 
-  // If true, metatdata will be printed.
+  // If true, metadata will be printed.
   HloPrintOptions& set_print_metadata(bool value) {
     print_metadata_ = value;
     return *this;
   }
 
+  // If true, backend_config will be printed.
+  HloPrintOptions& set_print_backend_config(bool value) {
+    print_backend_config_ = value;
+    return *this;
+  }
+
   // If true, operands' shapes will be printed.
   HloPrintOptions& set_print_operand_shape(bool value) {
     print_operand_shape_ = value;
@@ -130,54 +154,175 @@ class HloPrintOptions {
     return *this;
   }
 
+  // If true, canonicalizes instructions' name. Instead of using "%foo.1" as
+  // the name of an instruction, we use "%tmp_1", "%tmp_2" etc.
+  HloPrintOptions& set_canonicalize_instruction_names(bool value) {
+    canonicalize_instruction_names_ = value;
+    return *this;
+  }
+
   // The indent of the hlo text block.
   HloPrintOptions& set_indent_amount(int value) {
     indent_amount_ = value;
     return *this;
   }
 
+  // If true, indicates the instruction being printed is inside a nested
+  // computation.
+  HloPrintOptions& set_is_in_nested_computation(bool value) {
+    is_in_nested_computation_ = value;
+    return *this;
+  }
+
   bool print_large_constants() const { return print_large_constants_; }
-  bool print_subcomputation_references() const {
-    return print_subcomputation_references_;
+  PrintSubcomputationMode print_subcomputation_mode() const {
+    return print_subcomputation_mode_;
   }
   bool print_metadata() const { return print_metadata_; }
+  bool print_backend_config() const { return print_metadata_; }
   bool compact_operands() const { return compact_operands_; }
   bool print_operand_shape() const { return print_operand_shape_; }
   bool print_program_shape() const { return print_program_shape_; }
   bool print_percent() const { return print_percent_; }
+  bool canonicalize_instruction_names() const {
+    return canonicalize_instruction_names_;
+  }
   int indent_amount() const { return indent_amount_; }
+  int is_in_nested_computation() const { return is_in_nested_computation_; }
 
  private:
   bool print_large_constants_;
-  bool print_subcomputation_references_;
+  PrintSubcomputationMode print_subcomputation_mode_;
   bool print_metadata_;
+  bool print_backend_config_;
   bool compact_operands_;
   bool print_operand_shape_;
   bool print_program_shape_;
   bool print_percent_;
+  bool canonicalize_instruction_names_;
   int indent_amount_;
+  bool is_in_nested_computation_;
+};
+
+// For canonical string output, we need to have a canonical way to rename
+// each instruction and its operands. Each operand is renamed as "tmp_<xxx>",
+// where <xxx> is an index starting from 0.
+class CanonicalNameMap {
+ public:
+  CanonicalNameMap() : index(0) {}
+
+  string LookupOrInsert(const string& old_name) {
+    auto iter = canonical_name_map.find(old_name);
+    if (iter != canonical_name_map.end()) {
+      return iter->second;
+    }
+
+    string new_name = tensorflow::strings::StrCat("tmp_", index++);
+    canonical_name_map[old_name] = new_name;
+    return new_name;
+  }
+  void Clear() {
+    canonical_name_map.clear();
+    index = 0;
+  }
+
+ private:
+  int64 index;
+  tensorflow::gtl::FlatMap<string, string> canonical_name_map;
 };
 
-// HLO instructions are the IR used by the high-level compiler.
+// HLO instructions are the atomic unit of the high-level compiler's IR.
+//
+// HloInstructions live inside of an HloComputation, which is analogous to a
+// function in other programming languages.  Nodes have no total order within
+// their computation.  Instead, they have a partial ordering determined by their
+// data and control dependencies.
+//
+// HLO does not have basic blocks or explicit "branch" instructions.  Instead,
+// certain HloInstructions -- namely, kWhile, kConditional, and kCall -- encode
+// control flow.  For example, the kConditional HLO executes one of two possible
+// computations, depending on the runtime value of a predicate.
+//
+// HLO is pure (mostly).  It has no concept of mutable state.  Instead, data
+// values are produced by one HLO and flow into consumers across dependency
+// edges.
 class HloInstruction {
  public:
+  // A fusion node computes the same value a call to its fusion computation
+  // would compute.  However, the choice of fusion kind dictates codegen
+  // strategy for the backend.
+  //
+  // To generate code for a kFusion HloInstruction, most backends do something
+  // like the following:
+  //
+  // 1) Identify the "primary" HloInstruction of the fused computation.
+  // 2) Emit code that does the work of the primary node, creating its inputs
+  //    and transforming its outputs as specified by the fused computation.
+  //
+  // In step (2), the code emitted is usually similar to the code that would be
+  // emitted for an *unfused* version of the primary node, except that
+  //
+  //  - when the primary node reads an element of one of its operands, instead
+  //    of loading the value from memory, it *computes* the value based on the
+  //    contents of the fused computation.
+  //  - when the primary node outputs a value, instead of storing it to memory,
+  //    it forwards the value to its users, which then perform additional
+  //    computations before the value is finally stored to memory at the root of
+  //    the fusion node.
+  //
+  // An HloInstruction's FusionKind helps us find the kFusion instruction's
+  // primary node, and can also affect how we generate code in step (2).
+  //
+  //  - kInput: The primary node is the root of the fused instruction.
+  //
+  //  - kOutput: The primary node is not the root of the fused instruction.
+  //    This fusion kind requires that one operand buffer of the fusion
+  //    instruction be able to alias the output buffer.  This constraint is
+  //    usually enough to let backends find the primary node unambiguously.
+  //
+  //  - kLoop: The primary node is the root of the fused computation, but,
+  //    unlike in input fusion, we prescribe a specific implementation for
+  //    codegen.  Rather than generating code that looks like the code we'd emit
+  //    for an unfused version of the primary/root node, we emit code that
+  //    generates one element of the root at a time.
+  //
+  //  - kCustom: Custom category for backend-specific fusions that don't fit
+  //    into the above patterns.
+  //
+  // Not all backends support all fusion kinds, and given a particular fused
+  // computation, it's not in general safe to change its fusion kind.  Creation
+  // of fusion nodes is always backend-specific.
+  //
+  // For elementwise ops (e.g. kAdd), most backends would emit a
+  // one-element-at-a-time implementation for the unfused version, so loop
+  // fusion and input fusion are probably equivalent if the root node is
+  // elementwise.  They're not necessarily equivalent e.g. for kReduce, where an
+  // implementation might emit something more sophisticated for an unfused or
+  // input-fusion reduce, but will emit the naive code that reduces one element
+  // at a time for loop fusion with a reduce as the root.
+  //
+  // Another way to think of loop fusion is that it's equivalent to input
+  // fusion, but where the root node is an implicit identity node, whose
+  // unfused implementation is "read one element, write one element".
+  //
+  // TODO(b/79869434): This categorization scheme is not great.  For one thing,
+  // input and loop fusion are basically the same thing: There is no reason for
+  // the HLO to encode backend-specific decisions about how e.g. a reduce that's
+  // the root of a fusion should be lowered.  In addition, this scheme as
+  // written doesn't work for multi-output fusion, where the primary node is
+  // never actually the root (which is a kTuple instruction that gathers the
+  // multiple outputs of the fusion).
   enum class FusionKind {
-    kLoop,          // Fused into a loop.
-    kInput,         // Op's input is fused into the op itself.
-    kOutput,        // Op's output is fused into the op itself.
-                    // REQUIRES: At least one operand buffer must be able
-                    // to alias the output buffer.
-    kTransposeDot,  // Fused into a dot with transposed operands.
-    kCustom,        // Custom category for backend-specific fusions that
-                    // do not match any of the more specific ones.
+    kLoop,
+    kInput,
+    kOutput,
+    kCustom,
   };
 
   ~HloInstruction();
 
   // Creates an instruction from the given proto. Arguments:
   //
-  //   module: the module which will contain the instruction. The newly created
-  //     instruction is *not* added to the module or any computation, however.
   //   proto: the proto to convert from.
   //   instruction_map: a map from instruction id to HloInstruction*. This map
   //     must contain all operands of the newly constructed instruction.
@@ -185,7 +330,7 @@ class HloInstruction {
   //     must contain all computations which the newly constructed instruction
   //     calls.
   static StatusOr<std::unique_ptr<HloInstruction>> CreateFromProto(
-      HloModule* module, const HloInstructionProto& proto,
+      const HloInstructionProto& proto,
       const tensorflow::gtl::FlatMap<int64, HloInstruction*>& instruction_map,
       const tensorflow::gtl::FlatMap<int64, HloComputation*>& computation_map);
 
@@ -503,6 +648,10 @@ class HloInstruction {
   // Returns the opcode for this instruction.
   HloOpcode opcode() const { return opcode_; }
 
+  // Returns true if this instruction has a side effect, irrespective of whether
+  // any called computations may contain an instruction with side effects.
+  bool HasSideEffectNoRecurse() const;
+
   // Returns true if this instruction has a side effect. An instruction has a
   // side effect if it uses certain opcodes or calls a computation with a side
   // effect.
@@ -597,10 +746,8 @@ class HloInstruction {
     if (opcode() != other.opcode()) {
       return false;
     }
-    using EqShapeFuncType = bool (*)(const Shape&, const Shape&);
-    EqShapeFuncType eq_shapes =
-        layout_sensitive ? ShapeUtil::Equal : ShapeUtil::Compatible;
-    if (!eq_shapes(shape(), other.shape())) {
+    if (!(layout_sensitive ? ShapeUtil::Equal(shape(), other.shape())
+                           : ShapeUtil::Compatible(shape(), other.shape()))) {
       return false;
     }
     if (operands().size() != other.operands().size()) {
@@ -615,7 +762,7 @@ class HloInstruction {
       }
     }
 
-    return IdenticalSlowPath(other, eq_computations, eq_shapes);
+    return IdenticalSlowPath(other, eq_computations);
   }
 
   // Returns whether the instruction has a constant operand.
@@ -643,6 +790,8 @@ class HloInstruction {
   // Detaches an instruction from its operands. That is, remove the instruction
   // from each operand's user set. This should only be called prior to
   // deallocating the instruction.
+  //
+  // TODO(b/78305363): Make this automatic when deleting an instruction.
   void DetachFromOperands();
 
   // Performs a postorder DFS visit using this node as the root. If
@@ -695,6 +844,9 @@ class HloInstruction {
   // Note: only constant and parameter opcodes have an associated literal.
   const Literal& literal() const;
 
+  // Returns whether there is literal associated with this instruction.
+  bool HasLiteral() const;
+
   // Returns the parameter number associated with this instruction.
   //
   // Note: only parameter opcodes have an associated parameter number.
@@ -956,6 +1108,14 @@ class HloInstruction {
   void clear_sharding() { sharding_ = nullptr; }
   // Return true if this operator has a sharding assigned.
   bool has_sharding() const { return sharding_ != nullptr; }
+  // Checks whether the instruction has compatible sharding with the other
+  // instruction.
+  bool has_compatible_sharding(const HloInstruction* other) const {
+    if (!has_sharding()) {
+      return !other->has_sharding();
+    }
+    return other->has_sharding() ? sharding() == other->sharding() : false;
+  }
 
   // When creating a new instruction which either replaces, or shifts up (kCopy
   // insertion case), another instruction, we need to make sure the certain
@@ -1157,23 +1317,30 @@ class HloInstruction {
   // Precondition: opcode() == HloOpcode::kRng
   RandomDistribution random_distribution() const;
 
+  // See documentation for Clone().
+  using CloneMap = std::unordered_map<const HloInstruction*, HloInstruction*>;
+
   // Clones the HLO instruction. The clone will have the same opcode, shape, and
   // operands. After creation the clone has no uses. "this" (the instruction
   // cloned from) is not changed. Suffix is the string to append to the name of
-  // the instruction to form the name of the cloned instruction.  If the module
-  // pointer is not nullptr, it will be the module where the cloned computations
-  // will be added to (in order to support deep cloning).  Ignores the control
-  // predecessors and successors of this HLO instruction.
+  // the instruction to form the name of the cloned instruction. Ignores the
+  // control predecessors and successors of this HLO instruction.
+  //
+  // If the module pointer is not nullptr, then any cloned computations will be
+  // added to this module in order to support deep cloning. Otherwise the module
+  // of the instruction is used.
+  //
+  // If clone_map is not nullptr, then each original instruction that is cloned
+  // will be inserted and map to its clone. clone_map should not already contain
+  // any of the instructions to clone.
   std::unique_ptr<HloInstruction> Clone(const string& suffix = "clone",
-                                        HloModule* module = nullptr) const;
+                                        HloModule* module = nullptr,
+                                        CloneMap* clone_map = nullptr) const;
 
-  // Clones the HLO instruction as above but with new shape and operands.  If
-  // the module pointer is not nullptr, it will be the module where the cloned
-  // computations will be added to (in order to support deep cloning).  Ignores
-  // the control predecessors and successors of this HLO instruction.
+  // Clones the HLO instruction as above but with new shape and operands.
   std::unique_ptr<HloInstruction> CloneWithNewOperands(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-      HloModule* module = nullptr) const;
+      HloModule* module = nullptr, CloneMap* clone_map = nullptr) const;
 
   // Returns the computations this instruction directly calls (if any).
   const std::vector<HloComputation*>& called_computations() const {
@@ -1245,7 +1412,7 @@ class HloInstruction {
 
   // Gets/sets the string identifier for this instruction.
   const string& name() const { return name_; }
-  void set_name(tensorflow::StringPiece name) { name_ = name.ToString(); }
+  void set_name(tensorflow::StringPiece name) { name_ = std::string(name); }
 
   // Use the given NameUniquer to select a unique name for the instruction based
   // on the instruction's existing name.
@@ -1262,6 +1429,19 @@ class HloInstruction {
   // if no id has been assigned yet).
   int unique_id() const { return unique_id_; }
 
+  // Returns the backend-specific configuration for how a backend should compile
+  // this HLO. The meaning of the field is backend specific. Not for use before
+  // or during general HLO optimization, since HLO optimizations do not preserve
+  // this field and they cannot interpret it due to its meaning being backend
+  // specific.
+  //
+  // TODO(b/78194644): Introduce structured configuration format as per
+  // go/xla-heuristics.
+  const string& backend_config() const { return backend_config_; }
+  void set_backend_config(string backend_config) {
+    backend_config_ = std::move(backend_config);
+  }
+
   // Sets the debug metadata for this instruction.
   void set_metadata(const OpMetadata& metadata) { metadata_ = metadata; }
   const OpMetadata& metadata() const { return metadata_; }
@@ -1283,6 +1463,7 @@ class HloInstruction {
   // Get/Set the number of partitions per outer dimension (in order, starting
   // with outer-most dimension first). Currently used by the parallel cpu
   // backend to partition HLOs into parallel tasks.
+  //
   // TODO(b/62783254) Replace these methods with a more general way to
   // annotate HLOs with backend-specific information.
   const std::vector<int64>& outer_dimension_partitions() const {
@@ -1298,20 +1479,34 @@ class HloInstruction {
                         const ShapeIndex& shape_index = {});
 
  private:
+  // Prints an instruction to a string.
+  //
+  // The canonical string representation needs to name operands and instruction
+  // names in a consistent way. This is implemented through the
+  // canonical_name_map.
+  string ToStringWithCanonicalNameMap(
+      const HloPrintOptions& options,
+      CanonicalNameMap* canonical_name_map) const;
+
+  // Prints an operand to a string.
+  string OperandsToStringWithCanonicalNameMap(
+      const HloPrintOptions& options,
+      CanonicalNameMap* canonical_name_map) const;
+
+  // Allow HloInstruction to access the ToStringWithCanonicalNameMap() and
+  // OperandsToStringWithCanonicalNameMap() functions.
+  friend class HloComputation;
+
   enum class UseKind { kNoUse, kReuse, kUsePermutingElements, kUse };
 
   // Helper class for computing OperandElementUse for kFusion.
   class FusionReusesParamElements;
 
   // See comments on Identical().
-  // eq_shapes() is used to check shapes for equality, and would normally be
-  // expected to be ShapeUtil::Equals or ShapeUtil::Compatible, depending on
-  // whether we want a layout-sensitive check or not.
   bool IdenticalSlowPath(
       const HloInstruction& other,
       const std::function<bool(const HloComputation*, const HloComputation*)>&
-          eq_computations,
-      const std::function<bool(const Shape&, const Shape&)>& eq_shapes) const;
+          eq_computations) const;
 
   // Creates an n-ary elementwise operation.
   static std::unique_ptr<HloInstruction> CreateNary(
@@ -1510,6 +1705,10 @@ class HloInstruction {
   // The string representation of the infeed configuration.
   string infeed_config_;
 
+  // The backend-specific configuration for how a backend should compile this
+  // HLO. See the documentation on backend_config().
+  string backend_config_;
+
   // String identifier for instruction.
   string name_;
 
@@ -1540,13 +1739,20 @@ std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind);
 // an HloInstruction* or a const HloInstruction*.
 // To make the iteration order over the map deterministic, the comparator
 // should not be using the pointer values, but rather an intrinsic property of
-// the hlo.
+// the hlo. Exception: null pointer values compare less than non-null.
 //
 // Note that this cannot be used for HLO instructions across multiple modules
 // since the id of HLO instructions are only unique within each HLO module.
 struct HloPtrComparator {
   bool operator()(const HloInstruction* const& lhs,
                   const HloInstruction* const& rhs) const {
+    if (rhs == nullptr) {
+      // Nothing compares less than nullptr.
+      return false;
+    }
+    if (lhs == nullptr) {
+      return true;
+    }
     return lhs->unique_id() < rhs->unique_id();
   }
 };
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 5b65b1152c8298a8954890374626ae5329dccff9..a61c472c72804b077d21274d2e866a69c5e73157 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -1102,7 +1102,7 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
   auto module = CreateNewModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
-      {dot, reshape}, HloInstruction::FusionKind::kTransposeDot);
+      {dot, reshape}, HloInstruction::FusionKind::kLoop);
 
   auto fusion2 = fusion->Clone();
   const HloInstruction* root = fusion->fused_expression_root();
@@ -1169,7 +1169,7 @@ TEST_F(HloInstructionTest, NestedFusionEquality) {
   auto computation = module->AddEntryComputation(builder.Build());
 
   auto nested_fusion = computation->CreateFusionInstruction(
-      {dot, b_t}, HloInstruction::FusionKind::kTransposeDot);
+      {dot, b_t}, HloInstruction::FusionKind::kLoop);
 
   auto fusion = computation->CreateFusionInstruction(
       {add, nested_fusion}, HloInstruction::FusionKind::kOutput);
@@ -1246,13 +1246,6 @@ TEST_F(HloInstructionTest, Stringification) {
 
   auto module = CreateNewModule();
   auto* computation = module->AddEntryComputation(builder.Build());
-  HloInstruction* fusion = computation->CreateFusionInstruction(
-      {dot, reshape}, HloInstruction::FusionKind::kTransposeDot);
-
-  EXPECT_EQ(
-      fusion->ToString(options),
-      "%dot_fusion = f32[5,20]{1,0} fusion(f32[5,10]{1,0} %x, "
-      "f32[20,10]{1,0} %y), kind=kTransposeDot, calls=%fused_computation");
 
   HloInstruction* loop = builder.AddInstruction(
       HloInstruction::CreateWhile(sout, computation, computation, x));
@@ -1343,5 +1336,163 @@ TEST_F(HloInstructionTest, StringifyGather_1) {
             "index_vector_dim=2, window_bounds={30,29,28,27,26}");
 }
 
+TEST_F(HloInstructionTest, CanonnicalStringificationFusion) {
+  // Tests stringification of a simple op, fusion, while, and conditional.
+  const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
+  const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
+  const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20});
+  const Shape sout = ShapeUtil::MakeShape(F32, {5, 20});
+
+  HloComputation::Builder builder("TransposeDot");
+  HloInstruction* x =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x"));
+  HloInstruction* y =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
+  HloInstruction* reshape =
+      builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
+
+  auto options = HloPrintOptions().Canonical();
+
+  EXPECT_EQ(dot->ToString(options),
+            "f32[5,20]{1,0} dot(f32[5,10]{1,0}, f32[10,20]{1,0}), "
+            "lhs_contracting_dims={1}, rhs_contracting_dims={0}");
+
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
+  HloInstruction* fusion = computation->CreateFusionInstruction(
+      {dot, reshape}, HloInstruction::FusionKind::kLoop);
+
+  EXPECT_EQ(
+      fusion->ToString(options),
+      R"(f32[5,20]{1,0} fusion(f32[5,10]{1,0}, f32[20,10]{1,0}), kind=kLoop, calls=
+{
+  tmp_0 = f32[5,10]{1,0} parameter(0)
+  tmp_1 = f32[20,10]{1,0} parameter(1)
+  tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0}
+  ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})");
+}
+
+TEST_F(HloInstructionTest, CanonnicalStringificationWhile) {
+  // Tests stringification of a simple op, fusion, while, and conditional.
+  const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
+  const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
+  const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20});
+  const Shape sout = ShapeUtil::MakeShape(F32, {5, 20});
+
+  HloComputation::Builder builder("TransposeDot");
+  HloInstruction* x =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x"));
+  HloInstruction* y =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
+  HloInstruction* reshape =
+      builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
+
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
+  computation->CreateFusionInstruction({dot, reshape},
+                                       HloInstruction::FusionKind::kLoop);
+
+  HloInstruction* loop = builder.AddInstruction(
+      HloInstruction::CreateWhile(sout, computation, computation, x));
+
+  auto options = HloPrintOptions().Canonical();
+  EXPECT_EQ(loop->ToString(options),
+            R"(f32[5,20]{1,0} while(f32[5,10]{1,0}), condition=
+{
+  tmp_0 = f32[5,10]{1,0} parameter(0)
+  tmp_1 = f32[20,10]{1,0} parameter(1)
+  ROOT tmp_2 = f32[5,20]{1,0} fusion(f32[5,10]{1,0} tmp_0, f32[20,10]{1,0} tmp_1), kind=kLoop, calls=
+  {
+    tmp_0 = f32[5,10]{1,0} parameter(0)
+    tmp_1 = f32[20,10]{1,0} parameter(1)
+    tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0}
+    ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  }
+}, body=
+{
+  tmp_0 = f32[5,10]{1,0} parameter(0)
+  tmp_1 = f32[20,10]{1,0} parameter(1)
+  ROOT tmp_2 = f32[5,20]{1,0} fusion(f32[5,10]{1,0} tmp_0, f32[20,10]{1,0} tmp_1), kind=kLoop, calls=
+  {
+    tmp_0 = f32[5,10]{1,0} parameter(0)
+    tmp_1 = f32[20,10]{1,0} parameter(1)
+    tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0}
+    ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  }
+})");
+}
+
+TEST_F(HloInstructionTest, CanonnicalStringificationConditional) {
+  // Tests stringification of a simple op, fusion, while, and conditional.
+  const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
+  const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
+  const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20});
+  const Shape sout = ShapeUtil::MakeShape(F32, {5, 20});
+
+  HloComputation::Builder builder("TransposeDot");
+  HloInstruction* x =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x"));
+  HloInstruction* y =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
+  HloInstruction* reshape =
+      builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
+
+  auto module = CreateNewModule();
+  auto* computation = module->AddEntryComputation(builder.Build());
+  computation->CreateFusionInstruction({dot, reshape},
+                                       HloInstruction::FusionKind::kLoop);
+
+  builder.AddInstruction(
+      HloInstruction::CreateWhile(sout, computation, computation, x));
+
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  HloInstruction* conditional =
+      builder.AddInstruction(HloInstruction::CreateConditional(
+          sout, pred, x, computation, x, computation));
+  auto options = HloPrintOptions().Canonical();
+  EXPECT_EQ(
+      conditional->ToString(options),
+      R"(f32[5,20]{1,0} conditional(pred[], f32[5,10]{1,0}, f32[5,10]{1,0}), true_computation=
+{
+  tmp_0 = f32[5,10]{1,0} parameter(0)
+  tmp_1 = f32[20,10]{1,0} parameter(1)
+  ROOT tmp_2 = f32[5,20]{1,0} fusion(f32[5,10]{1,0} tmp_0, f32[20,10]{1,0} tmp_1), kind=kLoop, calls=
+  {
+    tmp_0 = f32[5,10]{1,0} parameter(0)
+    tmp_1 = f32[20,10]{1,0} parameter(1)
+    tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0}
+    ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  }
+}, false_computation=
+{
+  tmp_0 = f32[5,10]{1,0} parameter(0)
+  tmp_1 = f32[20,10]{1,0} parameter(1)
+  ROOT tmp_2 = f32[5,20]{1,0} fusion(f32[5,10]{1,0} tmp_0, f32[20,10]{1,0} tmp_1), kind=kLoop, calls=
+  {
+    tmp_0 = f32[5,10]{1,0} parameter(0)
+    tmp_1 = f32[20,10]{1,0} parameter(1)
+    tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0}
+    ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  }
+})");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
new file mode 100644
index 0000000000000000000000000000000000000000..43c41ece6efc4f9e8ca74f16e0f63d29abc4de4e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
@@ -0,0 +1,306 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_liveness_analysis.h"
+
+#include <deque>
+
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+using Worklist = std::deque<const HloInstruction*>;
+using Workset = std::unordered_set<const HloInstruction*>;
+
+namespace {
+
+void AddToWorklist(const HloInstruction* instruction, Worklist* worklist,
+                   Workset* workset) {
+  if (workset->count(instruction) == 0) {
+    worklist->push_back(instruction);
+    workset->insert(instruction);
+    VLOG(3) << "ADD instruction: " << instruction->name();
+  }
+}
+
+using VisitorFunction = std::function<void(const ShapeIndex& /*index*/)>;
+
+void ForEachLiveIndex(const ShapeTree<bool>& index_tree,
+                      const VisitorFunction& func) {
+  index_tree.ForEachElement([&](const ShapeIndex& shape_index, bool live) {
+    if (live) {
+      func(shape_index);
+    }
+  });
+}
+
+// Marks 'instruction' output live at 'shape_index'.
+// Adds to 'worklist' iff:
+// *) 'instruction' is not already on worklist.
+// *) 'shape_index' has not yet been visited.
+void MarkLiveAtIndex(const HloInstruction* instruction,
+                     const ShapeIndex& shape_index,
+                     HloLivenessAnalysis::HloIndexMap* live_index_map,
+                     Worklist* worklist, Workset* workset) {
+  auto it = live_index_map->find(instruction);
+  if (it == live_index_map->end()) {
+    auto it_added = live_index_map->emplace(
+        std::piecewise_construct, std::forward_as_tuple(instruction),
+        std::forward_as_tuple(instruction->shape(), /*init_value=*/false));
+    it = it_added.first;
+  }
+  if (it->second.element(shape_index) == false) {
+    AddToWorklist(instruction, worklist, workset);
+    *it->second.mutable_element(shape_index) = true;
+    VLOG(3) << "MARK instruction: " << instruction->name()
+            << " shape_index: " << shape_index.ToString();
+  }
+}
+
+// Marks 'instruction' live at all shape indices in its output.
+void MarkLiveAtAllIndices(const HloInstruction* instruction,
+                          HloLivenessAnalysis::HloIndexMap* live_index_map,
+                          Worklist* worklist, Workset* workset) {
+  bool add_to_worklist = false;
+  auto it = live_index_map->find(instruction);
+  if (it == live_index_map->end()) {
+    live_index_map->emplace(
+        std::piecewise_construct, std::forward_as_tuple(instruction),
+        std::forward_as_tuple(instruction->shape(), /*init_value=*/true));
+    add_to_worklist = true;
+  } else {
+    ShapeUtil::ForEachSubshape(
+        instruction->shape(),
+        [&](const Shape& sub_shape, const ShapeIndex& shape_index) {
+          if (it->second.element(shape_index) == false) {
+            add_to_worklist = true;
+            *it->second.mutable_element(shape_index) = true;
+            VLOG(3) << "MARK instruction: " << instruction->name()
+                    << " shape_index: " << shape_index.ToString();
+          }
+        });
+  }
+  if (add_to_worklist) {
+    AddToWorklist(instruction, worklist, workset);
+  }
+}
+
+// Propagates liveness through Tuple instructions.
+// *) For each tuple operand:
+//   *) For tuple output shape index associated with operand:
+//     *) Propgate live shape indices to tuple operand at the associated
+//        shape index in the operands output, and add to worklist.
+void PropagateLivenessThroughTuple(
+    const HloInstruction* instruction,
+    HloLivenessAnalysis::HloIndexMap* live_index_map, Worklist* worklist,
+    Workset* workset) {
+  CHECK_EQ(instruction->opcode(), HloOpcode::kTuple);
+  for (int64 operand_index = 0; operand_index < instruction->operand_count();
+       ++operand_index) {
+    const ShapeTree<bool>& index_tree = FindOrDie(*live_index_map, instruction);
+    ForEachLiveIndex(index_tree, [&](const ShapeIndex& shape_index) {
+      if (shape_index.empty() || shape_index[0] != operand_index) {
+        return;
+      }
+      // Mark top-level index of operand at 'operand_index'.
+      MarkLiveAtIndex(instruction->operand(operand_index), {}, live_index_map,
+                      worklist, workset);
+      // Mark sub-shape index of operand at 'operand_index'.
+      ShapeIndex operand_shape_index;
+      for (int i = 1; i < shape_index.size(); ++i) {
+        operand_shape_index.push_back(shape_index[i]);
+      }
+      MarkLiveAtIndex(instruction->operand(operand_index), operand_shape_index,
+                      live_index_map, worklist, workset);
+    });
+  }
+}
+
+// Propagates liveness through GetTupleElement instructions.
+// *) For each live index in GetTupleElement output, mark output of GTE operand
+//    at associated shape index in its output, and add to worklist.
+void PropagateLivenessThroughGTE(
+    const HloInstruction* instruction,
+    HloLivenessAnalysis::HloIndexMap* live_index_map, Worklist* worklist,
+    Workset* workset) {
+  CHECK_EQ(instruction->opcode(), HloOpcode::kGetTupleElement);
+  // Mark operand top-level index.
+  MarkLiveAtIndex(instruction->operand(0), {}, live_index_map, worklist,
+                  workset);
+  const ShapeTree<bool>& index_tree = FindOrDie(*live_index_map, instruction);
+  // Propagate live shape indices along GTE -> Tuple edge.
+  ForEachLiveIndex(index_tree, [&](const ShapeIndex& shape_index) {
+    ShapeIndex operand_shape_index(shape_index);
+    operand_shape_index.push_front(instruction->tuple_index());
+    MarkLiveAtIndex(instruction->operand(0), operand_shape_index,
+                    live_index_map, worklist, workset);
+  });
+}
+
+// Propagates liveness through While instructions.
+// *) For each live index in While output, mark shape index of while.body.root
+//    and while.operand (adding each to worklist).
+// *) Mark while.cond.root and add to worklist.
+void PropagateLivenessThroughWhile(
+    const HloInstruction* instruction,
+    HloLivenessAnalysis::HloIndexMap* live_index_map, Worklist* worklist,
+    Workset* workset) {
+  CHECK_EQ(instruction->opcode(), HloOpcode::kWhile);
+  const ShapeTree<bool>& index_tree = FindOrDie(*live_index_map, instruction);
+
+  ForEachLiveIndex(index_tree, [&](const ShapeIndex& shape_index) {
+    // Propagate liveness to while body computation root instruction.
+    MarkLiveAtIndex(instruction->while_body()->root_instruction(), shape_index,
+                    live_index_map, worklist, workset);
+    // Propagate liveness to tuple-shaped operand.
+    MarkLiveAtIndex(instruction->operand(0), shape_index, live_index_map,
+                    worklist, workset);
+  });
+
+  // Propagate liveness to while condition computation root instruction.
+  MarkLiveAtIndex(instruction->while_condition()->root_instruction(), {},
+                  live_index_map, worklist, workset);
+}
+
+// Propagates liveness out of Parameter instructions to callers and aliasing
+// positions. This can occur if liveness propagates to a parameter in the
+// while.condition computation, requiring liveness to propagate out to caller
+// callsite while (and while.body.root).
+void PropagateLivenessToParameterCallers(
+    const HloInstruction* instruction,
+    HloLivenessAnalysis::HloIndexMap* live_index_map, Worklist* worklist,
+    Workset* workset, CallGraph* call_graph) {
+  CHECK_EQ(instruction->opcode(), HloOpcode::kParameter);
+  const CallGraphNode& call_graph_node =
+      call_graph->GetNode(instruction->parent());
+  if (call_graph_node.context() == CallContext::kSequential) {
+    for (const CallSite& callsite : call_graph_node.caller_callsites()) {
+      if (callsite.instruction()->opcode() == HloOpcode::kWhile) {
+        auto* xla_while = callsite.instruction();
+        const ShapeTree<bool>& index_tree =
+            FindOrDie(*live_index_map, instruction);
+        ForEachLiveIndex(index_tree, [&](const ShapeIndex& shape_index) {
+          // Propagate liveness to while result{shape_index}
+          MarkLiveAtIndex(xla_while, shape_index, live_index_map, worklist,
+                          workset);
+          // Propagate liveness to while body root{shape_index}.
+          MarkLiveAtIndex(xla_while->while_body()->root_instruction(),
+                          shape_index, live_index_map, worklist, workset);
+          // Propagate liveness to operand(0){shape_index}.
+          MarkLiveAtIndex(xla_while->operand(0), shape_index, live_index_map,
+                          worklist, workset);
+        });
+      }
+    }
+  }
+}
+
+}  // namespace
+
+HloLivenessAnalysis::HloLivenessAnalysis(const HloModule& module)
+    : module_(module), call_graph_(CallGraph::Build(&module)) {}
+
+// Runs liveness analysis on 'module_'.
+// Initializes worklist with entry root instruction (and any instruction with
+// side-effects), marking all of their output shape indices live.
+// Visits elements on worklist, propagating liveness from an instructions
+// live output shape indices to its called computations and operands.
+void HloLivenessAnalysis::RunAnalysis() {
+  Worklist worklist;
+  Workset workset;
+  // Add entry compuation root instruction.
+  MarkLiveAtAllIndices(module_.entry_computation()->root_instruction(),
+                       &live_index_map_, &worklist, &workset);
+  for (auto* computation : module_.computations()) {
+    for (auto* instruction : computation->instructions()) {
+      if (instruction->HasSideEffectNoRecurse()) {
+        // Add instructions with side effects.
+        MarkLiveAtAllIndices(instruction, &live_index_map_, &worklist,
+                             &workset);
+      }
+    }
+  }
+
+  while (!worklist.empty()) {
+    const HloInstruction* instruction = worklist.front();
+    worklist.pop_front();
+    workset.erase(workset.find(instruction));
+    VLOG(1) << "VISIT instruction: " << instruction->name();
+
+    if (instruction->opcode() == HloOpcode::kTuple) {
+      PropagateLivenessThroughTuple(instruction, &live_index_map_, &worklist,
+                                    &workset);
+    } else if (instruction->opcode() == HloOpcode::kGetTupleElement) {
+      PropagateLivenessThroughGTE(instruction, &live_index_map_, &worklist,
+                                  &workset);
+    } else if (instruction->opcode() == HloOpcode::kWhile &&
+               ShapeUtil::IsTuple(instruction->shape())) {
+      PropagateLivenessThroughWhile(instruction, &live_index_map_, &worklist,
+                                    &workset);
+    } else if (instruction->opcode() == HloOpcode::kParameter &&
+               ShapeUtil::IsTuple(instruction->shape())) {
+      PropagateLivenessToParameterCallers(instruction, &live_index_map_,
+                                          &worklist, &workset,
+                                          call_graph_.get());
+    } else {
+      // Propagate liveness to called computations.
+      for (auto* called_computation : instruction->called_computations()) {
+        MarkLiveAtAllIndices(called_computation->root_instruction(),
+                             &live_index_map_, &worklist, &workset);
+      }
+      // Propagate liveness to operands.
+      for (HloInstruction* operand : instruction->operands()) {
+        MarkLiveAtAllIndices(operand, &live_index_map_, &worklist, &workset);
+      }
+    }
+  }
+}
+
+bool HloLivenessAnalysis::IsLive(const HloInstruction* instruction,
+                                 const ShapeIndex& shape_index) const {
+  if (ContainsKey(live_index_map_, instruction)) {
+    return FindOrDie(live_index_map_, instruction).element(shape_index);
+  }
+  return false;
+}
+
+/* static */
+StatusOr<std::unique_ptr<HloLivenessAnalysis>> HloLivenessAnalysis::Run(
+    const HloModule& module) {
+  VLOG(1) << "HloLivenessAnalysis::Run on module " << module.name();
+  XLA_VLOG_LINES(2, module.ToString());
+
+  auto liveness_analysis = WrapUnique(new HloLivenessAnalysis(module));
+
+  liveness_analysis->RunAnalysis();
+
+  return std::move(liveness_analysis);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis.h b/tensorflow/compiler/xla/service/hlo_liveness_analysis.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe55a8070a42a3d68836dd32cf7ce5823dd77951
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis.h
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LIVENESS_ANALYSIS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LIVENESS_ANALYSIS_H_
+
+#include <unordered_map>
+
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_value.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Analysis which identifies all live {HloInstruction, ShapeIndex} pairs in
+// an HLO module.
+//
+// HloLivenessAnalysis marks the shape index of each live output of each
+// instruction in the module, by propagating live shape index information
+// from an instruction to its called computations and operands.
+class HloLivenessAnalysis {
+ public:
+  // Maps from an HloInstruction to its live/dead output shape indices.
+  using HloIndexMap =
+      std::unordered_map<const HloInstruction*, ShapeTree<bool>>;
+
+  // Runs liveness analysis on 'module'. Returns HloLivenessAnalysis object
+  // which exports liveness for each {HloInstruction, ShapeIndex} in 'module'.
+  static StatusOr<std::unique_ptr<HloLivenessAnalysis>> Run(
+      const HloModule& module);
+
+  // Returns true if output of 'instruction' at 'shape_index' is live.
+  // Returns false otherwise.
+  bool IsLive(const HloInstruction* instruction,
+              const ShapeIndex& shape_index) const;
+
+ private:
+  HloLivenessAnalysis(const HloModule& module);
+
+  void RunAnalysis();
+
+  const HloModule& module_;
+  std::unique_ptr<CallGraph> call_graph_;
+  HloIndexMap live_index_map_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LIVENESS_ANALYSIS_H_
diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e2e2c7627ba6ac9e5078446056917a07436cbd7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
@@ -0,0 +1,402 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_liveness_analysis.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class HloLivenessAnalysisTest : public HloTestBase {
+ protected:
+  HloLivenessAnalysisTest() {}
+
+  // Run liveness analysis on the member module. For convenience returns a
+  // reference to the generated analysis stored in analysis_.
+  const HloLivenessAnalysis& RunLiveness(HloModule* module) {
+    liveness_ = HloLivenessAnalysis::Run(*module).ConsumeValueOrDie();
+    return *liveness_;
+  }
+
+  HloInstruction* GetInstruction(HloModule* module, const string& name) {
+    HloInstruction* to_return = nullptr;
+    for (auto* comp : module->computations()) {
+      for (auto* inst : comp->instructions()) {
+        if (inst->name() == name) {
+          to_return = inst;
+          break;
+        }
+      }
+    }
+    return CHECK_NOTNULL(to_return);
+  }
+
+  std::unique_ptr<HloLivenessAnalysis> liveness_;
+};
+
+// Test that add instruction at entry root is live at all output shape indices.
+TEST_F(HloLivenessAnalysisTest, AddAtEntryRoot) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(0)
+    constant.2 = s32[] constant(1)
+    ROOT add = s32[] add(constant.1, constant.2)
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "add"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+}
+
+// Test that a dead add instruction is marked as dead by analysis.
+TEST_F(HloLivenessAnalysisTest, DeadAdd) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(0)
+    constant.2 = s32[] constant(1)
+    add.1 = s32[] add(constant.1, constant.2)
+    ROOT add.2 = s32[] add(constant.1, constant.2)
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "add.2"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "add.1"), {}));
+}
+
+// Test that all output shape indices of entry root tuple (and defining
+// instruction in its output) are marked live.
+TEST_F(HloLivenessAnalysisTest, TupleAtEntryRoot) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(0)
+    constant.2 = s32[] constant(1)
+    ROOT tuple.1 = (s32[], s32[]) tuple(constant.1, constant.2)
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+}
+
+// Tests that all outputs of nested tuple and entry root (and defining
+// instruction values appearing in its output) are marked live.
+TEST_F(HloLivenessAnalysisTest, NestedTupleAtEntryRoot) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(1)
+    constant.2 = s32[] constant(2)
+    constant.3 = s32[] constant(3)
+    tuple.1 = (s32[], s32[]) tuple(constant.2, constant.3)
+    ROOT tuple.2 = (s32[], s32[]) tuple(constant.1, tuple.1)
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.3"), {}));
+}
+
+// Tests that GTE at entry root of Tuple instruction only propgates liveness
+// to the live elements in tuple.
+TEST_F(HloLivenessAnalysisTest, GteOfTuple) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(0)
+    constant.2 = s32[] constant(1)
+    tuple.1 = (s32[], s32[]) tuple(constant.1, constant.2)
+    ROOT get-tuple-element.1 = s32[] get-tuple-element(tuple.1), index=0
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(
+      liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+}
+
+// Tests that GTE at entry root of nested Tuple instruction only propgates
+// liveness to the live elements in tuple.
+TEST_F(HloLivenessAnalysisTest, GteOfNestedTuple) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(0)
+    constant.2 = s32[] constant(1)
+    constant.3 = s32[] constant(2)
+    tuple.1 = (s32[], s32[]) tuple(constant.2, constant.3)
+    tuple.2 = (s32[], s32[]) tuple(constant.1, tuple.1)
+    ROOT get-tuple-element.1 = (s32[], s32[]) get-tuple-element(tuple.2), index=1
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(
+      liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(
+      GetInstruction(module.get(), "get-tuple-element.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(
+      GetInstruction(module.get(), "get-tuple-element.1"), {1}));
+
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 1}));
+
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.3"), {}));
+}
+
+// Tests that GTE of GTE (at entry root) of nested Tuple instruction only
+// propgates liveness to the live elements in tuple.
+TEST_F(HloLivenessAnalysisTest, GteOfGteOfNestedTuple) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(0)
+    constant.2 = s32[] constant(1)
+    constant.3 = s32[] constant(2)
+    tuple.1 = (s32[], s32[]) tuple(constant.2, constant.3)
+    tuple.2 = (s32[], s32[]) tuple(constant.1, tuple.1)
+    get-tuple-element.1 = (s32[], s32[]) get-tuple-element(tuple.2), index=1
+    ROOT get-tuple-element.2 = s32[] get-tuple-element(get-tuple-element.1), index=0
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(
+      liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.2"), {}));
+
+  EXPECT_TRUE(
+      liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(
+      GetInstruction(module.get(), "get-tuple-element.1"), {0}));
+  EXPECT_FALSE(liveness.IsLive(
+      GetInstruction(module.get(), "get-tuple-element.1"), {1}));
+
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 0}));
+  EXPECT_FALSE(
+      liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 1}));
+
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "constant.3"), {}));
+}
+
+// Test that live/dead while tuple elements are marked live/dead correctly.
+TEST_F(HloLivenessAnalysisTest, WhileWithDeadTupleElement) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    constant.1 = s32[] constant(1)
+    add.0 = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1
+    multiply.0 = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2)
+    ROOT tuple.0 = (s32[], s32[3]{0}) tuple(add.0, multiply.0)
+  }
+  SimpleLoop.condition {
+    loop_var.2 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
+    constant.2 = s32[] constant(5)
+    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+  }
+  ENTRY SimpleLoop {
+    constant.3 = s32[] constant(0)
+    constant.4 = s32[3]{0} constant({0, 1, 2})
+    tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4)
+    while.0 = (s32[], s32[3]{0}) while(tuple.1), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+    ROOT get-tuple-element.4 = s32[] get-tuple-element(while.0), index=0
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(
+      liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.4"), {}));
+
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {0}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {1}));
+
+  // While operand.
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.3"), {}));
+
+  // While body.
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {0}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "add.0"), {}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "multiply.0"), {}));
+}
+
+// Tests that a tuple element live in while.cond computation, propagates
+// liveness to while.body.root/while.result/while.operand (where it is unused).
+TEST_F(HloLivenessAnalysisTest, WhileCondPropagatesLiveness) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    constant.1 = s32[] constant(1)
+    add.0 = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1
+    multiply.0 = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2)
+    ROOT tuple.0 = (s32[], s32[3]{0}) tuple(add.0, multiply.0)
+  }
+  SimpleLoop.condition {
+    loop_var.2 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
+    get-tuple-element.4 = s32[] get-tuple-element(loop_var.2), index=1
+    add.1 = s32[] add(get-tuple-element.3, get-tuple-element.4)
+    constant.2 = s32[] constant(5)
+    ROOT less-than = pred[] less-than(add.1, constant.2)
+  }
+  ENTRY SimpleLoop {
+    constant.3 = s32[] constant(0)
+    constant.4 = s32[3]{0} constant({0, 1, 2})
+    tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4)
+    while.0 = (s32[], s32[3]{0}) while(tuple.1), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+    ROOT get-tuple-element.5 = s32[] get-tuple-element(while.0), index=0
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(
+      liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.5"), {}));
+
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {1}));
+
+  // While operand.
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.3"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.4"), {}));
+
+  // While body.
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "add.0"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "multiply.0"), {}));
+}
+
+// Tests that a use of while.result{0} propagates liveness to
+// while.body.param{1} to while.body.root{1}, and then to while.body.param{2}.
+TEST_F(HloLivenessAnalysisTest, WhileWithLiveTupleElements) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s32[], s32[], s32[]) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    get-tuple-element.2 = s32[] get-tuple-element(loop_var.1), index=1
+    add.1 = s32[] add(get-tuple-element.1, get-tuple-element.2)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.1), index=2
+    multiply.1 = s32[] multiply(get-tuple-element.3, get-tuple-element.3)
+    ROOT tuple.1 = (s32[], s32[], s32[]) tuple(add.1, get-tuple-element.3, multiply.1)
+  }
+  SimpleLoop.condition {
+    loop_var.2 = (s32[], s32[], s32[]) parameter(0)
+    get-tuple-element.4 = s32[] get-tuple-element(loop_var.2), index=0
+    constant.1 = s32[] constant(5)
+    ROOT less-than = pred[] less-than(get-tuple-element.4, constant.1)
+  }
+  ENTRY SimpleLoop {
+    constant.2 = s32[] constant(0)
+    constant.3 = s32[] constant(1)
+    constant.4 = s32[] constant(2)
+    tuple.2 = (s32[], s32[], s32[]) tuple(constant.2, constant.3, constant.4)
+    while.1 = (s32[], s32[], s32[]) while(tuple.2), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+    ROOT get-tuple-element.5 = s32[] get-tuple-element(while.1), index=0
+  })")
+                    .ValueOrDie();
+
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(
+      liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.5"), {}));
+
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.1"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.1"), {2}));
+  // While operand.
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {2}));
+  // While body root.
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {2}));
+  // While body param.
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "loop_var.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "loop_var.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "loop_var.1"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "loop_var.1"), {2}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index 69deac263ee58f9e4d46987a54f09b11d650950a..7e4b8834357d39099f76450b849d6b5624e4e3b4 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -17,10 +17,13 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace xla {
 namespace testing {
 
+using ::tensorflow::str_util::Join;
+
 bool HloMatcher::MatchAndExplain(
     const HloInstruction* instruction,
     ::testing::MatchResultListener* listener) const {
@@ -195,6 +198,41 @@ void HloShardingMatcher::DescribeTo(std::ostream* os) const {
   }
 }
 
+bool HloDotWithContractingDimsMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (!HloMatcher::MatchAndExplain(instruction, listener)) {
+    return false;
+  }
+
+  const DotDimensionNumbers& dim_nums = instruction->dot_dimension_numbers();
+  if (dim_nums.lhs_contracting_dimensions_size() != 1 ||
+      dim_nums.lhs_contracting_dimensions(0) != lhs_contracting_dim_) {
+    *listener << instruction->ToString()
+              << " has wrong lhs_contracting_dimensions (got {"
+              << Join(dim_nums.lhs_contracting_dimensions(), ",") << "} want {"
+              << lhs_contracting_dim_ << "})";
+    return false;
+  }
+
+  if (dim_nums.rhs_contracting_dimensions_size() != 1 ||
+      dim_nums.rhs_contracting_dimensions(0) != rhs_contracting_dim_) {
+    *listener << instruction->ToString()
+              << " has wrong rhs_contracting_dimensions (got {"
+              << Join(dim_nums.rhs_contracting_dimensions(), ",") << "} want {"
+              << rhs_contracting_dim_ << "})";
+    return false;
+  }
+
+  return true;
+}
+
+void HloDotWithContractingDimsMatcher::DescribeTo(std::ostream* os) const {
+  HloMatcher::DescribeTo(os);
+  *os << " with lhs_contracting_dims={" << lhs_contracting_dim_
+      << "} and rhs_contracting_dims={" << rhs_contracting_dim_ << "}";
+}
+
 }  // namespace testing
 
 void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 5175736a2506c85836577a7f2ba2359a3d5a6b18..c33bdadf1c7145bf2aff09b01423c6c21382da0c 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -131,6 +131,27 @@ class HloShardingMatcher
   tensorflow::gtl::optional<HloSharding> sharding_;
 };
 
+// Matches a Dot HLO instruction with specific LHS and RHS contracting
+// dimensions.
+class HloDotWithContractingDimsMatcher : public HloMatcher {
+ public:
+  explicit HloDotWithContractingDimsMatcher(
+      ::testing::Matcher<const HloInstruction*> lhs,
+      ::testing::Matcher<const HloInstruction*> rhs, int64 lhs_contracting_dim,
+      int64 rhs_contracting_dim)
+      : HloMatcher(HloOpcode::kDot, /*operands=*/{lhs, rhs}),
+        lhs_contracting_dim_(lhs_contracting_dim),
+        rhs_contracting_dim_(rhs_contracting_dim) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  int64 lhs_contracting_dim_;
+  int64 rhs_contracting_dim_;
+};
+
 // HloInstruction* matchers for opcode and operands. Example:
 //   namespace op = xla::opcode_matchers;
 //   EXPECT_THAT(instruction,
@@ -158,7 +179,6 @@ HLO_MATCHER(Convolution);
 HLO_MATCHER(Copy);
 HLO_MATCHER(CrossReplicaSum);
 HLO_MATCHER(Divide);
-HLO_MATCHER(Dot);
 HLO_MATCHER(DynamicSlice);
 HLO_MATCHER(DynamicUpdateSlice);
 HLO_MATCHER(Eq);
@@ -310,6 +330,30 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> NoSharding() {
       new ::xla::testing::HloShardingMatcher(tensorflow::gtl::nullopt));
 }
 
+inline ::testing::Matcher<const ::xla::HloInstruction*> Dot(
+    ::testing::Matcher<const HloInstruction*> lhs_matcher,
+    ::testing::Matcher<const HloInstruction*> rhs_matcher) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloMatcher(
+      ::xla::HloOpcode::kDot, {lhs_matcher, rhs_matcher}));
+}
+
+// Matches a Dot HLO instruction if it has exactly one lhs contracting dimension
+// equal to `lhs_contracting_dim` and exactly one rhs contracting dimension
+// equal to `rhs_contracting_dim`.
+//
+// Currently the HLO verifier rejects Dot operations with more than one
+// contracting dimension (even though we can represent these in the
+// DotDimensionNumbers proto) so there is no need to generalize this to support
+// multiple contracting dimensions.
+inline ::testing::Matcher<const ::xla::HloInstruction*> Dot(
+    ::testing::Matcher<const HloInstruction*> lhs_matcher,
+    ::testing::Matcher<const HloInstruction*> rhs_matcher,
+    int64 lhs_contracting_dim, int64 rhs_contracting_dim) {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloDotWithContractingDimsMatcher(
+          lhs_matcher, rhs_matcher, lhs_contracting_dim, rhs_contracting_dim));
+}
+
 #undef HLO_MATCHER
 }  // namespace opcode_matchers
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index f2463060b7cd653dffb408f8df17f44fe0c1a97c..016cc01e33840aa195dfc0a21e8ac8f3d24a3e06 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace op = xla::testing::opcode_matchers;
 using ::testing::_;
@@ -165,5 +166,41 @@ TEST(HloMatchersTest, ShardingMatcher) {
               "has incorrect sharding (expected: {maximal device=0})");
 }
 
+TEST(HloMatchersTest, DotMatcher) {
+  string hlo_string = R"(
+HloModule DotOperationFusion_TransposeFusion
+
+ENTRY DotOperationFusion_TransposeFusion {
+  arg0 = f32[1,256] parameter(0)
+  arg1 = f32[256,1024] parameter(1)
+  ROOT dot = f32[1,1024] dot(arg0, arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+  HloInstruction* root = module->entry_computation()->root_instruction();
+
+  EXPECT_THAT(root, op::Dot(op::Parameter(0), op::Parameter(1),
+                            /*lhs_contracting_dim=*/1,
+                            /*rhs_contracting_dim=*/0));
+
+  EXPECT_THAT(
+      Explain(root, op::Dot(op::Parameter(0), op::Parameter(1),
+                            /*lhs_contracting_dim=*/0,
+                            /*rhs_contracting_dim=*/0)),
+      "%dot = f32[1,1024]{1,0} dot(f32[1,256]{1,0} %arg0, f32[256,1024]{1,0} "
+      "%arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0} has wrong "
+      "lhs_contracting_dimensions (got {1} want {0})");
+
+  EXPECT_THAT(
+      Explain(root, op::Dot(op::Parameter(0), op::Parameter(1),
+                            /*lhs_contracting_dim=*/1,
+                            /*rhs_contracting_dim=*/1)),
+      "%dot = f32[1,1024]{1,0} dot(f32[1,256]{1,0} %arg0, f32[256,1024]{1,0} "
+      "%arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0} has wrong "
+      "rhs_contracting_dimensions (got {0} want {1})");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index c7a719286753914ff39dc1fd528e74bc7fab2d7b..fbf1d58007e318a8a08aa9e11d9d54811533703e 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -46,6 +46,18 @@ HloModule::HloModule(const string& name, const HloModuleConfig& config)
       config_(config),
       unique_id_(next_unique_module_id_++) {}
 
+StatusOr<HloInstruction*> HloModule::LaunderConstInstructionFromModule(
+    const HloInstruction* hlo) {
+  if (hlo == nullptr) {
+    return nullptr;
+  }
+
+  TF_RET_CHECK(hlo->GetModule() == this);
+
+  // TODO(b/78350259): Eliminate const laundering.
+  return const_cast<HloInstruction*>(hlo);
+}
+
 HloComputation* HloModule::AddComputationInternal(
     std::unique_ptr<HloComputation> computation, bool is_entry,
     bool uniquify_names) {
@@ -254,24 +266,44 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
       << ShapeUtil::HumanStringWithLayout(expected_program_shape.result())
       << ", actual: " << ShapeUtil::HumanStringWithLayout(result_shape);
 
-  auto module = MakeUnique<HloModule>(proto.name(), entry_computation_handle,
-                                      module_config);
-
   tensorflow::gtl::FlatMap<int64, HloComputation*> computation_map;
+  tensorflow::gtl::FlatMap<HloComputation*, int64> to_proto_id;
+  std::vector<std::unique_ptr<HloComputation>> computations;
+  HloComputation* entry = nullptr;
   for (const HloComputationProto& computation_proto : proto.computations()) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> computation,
-                        HloComputation::CreateFromProto(
-                            module.get(), computation_proto, computation_map));
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloComputation> computation,
+        HloComputation::CreateFromProto(computation_proto, computation_map));
     CHECK_NE(computation.get(), nullptr);
     int64 computation_id = computation_proto.id();
     TF_RET_CHECK(computation_id != -1);
     TF_RET_CHECK(!ContainsKey(computation_map, computation_id));
+    computation_map[computation_id] = computation.get();
+    to_proto_id[computation.get()] = computation_id;
+    if (computation_id == proto.entry_computation_id()) {
+      entry = computation.get();
+    }
+    computations.push_back(std::move(computation));
+  }
+  TF_RET_CHECK(entry != nullptr);
+
+  auto module = MakeUnique<HloModule>(proto.name(), entry_computation_handle,
+                                      module_config);
+
+  // Sort the computations in the proto id's order.
+  std::sort(computations.begin(), computations.end(),
+            [&](const std::unique_ptr<HloComputation>& a,
+                const std::unique_ptr<HloComputation>& b) {
+              return to_proto_id[a.get()] < to_proto_id[b.get()];
+            });
+
+  // Add sorted computations to the module.
+  for (auto& computation : computations) {
+    bool is_entry = computation.get() == entry;
     // Don't uniquify names because we want names to be stable across
     // serialization and deserialization.
-    computation_map[computation_id] = module->AddComputationInternal(
-        std::move(computation),
-        /*is_entry=*/proto.entry_computation_id() == computation_id,
-        /*uniquify_names=*/false);
+    module->AddComputationInternal(std::move(computation), is_entry,
+                                   /*uniquify_names=*/false);
   }
   TF_RET_CHECK(module->entry_computation_ != nullptr);
 
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index f9674df812dbbc9c5a99c7c57a18b800f23ee36f..02918c377776b73f2086fe41afc406567a12af4c 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -42,10 +42,18 @@ namespace xla {
 
 // Describes a compilation unit at the HLO level.
 //
-// A HLO module contains one or more HLO computations. The module contains one
-// "entry" computation which produces the result. The module also includes any
-// embedded computations used by instructions such as "map" and "reduce". All
-// computations are owned by the module.
+// HloModule is the top-level unit in the HLO IR.  It corresponds to a whole
+// "program".  Running a module, from beginning to end, is the only way to run
+// an XLA program.
+//
+// A module contains one "entry computation"; this HloComputation is like main()
+// in a C program.  The result of running the module is the result of running
+// this computation.
+//
+// A module also contains some number of "nested computations".  Each nested
+// computation is attached to an HloInstruction within some other computation.
+// The meaning of the nested computation depends on the instruction it's
+// attached to.
 class HloModule {
  public:
   HloModule(const string& name,
@@ -217,6 +225,25 @@ class HloModule {
   // the lifetime of this process.
   int unique_id() const { return unique_id_; }
 
+  // Returns a non-const version of the passed-in const HloInstruction*. This is
+  // safe on the argument that if you have a non-const module, then you can
+  // access all instructions in the module as non-const.
+  //
+  // Returns an error if the passed-in instruction is not from this module,
+  // except that it is allowed to pass in a null pointer.
+  //
+  // TODO(b/78350259): Eliminate const laundering. The argument above is not
+  // reliable since at any time someone could add or discover a way for a
+  // non-const module to transitively contain a const HloInstruction. The
+  // reliable way to do this would be to create a const laundering map from a
+  // module, mapping each encountered HloInstruction to its non-const version
+  // and then look up each instruction in need of laundering in that map, but
+  // this is much more expensive and complicated. This returns a Status instead
+  // of doing a CHECK-failure in part to make it strongly apparent that this is
+  // something that can fail.
+  StatusOr<HloInstruction*> LaunderConstInstructionFromModule(
+      const HloInstruction* hlo);
+
  private:
   HloComputation* AddComputationInternal(
       std::unique_ptr<HloComputation> computation, bool is_entry,
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce.cc b/tensorflow/compiler/xla/service/hlo_module_dce.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98d20315e399c6b1a3979b5d11a89ef93869f4d9
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_module_dce.cc
@@ -0,0 +1,131 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_module_dce.h"
+
+#include <deque>
+#include <unordered_set>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_liveness_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace {
+
+bool HasSendRecv(HloComputation* computation) {
+  for (auto* instruction : computation->instructions()) {
+    if (instruction->opcode() == HloOpcode::kSend ||
+        instruction->opcode() == HloOpcode::kSendDone ||
+        instruction->opcode() == HloOpcode::kRecv ||
+        instruction->opcode() == HloOpcode::kRecvDone) {
+      return true;
+    }
+    for (auto* sub_computation : instruction->called_computations()) {
+      if (HasSendRecv(sub_computation)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+StatusOr<bool> RunWhileDCE(HloModule* module, HloLivenessAnalysis* liveness) {
+  bool changed = false;
+  for (auto* computation : module->computations()) {
+    for (auto* instruction : computation->instructions()) {
+      if (instruction->opcode() != HloOpcode::kWhile) {
+        continue;
+      }
+
+      const auto* xla_while = instruction;
+      auto* while_body_comp = xla_while->while_body();
+      auto* while_body_param = while_body_comp->parameter_instruction(0);
+      auto* while_body_root = while_body_comp->root_instruction();
+
+      if (!ShapeUtil::IsTuple(xla_while->shape()) ||
+          while_body_root->opcode() != HloOpcode::kTuple ||
+          HasSendRecv(while_body_comp)) {
+        // Only run DCE on tuple-shaped while loops where body root is Tuple,
+        // with no send/recv instructions.
+        VLOG(1) << "WhileDCE SKIP while: " << xla_while->ToString();
+        continue;
+      }
+
+      // Remove dead tuple elements.
+      const int64 tuple_element_count =
+          ShapeUtil::TupleElementCount(xla_while->shape());
+      for (int64 i = 0; i < tuple_element_count; ++i) {
+        if (liveness->IsLive(xla_while, {i})) {
+          continue;
+        }
+        VLOG(1) << "WhileDCE Dead while tuple element."
+                << " while: " << xla_while->name() << " tuple_index: " << i;
+        // Transform while.body computation to make tuple element at
+        // 'shape_index' as simple pass-through parameter (which candidate
+        // be removed later by simplification pass).
+        HloInstruction* pass_thru_gte = while_body_comp->AddInstruction(
+            HloInstruction::CreateGetTupleElement(
+                while_body_param->shape().tuple_shapes(i), while_body_param,
+                i));
+        // Replace while.body.root Tuple operand at 'tuple_index' with
+        // 'pass_thru_gte', making prior operand a dead root (to be cleaned
+        // up with a subsequent DCE pass).
+        TF_RETURN_IF_ERROR(
+            while_body_root->ReplaceOperandWith(i, pass_thru_gte));
+        changed = true;
+      }
+    }
+  }
+  return changed;
+}
+
+}  // namespace
+
+StatusOr<bool> HloModuleDCE::Run(HloModule* module) {
+  VLOG(2) << "Before HloModuleDCE:";
+  XLA_VLOG_LINES(3, module->ToString());
+
+  std::unique_ptr<HloLivenessAnalysis> liveness;
+  TF_ASSIGN_OR_RETURN(liveness, HloLivenessAnalysis::Run(*module));
+
+  // Sweep through while instructions, transforming dead while tuple element
+  // computations to pass through tuple values (creating dead roots in while
+  // body computation in the process).
+  TF_ASSIGN_OR_RETURN(bool hlo_module_dce_changed,
+                      RunWhileDCE(module, liveness.get()));
+
+  // Run HloDCE to clean up any dead code created during HloModuleDCE.
+  HloDCE hlo_dce;
+  TF_ASSIGN_OR_RETURN(bool hlo_dce_changed, hlo_dce.Run(module));
+
+  VLOG(2) << "After HloModuleDCE:";
+  XLA_VLOG_LINES(3, module->ToString());
+
+  return hlo_module_dce_changed | hlo_dce_changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce.h b/tensorflow/compiler/xla/service/hlo_module_dce.h
new file mode 100644
index 0000000000000000000000000000000000000000..29024085c1038961ef2b3721de1ce0e8a55ccf45
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_module_dce.h
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_DCE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_DCE_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// HLO pass which removes dead code from computations in the module using
+// HloModule-scoped analysis (HloLivenessAnalysis).
+//
+// Sweeps through live instructions which cross computation boundaries (kWhile),
+// and removes code at dead shape indices.
+//
+class HloModuleDCE : public HloPassInterface {
+ public:
+  ~HloModuleDCE() override {}
+  tensorflow::StringPiece name() const override { return "hlo-module-dce"; }
+
+  // Run the pass on the given module. Returns whether the module was changed
+  // (instructions were removed).
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_DCE_H_
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..53b7d0ed3964ca8a2c3bb73c62015a1c7dbfe487
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
@@ -0,0 +1,371 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_module_dce.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class HloModuleDceTest : public HloTestBase {
+ protected:
+  HloModuleDceTest() {}
+
+  // Returns whether the given instruction exists in the given computation.
+  bool HasInstruction(const HloComputation& computation,
+                      const HloInstruction* instruction) {
+    return std::find(computation.instructions().begin(),
+                     computation.instructions().end(),
+                     instruction) != computation.instructions().end();
+  }
+
+  // Returns whether the while instruction with name 'while_name' in
+  // 'computation' passes through its tuple element at 'tuple_index' from
+  // parameter to root instruction.
+  bool WhileBodyHasPassThroughTupleElement(const HloComputation* computation,
+                                           const string& while_name,
+                                           const int64 tuple_index) {
+    for (auto* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kWhile &&
+          instruction->name() == while_name) {
+        auto* while_body_comp = instruction->while_body();
+        auto* while_body_param = while_body_comp->parameter_instruction(0);
+        auto* while_body_root = while_body_comp->root_instruction();
+        if (while_body_root->opcode() != HloOpcode::kTuple) {
+          return false;
+        }
+        auto* operand = while_body_root->operand(tuple_index);
+        if (operand->opcode() == HloOpcode::kGetTupleElement &&
+            operand->tuple_index() == tuple_index &&
+            operand->operand(0) == while_body_param) {
+          return true;
+        }
+        return false;
+      }
+    }
+    return false;
+  }
+};
+
+// Tests that a while with all outputs live is unmodified.
+TEST_F(HloModuleDceTest, WhileWithLiveOutputs) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    constant.1 = s32[] constant(1)
+    add = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1
+    multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2)
+    ROOT tuple = (s32[], s32[3]{0}) tuple(add, multiply)
+  }
+  SimpleLoop.condition {
+    loop_var.2 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
+    constant.2 = s32[] constant(5)
+    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+  }
+  ENTRY SimpleLoop {
+    constant.3 = s32[] constant(0)
+    constant.4 = s32[3]{0} constant({0, 1, 2})
+    tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4)
+    ROOT while = (s32[], s32[3]{0}) while(tuple.1), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+  })")
+                    .ValueOrDie();
+
+  HloModuleDCE dce;
+  EXPECT_FALSE(dce.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while", 0));
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while", 1));
+}
+
+// Tests a while loop with one unused output (which is used in the while loop
+// body by an instruction with side-effects: rng) is unmodified.
+TEST_F(HloModuleDceTest, WhileWithUnusedSideEffectingTupleElement) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s32[], f32[]) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    constant.1 = s32[] constant(1)
+    add = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = f32[] get-tuple-element(loop_var.1), index=1
+    constant.2 = f32[] constant(1.0)
+    rng = f32[] rng(constant.2, get-tuple-element.2), distribution=rng_uniform
+    add.1 = s32[] add(get-tuple-element.2, constant.2)
+    ROOT tuple = (s32[], f32[]) tuple(add, add.1)
+  }
+  SimpleLoop.condition {
+    loop_var.2 = (s32[], f32[]) parameter(0)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
+    constant.3 = s32[] constant(5)
+    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.3)
+  }
+  ENTRY SimpleLoop {
+    constant.4 = s32[] constant(0)
+    constant.5 = f32[] constant(0.0)
+    tuple.1 = (s32[], f32[]) tuple(constant.4, constant.5)
+    while = (s32[], f32[]) while(tuple.1), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+    ROOT get-tuple-element.4 = s32[] get-tuple-element(while), index=0
+  })")
+                    .ValueOrDie();
+
+  HloModuleDCE dce;
+  EXPECT_FALSE(dce.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while", 0));
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while", 1));
+}
+
+// Tests that a while loop with one dead tuple element at {1} has its while
+// loop body modified to make that tuple element pass-through the while body.
+TEST_F(HloModuleDceTest, OneWhileWithDeadTupleElement) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    constant.1 = s32[] constant(1)
+    add = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1
+    multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2)
+    ROOT tuple = (s32[], s32[3]{0}) tuple(add, multiply)
+  }
+  SimpleLoop.condition {
+    loop_var.2 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
+    constant.2 = s32[] constant(5)
+    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+  }
+  ENTRY SimpleLoop {
+    constant.3 = s32[] constant(0)
+    constant.4 = s32[3]{0} constant({0, 1, 2})
+    tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4)
+    while = (s32[], s32[3]{0}) while(tuple.1), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+    ROOT get-tuple-element.4 = s32[] get-tuple-element(while), index=0
+  })")
+                    .ValueOrDie();
+
+  HloModuleDCE dce;
+  // While tuple element {1} should not be pass-through before ModuleDCE.
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while", 1));
+  EXPECT_TRUE(dce.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while", 0));
+  // While tuple element {1} should now be pass-through after ModuleDCE.
+  EXPECT_TRUE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                  "while", 1));
+}
+
+// Tests that a tuple element {1} used by condition computation (which appears
+// dead in while.body{1} and at while.result{1}) propgates liveness of this
+// tuple element to while.body{1} and at while.result{1}.
+TEST_F(HloModuleDceTest, OneWhileWithTupleElementUsedByCond) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s32[], s32[]) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    constant.1 = s32[] constant(1)
+    add = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = s32[] get-tuple-element(loop_var.1), index=1
+    multiply = s32[] multiply(get-tuple-element.2, get-tuple-element.2)
+    ROOT tuple = (s32[], s32[]) tuple(add, multiply)
+  }
+  SimpleLoop.condition {
+    loop_var.2 = (s32[], s32[]) parameter(0)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=1
+    constant.2 = s32[] constant(5)
+    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+  }
+  ENTRY SimpleLoop {
+    constant.3 = s32[] constant(0)
+    constant.4 = s32[] constant(0)
+    tuple.1 = (s32[], s32[]) tuple(constant.3, constant.4)
+    while = (s32[], s32[]) while(tuple.1), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+    ROOT get-tuple-element.4 = s32[] get-tuple-element(while), index=0
+  })")
+                    .ValueOrDie();
+
+  HloModuleDCE dce;
+  // While tuple element {1} should not be pass-through before ModuleDCE.
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while", 1));
+  EXPECT_FALSE(dce.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while", 0));
+  // While tuple element {1} still be pass-through after ModuleDCE.
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while", 1));
+}
+
+// Tests that HloModuleDCE can remove a dead tuple element at index {1} between
+// two dependent while loops.
+TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElement) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleLoop
+  SimpleLoop.body0 {
+    loop_var.1 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0
+    constant.1 = s32[] constant(1)
+    add = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1
+    multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2)
+    ROOT tuple = (s32[], s32[3]{0}) tuple(add, multiply)
+  }
+  SimpleLoop.condition0 {
+    loop_var.2 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
+    constant.2 = s32[] constant(5)
+    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+  }
+  SimpleLoop.body1 {
+    loop_var.3 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.4 = s32[] get-tuple-element(loop_var.3), index=0
+    constant.3 = s32[] constant(1)
+    add.1 = s32[] add(get-tuple-element.4, constant.3)
+    get-tuple-element.5 = s32[3]{0} get-tuple-element(loop_var.3), index=1
+    multiply.1 = s32[3]{0} multiply(get-tuple-element.5, get-tuple-element.5)
+    ROOT tuple.1 = (s32[], s32[3]{0}) tuple(add.1, multiply.1)
+  }
+  SimpleLoop.condition1 {
+    loop_var.4 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.6 = s32[] get-tuple-element(loop_var.4), index=0
+    constant.4 = s32[] constant(5)
+    ROOT less-than.1 = pred[] less-than(get-tuple-element.6, constant.4)
+  }
+  ENTRY SimpleLoop {
+    constant.5 = s32[] constant(0)
+    constant.6 = s32[3]{0} constant({0, 1, 2})
+    tuple.2 = (s32[], s32[3]{0}) tuple(constant.5, constant.6)
+    while.1 = (s32[], s32[3]{0}) while(tuple.2), condition=
+      SimpleLoop.condition0, body=SimpleLoop.body0
+    get-tuple-element.7 = s32[] get-tuple-element(while.1), index=0
+    tuple.3 = (s32[], s32[3]{0}) tuple(get-tuple-element.7, constant.6)
+    while.2 = (s32[], s32[3]{0}) while(tuple.3), condition=
+      SimpleLoop.condition1, body=SimpleLoop.body1
+    ROOT get-tuple-element.8 = s32[] get-tuple-element(while.2), index=0
+  })")
+                    .ValueOrDie();
+
+  HloModuleDCE dce;
+  // Before HloModuleDCE while.1 and while.2 should not have pass-thru elements.
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while.1", 1));
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while.2", 1));
+  EXPECT_TRUE(dce.Run(module.get()).ValueOrDie());
+  // After HloModuleDCE while.1 and while.2 should have pass-thru elements,
+  // after being modified to pass through unused tuple element {1}.
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while.1", 0));
+  EXPECT_TRUE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                  "while.1", 1));
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while.2", 0));
+  EXPECT_TRUE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                  "while.2", 1));
+}
+
+// Tests that HloModuleDCE can remove a dead tuple element at while.1{0} and
+// while.2{1}, between two dependent while loops.
+TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElementSwizzled) {
+  auto module = tools::Parse(R"(
+  HloModule SimpleLoop
+  SimpleLoop.body0 {
+    loop_var.1 = (s32[3]{0}, s32[]) parameter(0)
+    get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=1
+    constant.1 = s32[] constant(1)
+    add = s32[] add(get-tuple-element.1, constant.1)
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=0
+    multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2)
+    ROOT tuple = (s32[3]{0}, s32[]) tuple(multiply, add)
+  }
+  SimpleLoop.condition0 {
+    loop_var.2 = (s32[3]{0}, s32[]) parameter(0)
+    get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=1
+    constant.2 = s32[] constant(5)
+    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+  }
+  SimpleLoop.body1 {
+    loop_var.3 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.4 = s32[] get-tuple-element(loop_var.3), index=0
+    constant.3 = s32[] constant(1)
+    add.1 = s32[] add(get-tuple-element.4, constant.3)
+    get-tuple-element.5 = s32[3]{0} get-tuple-element(loop_var.3), index=1
+    multiply.1 = s32[3]{0} multiply(get-tuple-element.5, get-tuple-element.5)
+    ROOT tuple.1 = (s32[], s32[3]{0}) tuple(add.1, multiply.1)
+  }
+  SimpleLoop.condition1 {
+    loop_var.4 = (s32[], s32[3]{0}) parameter(0)
+    get-tuple-element.6 = s32[] get-tuple-element(loop_var.4), index=0
+    constant.4 = s32[] constant(5)
+    ROOT less-than.1 = pred[] less-than(get-tuple-element.6, constant.4)
+  }
+  ENTRY SimpleLoop {
+    constant.5 = s32[] constant(0)
+    constant.6 = s32[3]{0} constant({0, 1, 2})
+    tuple.2 = (s32[3]{0}, s32[]) tuple(constant.6, constant.5)
+    while.1 = (s32[3]{0}, s32[]) while(tuple.2), condition=
+      SimpleLoop.condition0, body=SimpleLoop.body0
+    get-tuple-element.7 = s32[] get-tuple-element(while.1), index=1
+    tuple.3 = (s32[], s32[3]{0}) tuple(get-tuple-element.7, constant.6)
+    while.2 = (s32[], s32[3]{0}) while(tuple.3), condition=
+      SimpleLoop.condition1, body=SimpleLoop.body1
+    ROOT get-tuple-element.8 = s32[] get-tuple-element(while.2), index=0
+  })")
+                    .ValueOrDie();
+
+  HloModuleDCE dce;
+  // Before HloModuleDCE while.1{0} and while.2{1} should not be pass-thru.
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while.1", 0));
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while.2", 1));
+  EXPECT_TRUE(dce.Run(module.get()).ValueOrDie());
+  // After HloModuleDCE while.1{0} and while.2{1} not be pass-thru elements.
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while.1", 1));
+  EXPECT_TRUE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                  "while.1", 0));
+  EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                   "while.2", 0));
+  EXPECT_TRUE(WhileBodyHasPassThroughTupleElement(module->entry_computation(),
+                                                  "while.2", 1));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index 54c34ce116651608e6d91cdcba9c708ca3a5f75e..b4cd3c730e323b8459312edbebc564e08f9d6840 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_module_group_metadata.h"
 
+#include <sstream>
 #include <string>
 #include <utility>
 
@@ -47,13 +48,16 @@ string HloModuleGroupMetadata::TrackedInstruction::ToString() const {
     case ComputationKind::kConditionalFalse:
       repr += ":CONDITIONAL_FALSE";
       break;
+    case ComputationKind::kCallFunction:
+      repr += ":CALL";
+      break;
   }
   return repr;
 }
 
 /* static */ StatusOr<std::unique_ptr<HloModuleGroupMetadata>>
 HloModuleGroupMetadata::Build(const std::vector<HloModule*>& modules) {
-  auto metadata = absl::make_unique<HloModuleGroupMetadata>(modules);
+  auto metadata = MakeUnique<HloModuleGroupMetadata>(modules);
   TF_RETURN_IF_ERROR(metadata->Build());
   return std::move(metadata);
 }
@@ -107,6 +111,31 @@ Status HloModuleGroupMetadata::Build() {
       TF_RETURN_IF_ERROR(computation->Accept(visitor));
     }
   }
+  TF_RETURN_IF_ERROR(VerifyCompanionSets());
+  return Status::OK();
+}
+
+Status HloModuleGroupMetadata::VerifyCompanionSets() const {
+  // TODO(dlibenzi): Migrate this to use the device instead of module ID, once
+  // the kDomain CL goes in.
+  for (const auto& companions : companion_sets_) {
+    // A companion set must be composed at most of an instruction per
+    // device/module.
+    std::unordered_set<int64> devices;
+    for (HloInstruction* instruction : *companions) {
+      int64 device = GetModuleId(instruction->parent()->parent());
+      if (!devices.insert(device).second) {
+        std::stringstream ss;
+        ss << "Companion set:" << std::endl;
+        for (HloInstruction* hlo : *companions) {
+          ss << "  " << hlo->name() << " ("
+             << GetModuleId(hlo->parent()->parent()) << ")" << std::endl;
+        }
+        ss << "has multiple instructions on the same device";
+        return FailedPrecondition("%s", ss.str().c_str());
+      }
+    }
+  }
   return Status::OK();
 }
 
@@ -206,6 +235,9 @@ Status HloModuleGroupMetadata::RecordInstructions() {
           TrackedInstruction(hlo, ComputationKind::kConditionalTrue);
       tracked_instructions_[hlo->false_computation()] =
           TrackedInstruction(hlo, ComputationKind::kConditionalFalse);
+    } else if (hlo->opcode() == HloOpcode::kCall) {
+      tracked_instructions_[hlo->to_apply()] =
+          TrackedInstruction(hlo, ComputationKind::kCallFunction);
     }
     if (!IsChannelInstruction(hlo)) {
       return Status::OK();
@@ -258,14 +290,15 @@ Status HloModuleGroupMetadata::RecordInstructions() {
 Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1,
                                             HloInstruction* instruction2) {
   TF_RET_CHECK(instruction1->opcode() == HloOpcode::kWhile ||
-               instruction1->opcode() == HloOpcode::kConditional);
+               instruction1->opcode() == HloOpcode::kConditional ||
+               instruction1->opcode() == HloOpcode::kCall);
   VLOG(2) << "adding as companions:" << instruction1->ToString() << " and "
           << instruction2->ToString();
 
   if (!ContainsKey(companion_set_index_, instruction1) &&
       !ContainsKey(companion_set_index_, instruction2)) {
     companion_sets_.push_back(
-        absl::make_unique<std::unordered_set<HloInstruction*>>());
+        tensorflow::MakeUnique<std::unordered_set<HloInstruction*>>());
     auto companion_set = companion_sets_.back().get();
     companion_set->insert(instruction1);
     companion_set->insert(instruction2);
@@ -336,21 +369,11 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() {
     }
   }
 
-  // Check if channel instructions are used only in allowed computations.
-  const auto allowed = [this](HloInstruction* hlo) {
-    HloComputation* computation = hlo->parent();
-    const HloModule* module = computation->parent();
-    if (module->entry_computation() == computation ||
-        tracked_instructions_.count(computation) > 0) {
-      return true;
-    }
-    return false;
-  };
   for (const Channel& channel : channels_) {
-    if (!allowed(channel.send) || !allowed(channel.send_done) ||
-        !allowed(channel.recv) || !allowed(channel.recv_done)) {
-      return FailedPrecondition("channel is used in disallowed computation");
-    }
+    TF_RETURN_IF_ERROR(CheckCommunicatingInstruction(channel.send));
+    TF_RETURN_IF_ERROR(CheckCommunicatingInstruction(channel.send_done));
+    TF_RETURN_IF_ERROR(CheckCommunicatingInstruction(channel.recv));
+    TF_RETURN_IF_ERROR(CheckCommunicatingInstruction(channel.recv_done));
   }
   // Check if the nest levels match for each channel.
   for (const Channel& channel : channels_) {
@@ -368,4 +391,15 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() {
   return Status::OK();
 }
 
+Status HloModuleGroupMetadata::CheckCommunicatingInstruction(
+    HloInstruction* instruction) const {
+  HloComputation* computation = instruction->parent();
+  const HloModule* module = computation->parent();
+  if (module->entry_computation() == computation ||
+      tracked_instructions_.count(computation) > 0) {
+    return Status::OK();
+  }
+  return FailedPrecondition("channel is used in disallowed computation");
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index c48a7ab0b59269474f7406ef24a249355528e085..3ef4542f9129632de4975688ae7e9e2c5f43a7ee 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -60,6 +60,7 @@ class HloModuleGroupMetadata {
     kWhileBody,
     kConditionalTrue,
     kConditionalFalse,
+    kCallFunction,
   };
 
   // Tracks the instruction mapped to a given computation, and the computation
@@ -202,6 +203,15 @@ class HloModuleGroupMetadata {
   Status AddCompanion(HloInstruction* instruction1,
                       HloInstruction* instruction2);
 
+  // Checks whether a communicating instruction is placed in a valid position
+  // within the graph.
+  Status CheckCommunicatingInstruction(HloInstruction* instruction) const;
+
+  // Performs a consistency check on the companion sets built for the input
+  // modules. Check that a companion set does not include instructions from the
+  // same module/device.
+  Status VerifyCompanionSets() const;
+
   // Retrieves a pointer to the stored TrackedInstruction associated with a
   // tracked computation, or nullptr in case such computation is not tracked.
   const TrackedInstruction* GetTrackedInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.cc b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
index 289c96b0a7b90c5f8a122cd3fc327a5762099106..5a0d1e264eb5095ff53721416ebcf4842a063f97 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -289,7 +290,7 @@ HloModuleGroupUtil::ComputeReachability(
     TF_RETURN_IF_ERROR(
         VisitTopologicalOrder(&visit_states, visit_function, root));
   }
-  auto reachability = absl::make_unique<HloReachabilityMap>(post_order);
+  auto reachability = MakeUnique<HloReachabilityMap>(post_order);
   for (HloInstruction* hlo : post_order) {
     reachability->SetReachabilityToUnion(GlobalPredecessors(hlo), hlo);
   }
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index ca763076a16af1150a8623fb7dbf22c46a5ca263..ac7cd2f2f517cf8831416d9265fc48bbf9fce340 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -74,6 +74,7 @@ namespace xla {
   V(kDynamicUpdateSlice, "dynamic-update-slice")             \
   V(kEq, "equal-to", kHloOpcodeIsComparison)                 \
   V(kExp, "exponential")                                     \
+  V(kExpm1, "exponential-minus-one")                         \
   V(kFft, "fft")                                             \
   V(kFloor, "floor")                                         \
   V(kFusion, "fusion", kHloOpcodeIsVariadic)                 \
@@ -87,6 +88,7 @@ namespace xla {
   V(kIsFinite, "is-finite")                                  \
   V(kLe, "less-than-or-equal-to", kHloOpcodeIsComparison)    \
   V(kLog, "log")                                             \
+  V(kLog1p, "log-plus-one")                                  \
   V(kAnd, "and")                                             \
   V(kNot, "not")                                             \
   V(kOr, "or")                                               \
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index e89d94bede6c437ca1131a1b1b0098390d58c0d9..dcd4725fe78e8b9b5d14437e964cb5aaf1664117 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -170,10 +169,10 @@ bool HloOrdering::UseIsBeforeValueDefinition(
   // is before the def if the instruction allows buffer sharing (in place
   // computation).
   if (use.instruction == value.defining_instruction() &&
-      CanShareOperandBufferWithUser(
+      dataflow.CanShareOperandBufferWithUser(
           use.instruction->mutable_operand(use.operand_number),
           use.operand_index, value.defining_instruction(),
-          value.defining_index(), dataflow)) {
+          value.defining_index())) {
     VLOG(4) << "  use is value def, and instruction can share use buffer";
     return true;
   }
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 5120775737bfa32bbb656421216f2b3fbef590ea..d8f1ab916b5c5c500c2d8dcd8605be083f95862a 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -90,7 +90,7 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
     return Status::OK();
   };
 
-  string prefix = name().ToString() + ": pipeline start";
+  string prefix = std::string(name()) + ": pipeline start";
   bool changed = false;
   string message;
   TF_RETURN_IF_ERROR(
@@ -98,12 +98,12 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
   const string xla_dump_per_pass_hlo_proto_to =
       module->config().debug_options().xla_dump_per_pass_hlo_proto_to();
   if (!xla_dump_per_pass_hlo_proto_to.empty()) {
-    DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, name().ToString(),
-                    "pipeline_start");
+    DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to,
+                    std::string(name()), "pipeline_start");
   }
 
   for (auto& pass : passes_) {
-    if (disabled_passes.count(pass->name().ToString()) > 0) {
+    if (disabled_passes.count(std::string(pass->name())) > 0) {
       VLOG(1) << "  Skipping HLO pass " << pass->name()
               << ", disabled by --xla_disable_hlo_passes";
       continue;
@@ -121,7 +121,7 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
         run_invariant_checkers(StrCat("after running pass: ", pass->name())));
     if (!xla_dump_per_pass_hlo_proto_to.empty()) {
       DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to,
-                      name().ToString(), pass->name().ToString());
+                      std::string(name()), std::string(pass->name()));
     }
 
     changed |= changed_this_pass;
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index b171d41a31ed23f0886e7363289ea56c92216572..39b85de0f12024f5e20ddd37618987c6d06bc307 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/service/hlo_scheduling.h"
-#include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -274,9 +273,8 @@ ItemList GetUsers(const InstructionList& instruction_list,
   for (const BufferAlias& buffer_alias :
        points_to_analysis.GetBufferAliases(*logical_buffer)) {
     for (const HloInstruction* user : buffer_alias.instruction()->users()) {
-      if (DoesNotUseOperandBuffer(buffer_alias.instruction(),
-                                  buffer_alias.index(), user,
-                                  points_to_analysis)) {
+      if (points_to_analysis.DoesNotUseOperandBuffer(
+              buffer_alias.instruction(), buffer_alias.index(), user)) {
         // The alias may be an operand of 'user', but the LogicalBuffer cannot
         // possibly be used by the instruction so ignore 'user'. This is the
         // case, for example, for the tuple element buffers in a GetTupleElement
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 48da1a505c9bea72378aaba7824548cca0eef447..2a601ec3d183023954b6f1b6bca7594384378169 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <string>
 #include <utility>
 
-#include "absl/memory/memory.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -171,7 +170,7 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
     int64 device = device_assignment(i, 0);
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                         backend().stream_executor(device));
-    streams.push_back(absl::make_unique<se::Stream>(executor));
+    streams.push_back(MakeUnique<se::Stream>(executor));
     streams.back()->Init();
     service_run_options.emplace_back(GetServiceRunOptionsForDevice(
         device, streams.back().get(), &device_assignment));
@@ -198,7 +197,7 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
     num_threads += options.num_replicas;
   }
   if (num_threads > 0) {
-    pool = absl::make_unique<tensorflow::thread::ThreadPool>(
+    pool = MakeUnique<tensorflow::thread::ThreadPool>(
         tensorflow::Env::Default(), "infeed_outfeed",
         /*num_threads=*/num_threads);
   }
@@ -229,7 +228,7 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
         VLOG(1) << "Starting outfeed on device " << device;
         for (int64 step = 1;
              options.infeed_steps < 0 || step <= options.infeed_steps; ++step) {
-          auto literal = absl::make_unique<Literal>();
+          auto literal = MakeUnique<Literal>();
           TF_CHECK_OK(backend().transfer_manager()->TransferLiteralFromOutfeed(
               executor, options.outfeed_shape, literal.get()));
           if (options.outfeed_values != nullptr) {
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc
index 1a767628f6e2d33df353366974fb866e89f0df5a..854aa943199397c0e3f84d48a74ef41ae0d3db56 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc
@@ -62,7 +62,34 @@ StatusOr<int64> MinimumMemoryForSequence(
 namespace {
 
 // Class implementing a list scheduler of HLO instructions which produces a
-// sequence which minimizes memory usage.
+// sequence which minimizes memory usage by preferring to schedule the node that
+// frees bigger buffer and defines smaller outputs.
+//
+// Note that list scheduler is a greedy algorithm which cannot guarantee a
+// global optimal solution. As a counterexample, considering the following
+// graph:
+//
+//      +--> B ===> C -------+
+// A -> |                    |
+//      |                    v
+//      +--> D ---> F=======>G
+//      |           ^
+//      |           |
+//      +--> E -----+
+//
+//  --> : Buffer with size 1
+//  ==> : Buffer with size 2
+//
+// The list scheduler will always try to defer scheduling B in a greedy way
+// since its output buffer is bigger than input. The sequence it creates will
+// be:
+//   A D E F B C G
+// , which has a maximum memory usage of 6 (B is alive while F is executing).
+//
+// An optimal way to shedule the previous graph is:
+//   A B C D E F G
+// , which has a maximum memory usage of 5 (when F is executing).
+//
 class ListScheduler {
  public:
   // Construct and return a memory-minimizing sequence of HLO instructions
@@ -70,8 +97,11 @@ class ListScheduler {
   static StatusOr<std::vector<const HloInstruction*>> Run(
       const HloComputation& computation,
       const TuplePointsToAnalysis& points_to_analysis,
-      const LogicalBuffer::SizeFunction& size_function) {
-    ListScheduler scheduler(computation, points_to_analysis, size_function);
+      const LogicalBuffer::SizeFunction& size_function,
+      const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+          memory_by_computation) {
+    ListScheduler scheduler(computation, points_to_analysis, size_function,
+                            memory_by_computation);
     return scheduler.CreateSchedule();
   }
 
@@ -92,10 +122,13 @@ class ListScheduler {
 
   ListScheduler(const HloComputation& computation,
                 const TuplePointsToAnalysis& points_to_analysis,
-                const LogicalBuffer::SizeFunction& size_function)
+                const LogicalBuffer::SizeFunction& size_function,
+                const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+                    memory_by_computation)
       : computation_(computation),
         points_to_analysis_(points_to_analysis),
-        size_function_(size_function) {
+        size_function_(size_function),
+        memory_by_computation_(memory_by_computation) {
     // Create a map containing the LogicalBuffer uses for each HLO
     // instruction. An HLO instruction "uses" a LogicalBuffer if the
     // LogicalBuffer is in an operand of the instruction as indicated by
@@ -185,6 +218,12 @@ class ListScheduler {
   }
 
   // Returns the number of bytes freed if the HLO instruction is scheduled.
+  // If the instruction calls subcomputations, we count the memory used by the
+  // subcomputations as memory "defined" by the instruction. This is not
+  // entirely accurate, because subcomputation memory will be freed after the
+  // instruction finishes. But it is more accurate than not taking
+  // subcomputations into account at all. In the future, we may improve
+  // accounting for subcomputation memory (b/65409243).
   int64 BytesFreedIfScheduled(const ReadyListEntry& entry) {
     int64 freed_bytes = 0;
     for (const auto& kv : entry.used_buffer_unscheduled_use_counts) {
@@ -194,7 +233,19 @@ class ListScheduler {
         freed_bytes += size_function_(*buffer);
       }
     }
-    return freed_bytes - entry.bytes_defined;
+    // We only count the memory usage of the largest subcomputation, instead of
+    // adding them all, because subcomputations won't execute in parallel.
+    int64 max_subcomputation_bytes = 0;
+    for (const auto* c : entry.instruction->called_computations()) {
+      auto it = memory_by_computation_.find(c);
+      if (it != memory_by_computation_.end()) {
+        int64 subcomputation_bytes = it->second;
+        if (subcomputation_bytes > max_subcomputation_bytes) {
+          max_subcomputation_bytes = subcomputation_bytes;
+        }
+      }
+    }
+    return freed_bytes - entry.bytes_defined - max_subcomputation_bytes;
   }
 
   // Constructs the scheduling priority of the given instruction.
@@ -315,6 +366,11 @@ class ListScheduler {
   const HloComputation& computation_;
   const TuplePointsToAnalysis& points_to_analysis_;
   const LogicalBuffer::SizeFunction& size_function_;
+  // Computations are analyzed in post-order. When scheduling an instruction
+  // that includes subcomputations, such as a while loop, we use this map to
+  // look up the memory needed by subcomputations.
+  const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+      memory_by_computation_;
 
   // A map containing the LogicalBuffers that each instruction uses.
   tensorflow::gtl::FlatMap<const HloInstruction*,
@@ -340,6 +396,24 @@ int64 SumLogicalBufferSizes(
   return size;
 }
 
+StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function,
+    const MemorySchedulerAlgorithm& algorithm,
+    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+        memory_by_computation) {
+  VLOG(2) << "Computation: " << computation.name();
+  if (algorithm) {
+    return algorithm(computation, points_to_analysis, size_function,
+                     memory_by_computation);
+  }
+  return DefaultMemoryScheduler(computation, points_to_analysis, size_function,
+                                memory_by_computation);
+}
+
+}  // namespace
+
 StatusOr<int64> MinimumMemoryForComputation(
     const HloComputation& computation,
     const std::vector<const HloInstruction*>& sequence,
@@ -352,30 +426,17 @@ StatusOr<int64> MinimumMemoryForComputation(
   return result.heap_size;
 }
 
-StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
-    const HloComputation& computation,
-    const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function,
-    const MemorySchedulerAlgorithm& algorithm) {
-  VLOG(2) << "Computation: " << computation.name();
-  if (algorithm) {
-    return algorithm(computation, points_to_analysis, size_function);
-  }
-  return DefaultMemoryScheduler(computation, points_to_analysis, size_function);
-}
-
-}  // namespace
-
 StatusOr<std::vector<const HloInstruction*>> DFSMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function) {
+    const LogicalBuffer::SizeFunction& size_function,
+    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+        memory_by_computation) {
   // This ordering is based on DFS post-order, with a heuristic to decide which
   // operand to visit first.  The heuristic is based on 'extra_users', which is
   // simply users-1 for each instruction.  By subtracting 1, we're saying that
   // instructions with no users or a single user don't count; instructions with
   // lots of fan-out will be visited earlier.
-  int64 cumulative_total_size = 0;
   tensorflow::gtl::FlatMap<const HloInstruction*, int64> extra_users;
   tensorflow::gtl::FlatMap<const HloInstruction*, int64> total_sizes;
   for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) {
@@ -388,14 +449,12 @@ StatusOr<std::vector<const HloInstruction*>> DFSMemoryScheduler(
     int64 logical_buffer_size = SumLogicalBufferSizes(
         points_to_analysis.GetBuffersDefinedByInstruction(hlo), size_function);
     total_sizes[hlo] = logical_buffer_size;
-    cumulative_total_size += logical_buffer_size;
     tensorflow::gtl::FlatSet<const HloInstruction*> unique_operands(
         hlo->operands().begin(), hlo->operands().end());
     for (const HloInstruction* operand : unique_operands) {
       extra_users[hlo] += extra_users[operand];
       total_sizes[hlo] += total_sizes[operand];
     }
-    total_sizes[hlo] = std::min(total_sizes[hlo], cumulative_total_size);
   }
   CHECK_EQ(extra_users.size(), computation.instruction_count());
   CHECK_EQ(total_sizes.size(), computation.instruction_count());
@@ -421,52 +480,87 @@ StatusOr<std::vector<const HloInstruction*>> DFSMemoryScheduler(
       }));
   CHECK_EQ(sequence.size(), computation.instruction_count());
   return sequence;
-}
+}  // namespace xla
 
 StatusOr<std::vector<const HloInstruction*>> ListMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function) {
-  return ListScheduler::Run(computation, points_to_analysis, size_function);
+    const LogicalBuffer::SizeFunction& size_function,
+    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+        memory_by_computation) {
+  return ListScheduler::Run(computation, points_to_analysis, size_function,
+                            memory_by_computation);
+}
+
+StatusOr<std::vector<const HloInstruction*>> PostOrderMemoryScheduler(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function,
+    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+        memory_by_computation) {
+  const auto& post_order = computation.MakeInstructionPostOrder();
+  return std::vector<const HloInstruction*>{post_order.begin(),
+                                            post_order.end()};
 }
 
 StatusOr<std::vector<const HloInstruction*>> DefaultMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function) {
-  // We try both a list-scheduler based ordering and a DFS based ordering, and
-  // choose whichever returns a lower min-memory, not accounting for
-  // fragmentation.
-  //
-  // Note that this is just a heuristic. One obvious inaccuracy is that the
-  // memory required for sub-computations might be different when considered
-  // within the caller's context. But it's good enough for now.
+    const LogicalBuffer::SizeFunction& size_function,
+    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+        memory_by_computation) {
+  // We try a few schedulers and choose whichever returns a lower min-memory,
+  // not accounting for fragmentation.
+  // - List is a scheduler that uses greedy heuristics.
+  // - DFS visits HLOs in postorder, with a heuristic to decide the order of
+  //   children.
+  // - Postorder does not use any heuristics.
+  // List wins for most of our benchmarks; postorder-based schedulers win for
+  // some RNNs.
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> list_sequence,
-      ListMemoryScheduler(computation, points_to_analysis, size_function));
+      ListMemoryScheduler(computation, points_to_analysis, size_function,
+                          memory_by_computation));
   TF_ASSIGN_OR_RETURN(
       const int64 list_memory,
       MinimumMemoryForComputation(computation, list_sequence,
                                   points_to_analysis, size_function));
   VLOG(2) << "Min-memory list sequence: " << HumanReadableNumBytes(list_memory);
 
-  TF_ASSIGN_OR_RETURN(
-      std::vector<const HloInstruction*> dfs_sequence,
-      DFSMemoryScheduler(computation, points_to_analysis, size_function));
+  TF_ASSIGN_OR_RETURN(std::vector<const HloInstruction*> dfs_sequence,
+                      DFSMemoryScheduler(computation, points_to_analysis,
+                                         size_function, memory_by_computation));
   TF_ASSIGN_OR_RETURN(
       const int64 dfs_memory,
       MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis,
                                   size_function));
   VLOG(2) << "Min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory);
 
-  if (list_memory <= dfs_memory) {
+  TF_ASSIGN_OR_RETURN(
+      std::vector<const HloInstruction*> post_order_sequence,
+      PostOrderMemoryScheduler(computation, points_to_analysis, size_function,
+                               memory_by_computation));
+  TF_ASSIGN_OR_RETURN(
+      const int64 post_order_memory,
+      MinimumMemoryForComputation(computation, post_order_sequence,
+                                  points_to_analysis, size_function));
+  VLOG(2) << "Min-memory post order sequence: "
+          << HumanReadableNumBytes(post_order_memory);
+
+  auto min_memory = std::min({dfs_memory, post_order_memory, list_memory});
+
+  if (min_memory == list_memory) {
     VLOG(2) << "Chose min-memory list sequence: "
             << HumanReadableNumBytes(list_memory);
     return list_sequence;
-  } else {
+  } else if (min_memory == dfs_memory) {
     VLOG(2) << "Chose min-memory dfs sequence: "
             << HumanReadableNumBytes(dfs_memory);
     return dfs_sequence;
+  } else {
+    VLOG(2) << "Chose min-memory post_order sequence: "
+            << HumanReadableNumBytes(post_order_memory);
+    return post_order_sequence;
   }
 }
 
@@ -477,24 +571,32 @@ CreateMemoryMinimizingSequence(const HloModule& module,
   SequentialHloOrdering::HloModuleSequence sequence;
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(&module));
-  for (const auto* computation : module.MakeNonfusionComputations()) {
-    TF_ASSIGN_OR_RETURN(
-        sequence[computation],
-        CreateMemoryMinimizingSequence(*computation, *points_to_analysis,
-                                       size_function, algorithm));
+  tensorflow::gtl::FlatMap<const HloComputation*, int64> memory_by_computation;
+  for (const auto* computation : module.MakeComputationPostOrder()) {
+    if (!computation->IsFusionComputation()) {
+      TF_ASSIGN_OR_RETURN(auto one_computation_sequence,
+                          CreateMemoryMinimizingSequence(
+                              *computation, *points_to_analysis, size_function,
+                              algorithm, memory_by_computation));
+      memory_by_computation[computation] =
+          MinimumMemoryForComputation(*computation, one_computation_sequence,
+                                      *points_to_analysis, size_function)
+              .ValueOrDie();
+      sequence[computation] = std::move(one_computation_sequence);
+    }
   }
   return sequence;
 }
 
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
-    const LogicalBuffer::SizeFunction& size_function,
-    const MemorySchedulerAlgorithm& algorithm) {
+    const LogicalBuffer::SizeFunction& size_function) {
   CHECK(!computation.IsFusionComputation());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(computation.parent()));
+  tensorflow::gtl::FlatMap<const HloComputation*, int64> empty_map;
   return CreateMemoryMinimizingSequence(computation, *points_to_analysis,
-                                        size_function, algorithm);
+                                        size_function, nullptr, empty_map);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.h b/tensorflow/compiler/xla/service/hlo_scheduling.h
index 068e68383deb170ded1c9b09a8b7ceb8c4c0ab4b..49b927eefd24f4e26df781dd8d2b977bedba2b80 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.h
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.h
@@ -34,26 +34,47 @@ StatusOr<int64> MinimumMemoryForSequence(
     const SequentialHloOrdering::HloModuleSequence& module_sequence,
     const LogicalBuffer::SizeFunction& size_function);
 
+// Returns the minimum memory required to compute the given computation,
+// assuming no fragmentation.
+StatusOr<int64> MinimumMemoryForComputation(
+    const HloComputation& computation,
+    const std::vector<const HloInstruction*>& sequence,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function);
+
 // A memory scheduler computes an execution sequence for the HLO instructions in
 // 'computation' that minimizes peak memory, given a points-to analysis result
 // that describes buffer aliasing, together with a target-specific size function
 // that maps a tensor's logical size to its padded size.
 typedef std::function<StatusOr<std::vector<const HloInstruction*>>(
     const HloComputation&, const TuplePointsToAnalysis&,
-    const LogicalBuffer::SizeFunction&)>
+    const LogicalBuffer::SizeFunction&,
+    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&)>
     MemorySchedulerAlgorithm;
 
 // List scheduler
 StatusOr<std::vector<const HloInstruction*>> ListMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function);
+    const LogicalBuffer::SizeFunction& size_function,
+    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+        memory_by_computation);
 
 // DFS-order scheduler
 StatusOr<std::vector<const HloInstruction*>> DFSMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function);
+    const LogicalBuffer::SizeFunction& size_function,
+    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+        memory_by_computation);
+
+// Naive Post Order scheduler
+StatusOr<std::vector<const HloInstruction*>> PostOrderMemoryScheduler(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function,
+    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+        memory_by_computation);
 
 // The default scheduling algorithm. Runs both the list scheduler
 // and the DFS scheduler, and chooses whichever returns a lower min-memory,
@@ -61,7 +82,9 @@ StatusOr<std::vector<const HloInstruction*>> DFSMemoryScheduler(
 StatusOr<std::vector<const HloInstruction*>> DefaultMemoryScheduler(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function);
+    const LogicalBuffer::SizeFunction& size_function,
+    const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
+        memory_by_computation);
 
 // Returns an HloModuleSequence which seeks to minimize the memory required for
 // the computation. size_function is the function returning the number of bytes
@@ -72,10 +95,10 @@ CreateMemoryMinimizingSequence(const HloModule& module,
                                const MemorySchedulerAlgorithm& algorithm = {});
 
 // Overload of above that computes the sequence for a single computation.
+// Currently only used by the GPU backend.
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
-    const LogicalBuffer::SizeFunction& size_function,
-    const MemorySchedulerAlgorithm& algorithm = {});
+    const LogicalBuffer::SizeFunction& size_function);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
index 92df7c1427f282ccdde2df494c41b3f2a98cf7b3..c018ba2ffc404d0c6a0d08b8f5c63a9f90888b70 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
@@ -190,5 +190,104 @@ ENTRY root {
                                       instructions_by_name.at("e")));
 }
 
+TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) {
+  // %WhileCond (cond_param: f32[4]) -> pred[] {
+  //   %cond_param = f32[4]{0} parameter(0)
+  //   %constant = f32[1,4]{1,0} constant(f32[1,4] { { 0, 0, 0, 0 } })
+  //   ROOT %not-equal-to = pred[] not-equal-to(
+  //     f32[4]{0} %cond_param, f32[1,4]{1,0} %constant)
+  // }
+  // %WhileBody (body_param: f32[4]) -> f32[4] {
+  //   %body_param = f32[4]{0} parameter(0)
+  //   %constant.1 = f32[1,4]{1,0} constant(f32[1,4] { { 1, 1, 1, 1 } })
+  //   ROOT %subtract = f32[4]{0} subtract(
+  //     f32[4]{0} %body_param, f32[1,4]{1,0} %constant.1)
+  // }
+  // %SubcomputationsNotAccounted () -> f32[2,4] {
+  //   %constant.3 = f32[2,4]{1,0} constant(
+  //     f32[2,4] { { 1, 2, 3, 4 }, { 1, 2, 3, 4 } })
+  //   %transpose = f32[2,4]{1,0} transpose(
+  //     f32[2,4]{1,0} %constant.3), dimensions={0,1}
+  //   %constant.2 = f32[1,4]{1,0} constant(f32[1,4] { { 1, 1, 1, 1 } })
+  //   %while = f32[4]{0} while(f32[1,4]{1,0} %constant.2),
+  //      condition=%WhileCond,
+  //      body=%WhileBody
+  //   %broadcast = f32[2,4]{1,0} broadcast(f32[4]{0} %while), dimensions={0}
+  //   ROOT %add = f32[2,4]{1,0} add(
+  //     f32[2,4]{1,0} %transpose, f32[2,4]{1,0} %broadcast)
+  // }
+
+  auto module = CreateNewModule();
+  const Shape r1f32 = ShapeUtil::MakeShape(F32, {4});
+  const Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 4});
+
+  // param != 0
+  // Needs 17 bytes
+  auto cond_builder = HloComputation::Builder("WhileCond");
+  HloInstruction* cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "cond_param"));
+  HloInstruction* zero_vector = cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2<float>({{0, 0, 0, 0}})));
+  cond_builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, cond_param, zero_vector));
+  auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build());
+
+  // param - 1
+  // Needs 16 bytes
+  auto body_builder = HloComputation::Builder("WhileBody");
+  HloInstruction* body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "body_param"));
+  HloInstruction* one_vector = body_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2<float>({{1, 1, 1, 1}})));
+  body_builder.AddInstruction(HloInstruction::CreateBinary(
+      r1f32, HloOpcode::kSubtract, body_param, one_vector));
+  auto body_computation = module->AddEmbeddedComputation(body_builder.Build());
+
+  // transpose(matrix) + bcast(while)
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* while_init = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2<float>({{1, 1, 1, 1}})));
+  // Creates 16 bytes, ignoring subcomputations
+  HloInstruction* while_loop =
+      builder.AddInstruction(HloInstruction::CreateWhile(
+          r1f32, cond_computation, body_computation, while_init));
+
+  // Creates 32 bytes and frees 16
+  HloInstruction* bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r2f32, while_loop, {0}));
+
+  HloInstruction* matrix = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2<float>(
+          {{1.0, 2.0, 3.0, 4.0}, {1.0, 2.0, 3.0, 4.0}})));
+  // Creates 32 bytes
+  HloInstruction* transpose = builder.AddInstruction(
+      HloInstruction::CreateTranspose(r2f32, matrix, {0, 1}));
+
+  // Creates 32 bytes and frees 64
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, transpose, bcast));
+
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(SequentialHloOrdering::HloModuleSequence sequence,
+                          CreateMemoryMinimizingSequence(
+                              *module,
+                              [](const BufferValue& buffer) {
+                                return ShapeUtil::ByteSizeOf(buffer.shape());
+                              },
+                              ListMemoryScheduler));
+  // Verify that all instructions are in the sequence.
+  EXPECT_EQ(module->entry_computation()->instruction_count(),
+            sequence.at(module->entry_computation()).size());
+  SequentialHloOrdering ordering(module.get(), sequence);
+  // This schedule is an example of List's greedy heuristics being suboptimal.
+  // The while_loop is more expensive than transpose, so it would have been
+  // better to schedule it first, instead of during the busy time.
+  EXPECT_TRUE(ordering.ExecutesBefore(transpose, while_loop));
+  EXPECT_TRUE(ordering.ExecutesBefore(transpose, bcast));
+  EXPECT_TRUE(ordering.ExecutesBefore(bcast, add));
+  EXPECT_TRUE(ordering.ExecutesBefore(transpose, add));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 8a30cbf9cd622ffb64d345ddaf0dc88f34850bfc..7d6d0d9eaf70969c1a3762959233b561706398c2 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -106,9 +106,7 @@ Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) {
                                           reduce_precision->mantissa_bits()));
 }
 
-Status ShapeVerifier::HandleInfeed(HloInstruction*) {
-  return tensorflow::Status::OK();
-}
+Status ShapeVerifier::HandleInfeed(HloInstruction*) { return Status::OK(); }
 
 Status ShapeVerifier::HandleOutfeed(HloInstruction* outfeed) {
   // Outfeed has a separate shape field for the value which is outfed to the
@@ -116,7 +114,7 @@ Status ShapeVerifier::HandleOutfeed(HloInstruction* outfeed) {
   // produces no HLO value in the graph.
   if (!ShapeUtil::Compatible(outfeed->outfeed_shape(),
                              outfeed->operand(0)->shape())) {
-    return InvalidArgument(
+    return InternalError(
         "Expected outfeed to have shape compatible with operand's shape %s, "
         "actual shape is %s:\n%s",
         ShapeUtil::HumanString(outfeed->operand(0)->shape()).c_str(),
@@ -127,12 +125,10 @@ Status ShapeVerifier::HandleOutfeed(HloInstruction* outfeed) {
 }
 
 Status ShapeVerifier::HandleHostCompute(HloInstruction*) {
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-Status ShapeVerifier::HandleRng(HloInstruction*) {
-  return tensorflow::Status::OK();
-}
+Status ShapeVerifier::HandleRng(HloInstruction*) { return Status::OK(); }
 
 Status ShapeVerifier::HandleReverse(HloInstruction* reverse) {
   return CheckShape(
@@ -164,7 +160,7 @@ Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
 }
 
 Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) {
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
@@ -183,7 +179,7 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
                  operand_shape.dimensions(operand_dimension))
         << broadcast->ToString() << " operand shape " << operand_shape;
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
@@ -191,7 +187,7 @@ Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
   TF_RETURN_IF_ERROR(CheckShape(reshape, reshape->shape()));
   TF_RET_CHECK(ShapeUtil::ElementsIn(reshape->shape()) ==
                ShapeUtil::ElementsIn(reshape->operand(0)->shape()));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status ShapeVerifier::HandleTranspose(HloInstruction* transpose) {
@@ -200,22 +196,18 @@ Status ShapeVerifier::HandleTranspose(HloInstruction* transpose) {
                      transpose->operand(0)->shape(), transpose->dimensions()));
 }
 
-Status ShapeVerifier::HandleParameter(HloInstruction*) {
-  return tensorflow::Status::OK();
+Status ShapeVerifier::HandleParameter(HloInstruction* hlo) {
+  return Status::OK();
 }
 
-Status ShapeVerifier::HandleFusion(HloInstruction*) {
-  return tensorflow::Status::OK();
-}
+Status ShapeVerifier::HandleFusion(HloInstruction*) { return Status::OK(); }
 
 Status ShapeVerifier::HandleCall(HloInstruction* call) {
   // The shape of kCall should match the shape of the computation it calls.
   return CheckShape(call, call->to_apply()->ComputeProgramShape().result());
 }
 
-Status ShapeVerifier::HandleCustomCall(HloInstruction*) {
-  return tensorflow::Status::OK();
-}
+Status ShapeVerifier::HandleCustomCall(HloInstruction*) { return Status::OK(); }
 
 Status ShapeVerifier::HandleSlice(HloInstruction* slice) {
   return CheckShape(slice,
@@ -410,7 +402,7 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
               if (fp_type == PRIMITIVE_TYPE_INVALID) {
                 fp_type = subshape.element_type();
               } else if (fp_type != subshape.element_type()) {
-                return FailedPrecondition(
+                return InternalError(
                     "Seen floating point types of different precisions in "
                     "%s, but mixed precision is disallowed.",
                     instruction->ToString().c_str());
@@ -490,14 +482,14 @@ Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
       }
   }
   if (!compatible) {
-    return InvalidArgument(
+    return InternalError(
         "Expected instruction to have shape compatible with %s, actual "
         "shape is %s:\n%s",
         ShapeUtil::HumanString(inferred_shape).c_str(),
         ShapeUtil::HumanString(instruction->shape()).c_str(),
         instruction->ToString().c_str());
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
@@ -541,13 +533,13 @@ Status ShapeVerifier::CheckVariadicShape(const HloInstruction* instruction) {
 Status ShapeVerifier::CheckSameChannel(const HloInstruction* instr1,
                                        const HloInstruction* instr2) {
   if (instr1->channel_id() != instr2->channel_id()) {
-    return FailedPrecondition(
+    return InternalError(
         "Expected to have the same channel id, actual channel ids are: %s "
         "(%lld), %s (%lld)",
         instr1->ToString().c_str(), instr1->channel_id(),
         instr2->ToString().c_str(), instr2->channel_id());
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 string ComputationsToString(
@@ -571,22 +563,22 @@ string ComputationsToString(
 Status VerifyHloStructure(HloModule* module) {
   for (const HloComputation* computation : module->computations()) {
     if (computation->parent() == nullptr) {
-      return FailedPrecondition("Computation %s has a null parent pointer",
-                                computation->name().c_str());
+      return InternalError("Computation %s has a null parent pointer",
+                           computation->name().c_str());
     }
     if (computation->parent() != module) {
-      return FailedPrecondition(
+      return InternalError(
           "Computation %s parent() does not point to parent module",
           computation->name().c_str());
     }
 
     for (const HloInstruction* instruction : computation->instructions()) {
       if (instruction->parent() == nullptr) {
-        return FailedPrecondition("Instruction %s has a null parent pointer",
-                                  instruction->name().c_str());
+        return InternalError("Instruction %s has a null parent pointer",
+                             instruction->name().c_str());
       }
       if (instruction->parent() != computation) {
-        return FailedPrecondition(
+        return InternalError(
             "Instruction %s parent() does not point to parent computation",
             instruction->name().c_str());
       }
@@ -602,7 +594,7 @@ Status VerifyHloStructure(HloModule* module) {
       for (int i = 0; i < instruction->operand_count(); ++i) {
         const HloInstruction* operand = instruction->operand(i);
         if (operand->parent() != instruction->parent()) {
-          return FailedPrecondition(
+          return InternalError(
               "Operand %d (%s) of instruction %s is in a different "
               "computation: %s vs %s",
               i, operand->name().c_str(), instruction->name().c_str(),
@@ -612,14 +604,14 @@ Status VerifyHloStructure(HloModule* module) {
       }
     }
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
   // The parent fusion instruction of the fusion computation must be 'fusion'.
   HloComputation* fused_computation = fusion->fused_instructions_computation();
   if (fusion != fused_computation->FusionInstruction()) {
-    return FailedPrecondition(
+    return InternalError(
         "Instruction of fused computation does not match expected instruction "
         "%s.",
         fusion->ToString().c_str());
@@ -635,37 +627,37 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
   for (auto* instruction : fused_computation->instructions()) {
     if (fused_root == instruction) {
       if (root_owned) {
-        return FailedPrecondition("Root appears more than once in %s.",
-                                  fusion->ToString().c_str());
+        return InternalError("Root appears more than once in %s.",
+                             fusion->ToString().c_str());
       }
       root_owned = true;
     }
     for (int i = 0; i < fused_parameters.size(); ++i) {
       if (fused_parameters[i] == instruction) {
         if (parameter_owned[i]) {
-          return FailedPrecondition("Parameter appears more than once in %s.",
-                                    fusion->ToString().c_str());
+          return InternalError("Parameter appears more than once in %s.",
+                               fusion->ToString().c_str());
         }
         parameter_owned[i] = true;
       }
     }
   }
   if (!root_owned) {
-    return FailedPrecondition("Root not found in computation of %s.",
-                              fusion->ToString().c_str());
+    return InternalError("Root not found in computation of %s.",
+                         fusion->ToString().c_str());
   }
   // Make sure all the parameter_owned entries are set
   for (int i = 0; i < parameter_owned.size(); i++) {
     if (!parameter_owned[i]) {
-      return FailedPrecondition("Parameter %d not found in computation of %s.",
-                                i, fusion->ToString().c_str());
+      return InternalError("Parameter %d not found in computation of %s.", i,
+                           fusion->ToString().c_str());
     }
   }
 
   // Fused root must have no users.
   if (fused_root->user_count() != 0) {
-    return FailedPrecondition("Root of %s may not have users.",
-                              fusion->ToString().c_str());
+    return InternalError("Root of %s may not have users.",
+                         fusion->ToString().c_str());
   }
 
   // All uses of fused instructions must be in the fusion computation, and every
@@ -674,13 +666,13 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
        fusion->fused_instructions_computation()->instructions()) {
     if (instruction != fused_root) {
       if (instruction->user_count() == 0) {
-        return FailedPrecondition(
-            "Non-root instruction %s in %s must have users.",
-            instruction->ToString().c_str(), fusion->ToString().c_str());
+        return InternalError("Non-root instruction %s in %s must have users.",
+                             instruction->ToString().c_str(),
+                             fusion->ToString().c_str());
       }
       for (auto& user : instruction->users()) {
         if (fused_computation != user->parent()) {
-          return FailedPrecondition(
+          return InternalError(
               "Non-root instruction %s in %s may not have external users.",
               instruction->ToString().c_str(), fusion->ToString().c_str());
         }
@@ -695,41 +687,40 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
   for (auto fused_param : fused_parameters) {
     int64 param_no = fused_param->parameter_number();
     if (param_no < 0) {
-      return FailedPrecondition(
-          "Unexpected negative parameter number %lld in %s.", param_no,
-          fusion->ToString().c_str());
+      return InternalError("Unexpected negative parameter number %lld in %s.",
+                           param_no, fusion->ToString().c_str());
     }
     if (param_no >= fused_parameters.size()) {
-      return FailedPrecondition(
+      return InternalError(
           "Unexpected parameter number %lld in %s: higher then number of "
           "parameters %lu.",
           param_no, fusion->ToString().c_str(), fused_parameters.size());
     }
     if (parameter_numbers[param_no]) {
-      return FailedPrecondition(
+      return InternalError(
           "Did not expect parameter number %lld more than once in %s.",
           param_no, fusion->ToString().c_str());
     }
     parameter_numbers[param_no] = true;
     if (!ShapeUtil::Compatible(fused_param->shape(),
                                fusion->operand(param_no)->shape())) {
-      return FailedPrecondition(
+      return InternalError(
           "Shape mismatch between parameter number %lld and its operand in %s.",
           param_no, fusion->ToString().c_str());
     }
   }
-  // Make sure all the parameter_numbers entries were seen
+  // Make sure all the parameter_numbers entries were seen.
   for (int i = 0; i < parameter_numbers.size(); i++) {
     if (!parameter_numbers[i]) {
-      return FailedPrecondition("Did not see parameter number %d in %s.", i,
-                                fusion->ToString().c_str());
+      return InternalError("Did not see parameter number %d in %s.", i,
+                           fusion->ToString().c_str());
     }
   }
 
   // TODO(b/65423525): We'd like to check that all operands are distinct.
   // This is currently disabled due to the invariant being violated by
   // multi-output fusion.
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) {
@@ -778,7 +769,7 @@ Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) {
         "init: %s, body: %s",
         init->ToString().c_str(), body_root->ToString().c_str());
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) {
@@ -796,7 +787,7 @@ Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) {
           ShapeUtil::HumanString(operand_shape).c_str());
     }
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 StatusOr<bool> HloVerifier::Run(HloModule* module) {
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 6208887547a14d22b512ef308dd2668af2f4468d..1392a78097aa026b2f7cffa2b0135402d3ca7ae5 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -82,9 +82,7 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override;
   Status HandleGather(HloInstruction* gather) override;
 
-  Status FinishVisit(HloInstruction*) override {
-    return tensorflow::Status::OK();
-  }
+  Status FinishVisit(HloInstruction*) override { return Status::OK(); }
 
  protected:
   // Check the instruction's shape against the shape given by ShapeInference
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index 13e4557317f74b3fb46f07fb91c339fd2f34752f..dc3bfce0c495bc40a2df7b985cab67e02a3e15ce 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -27,6 +27,7 @@ using tensorflow::strings::HumanReadableElapsedTime;
 using tensorflow::strings::HumanReadableNumBytes;
 using tensorflow::strings::Printf;
 using tensorflow::strings::StrAppend;
+using tensorflow::strings::StrCat;
 
 string HumanReadableProfileBuilder::ToString() const {
   string s;
@@ -35,20 +36,26 @@ string HumanReadableProfileBuilder::ToString() const {
           computation_name_.c_str(),
           HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)).c_str());
 
-  auto append_op = [&](const OpInfo& op) {
+  auto print_op = [&](const OpInfo& op) {
+    // Skip ops with 0 optimal seconds and 0 actual cycles.  These are ops that
+    // were expected to be free and are actually free -- things like (on most
+    // backends) kParameter or kConstant HLOs.  There's no need to clutter the
+    // profile with these.
+    if (op.optimal_seconds == 0 && op.cycles == 0) {
+      return;
+    }
+
     string bytes_per_sec;
     string bytes_per_cycle;
-    if (op.cycles <= 0 || op.bytes_accessed < 0) {
-      bytes_per_sec = "<unknown>";
-      bytes_per_cycle = "<unknown>";
-    } else {
-      bytes_per_sec =
-          HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles));
+    if (op.cycles > 0 && op.bytes_accessed >= 0) {
+      bytes_per_sec = StrCat(
+          HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles)),
+          "/s");
+      double bpc = static_cast<double>(op.bytes_accessed) / op.cycles;
       if (op.bytes_accessed > op.cycles) {
-        bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles);
+        bytes_per_cycle = StrCat(HumanReadableNumBytes(bpc), "/cycle");
       } else {
-        bytes_per_cycle =
-            Printf("%.3fB", static_cast<float>(op.bytes_accessed) / op.cycles);
+        bytes_per_cycle = Printf("%.3fB/cycle", bpc);
       }
     }
 
@@ -59,14 +66,16 @@ string HumanReadableProfileBuilder::ToString() const {
 
     double nsecs = op.cycles / clock_rate_ghz_;
     Appendf(&s,
-            "%15lld cycles (%6.2f%%) :: %12.1f usec (%12.1f optimal) :: %18s "
-            ":: %18s :: %12s/s :: %12s/cycle :: %s\n",
+            "%15lld cycles (%6.2f%%) :: %12.1f usec %22s :: %18s "
+            ":: %18s :: %14s :: %16s :: %s\n",
             op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles),
-            op.optimal_seconds * 1e6,
+            op.optimal_seconds < 0
+                ? ""
+                : Printf("(%12.1f optimal)", op.optimal_seconds * 1e6).c_str(),
             op.flop_count <= 0
-                ? "<none>"
+                ? ""
                 : HumanReadableNumFlops(op.flop_count, nsecs).c_str(),
-            op.transcendental_count <= 0 ? "<none>"
+            op.transcendental_count <= 0 ? ""
                                          : HumanReadableNumTranscendentalOps(
                                                op.transcendental_count, nsecs)
                                                .c_str(),
@@ -78,24 +87,26 @@ string HumanReadableProfileBuilder::ToString() const {
   int64 total_transcendentals = 0.;
   int64 total_bytes = 0;
   for (const auto& op : op_infos_) {
-    optimal_seconds_sum += op.optimal_seconds;
-    total_flops += op.flop_count;
-    total_transcendentals += op.transcendental_count;
-    total_bytes += op.bytes_accessed;
+    if (op.optimal_seconds > 0) {
+      optimal_seconds_sum += op.optimal_seconds;
+    }
+    total_flops += std::max(op.flop_count, int64{0});
+    total_transcendentals += std::max(op.transcendental_count, int64{0});
+    total_bytes += std::max(op.bytes_accessed, int64{0});
   }
 
   VLOG(1) << "Total floating point ops: " << total_flops;
 
-  append_op({"[total]", "[total]", /*category=*/"", total_cycles_, total_flops,
-             total_transcendentals, total_bytes, optimal_seconds_sum});
+  print_op({"[total]", "[total]", /*category=*/"", total_cycles_, total_flops,
+            total_transcendentals, total_bytes, optimal_seconds_sum});
 
-  // Sort ops in decreasing order of cycles.
+  // Sort ops in decreasing order of cycles, and print them.
   std::vector<OpInfo> sorted_ops(op_infos_);
   std::sort(
       sorted_ops.begin(), sorted_ops.end(),
       [](const OpInfo& a, const OpInfo& b) { return a.cycles > b.cycles; });
   for (const auto& op : sorted_ops) {
-    append_op(op);
+    print_op(op);
   }
 
   if (total_cycles_ <= 0) {
@@ -109,8 +120,20 @@ string HumanReadableProfileBuilder::ToString() const {
       table.SetMetricName("microseconds above estimated optimum");
       table.SetEntryName("ops");
       table.SetShowCategoryTable();
+      table.SetShowAllEntries();
       float total_discrepancy_in_microseconds = 0.0f;
-      for (const auto& op : sorted_ops) {
+      for (const auto& op : op_infos_) {
+        // Skip ops with < 0 optimal seconds.  These are ops for which we don't
+        // know the optimal time.
+        if (op.optimal_seconds < 0) {
+          continue;
+        }
+        // Also skip ops with 0 actual cycles.  These ops were free; there's no
+        // need to clutter the "above estimated optimum" table with them,
+        // because they can't be optimized further.
+        if (op.cycles == 0) {
+          continue;
+        }
         MetricTableReport::Entry entry;
         entry.text = op.name;
         entry.short_text = op.short_name;
@@ -128,7 +151,14 @@ string HumanReadableProfileBuilder::ToString() const {
       table.SetMetricName("microseconds");
       table.SetEntryName("ops");
       table.SetShowCategoryTable();
-      for (const auto& op : sorted_ops) {
+      table.SetShowAllEntries();
+      for (const auto& op : op_infos_) {
+        // Skip ops with 0 optimal seconds and 0 actual cycles.  As in
+        // print_op(), these are uninteresting because they're expected to be
+        // free, and they were actually free.
+        if (op.cycles == 0 && op.optimal_seconds == 0) {
+          continue;
+        }
         MetricTableReport::Entry entry;
         entry.text = op.name;
         entry.short_text = op.short_name;
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.h b/tensorflow/compiler/xla/service/human_readable_profile_builder.h
index fc24acd2713f4cd8af2816ffdf085e84a4920cbc..6f56c3aa82e9d1c942fd67ff7a5948cf2e54370d 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.h
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.h
@@ -32,7 +32,7 @@ class HumanReadableProfileBuilder {
   explicit HumanReadableProfileBuilder(tensorflow::StringPiece computation_name,
                                        int64 total_cycles,
                                        double clock_rate_ghz)
-      : computation_name_(computation_name.ToString()),
+      : computation_name_(std::string(computation_name)),
         total_cycles_(total_cycles),
         clock_rate_ghz_(clock_rate_ghz) {
     CHECK_GE(clock_rate_ghz, 1e-9);
@@ -41,15 +41,17 @@ class HumanReadableProfileBuilder {
   int64 total_cycles() const { return total_cycles_; }
 
   // Adds an operation to the profile.  If you don't know the number of
-  // floating-point ops or bytes touched by the op, pass -1 for that param.
+  // floating-point ops or bytes touched by the op, or if you don't know how
+  // fast it would run optimally, pass -1 for that param.
   void AddOp(tensorflow::StringPiece op_name,
              tensorflow::StringPiece short_name,
              tensorflow::StringPiece category, int64 cycles, int64 flop_count,
              int64 transcendental_count, int64 bytes_accessed,
              float optimal_seconds) {
-    op_infos_.push_back(
-        {op_name.ToString(), short_name.ToString(), category.ToString(), cycles,
-         flop_count, transcendental_count, bytes_accessed, optimal_seconds});
+    op_infos_.push_back({std::string(op_name), std::string(short_name),
+                         std::string(category), cycles, flop_count,
+                         transcendental_count, bytes_accessed,
+                         optimal_seconds});
   }
 
   // Gets the human-readable profile.
@@ -61,10 +63,10 @@ class HumanReadableProfileBuilder {
     string short_name;
     string category;
     int64 cycles;
-    int64 flop_count;
+    int64 flop_count;  // -1 if unknown
     int64 transcendental_count;
-    int64 bytes_accessed;
-    float optimal_seconds;
+    int64 bytes_accessed;   // -1 if unknown
+    float optimal_seconds;  // -1 if unknown
   };
 
   double CyclesToSeconds(int64 cycles) const {
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
new file mode 100644
index 0000000000000000000000000000000000000000..15b2d8f4990735c56f105e7c1b9b7dc70609d898
--- /dev/null
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
@@ -0,0 +1,269 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace xla {
+namespace gtl = ::tensorflow::gtl;
+
+namespace {
+using Analysis = IndexedArrayAnalysis;
+using UnknownArray = Analysis::UnknownArray;
+using ConstantArray = Analysis::ConstantArray;
+using ScalarIndexedArray = Analysis::ScalarIndexedArray;
+}  // namespace
+
+string IndexedArrayAnalysis::ToString(Array* root) {
+  switch (root->kind()) {
+    case Array::kUnknown: {
+      auto* unknown_tensor = root->as<UnknownArray>();
+      return tensorflow::strings::StrCat("%",
+                                         unknown_tensor->instruction().name());
+    }
+
+    case Array::kConstant: {
+      return tensorflow::strings::StrCat(
+          "(constant ", ShapeUtil::HumanString(root->shape()), ")");
+    }
+
+    case Array::kScalarIndexedConstant:
+    case Array::kScalarIndexed: {
+      auto* indexed_array = root->as<ScalarIndexedArray>();
+      string name = root->kind() == Array::kScalarIndexedConstant
+                        ? "scalar-indexed-const"
+                        : "scalar-indexed";
+      return tensorflow::strings::StrCat(
+          "(", name, " ", ToString(indexed_array->source()), " ",
+          ToString(indexed_array->indices()), " ", indexed_array->source_dim(),
+          "->[", tensorflow::str_util::Join(indexed_array->output_dims(), ","),
+          "])");
+    }
+  }
+}
+
+Analysis::Array* IndexedArrayAnalysis::GetArrayFor(
+    const HloInstruction* instr) {
+  auto it = cache_.find(instr);
+  if (it != cache_.end()) {
+    return it->second;
+  }
+
+  TraverseAndPopulateCache(instr);
+  return FindOrDie(cache_, instr);
+}
+
+void IndexedArrayAnalysis::TraverseAndPopulateCache(
+    const HloInstruction* root) {
+  // Depth first search over the DAG, invoking ComputeArrayFor in post order.
+  // The HLO instructions already in the cache are considered leaves.
+
+  gtl::InlinedVector<const HloInstruction*, 4> stack;
+
+  enum DfsState { kDiscovered, kVisited };
+  gtl::FlatMap<const HloInstruction*, DfsState> dfs_state_map;
+
+  stack.push_back(root);
+  InsertOrDie(&dfs_state_map, root, kDiscovered);
+
+  do {
+    const HloInstruction* instr = stack.back();
+    if (cache_.count(instr)) {
+      stack.pop_back();
+      continue;
+    }
+
+    switch (FindOrDie(dfs_state_map, instr)) {
+      case kDiscovered: {
+        for (const HloInstruction* operand : instr->operands()) {
+          if (!cache_.count(operand)) {
+            stack.push_back(operand);
+            CHECK(!dfs_state_map.count(operand) ||
+                  dfs_state_map[operand] == kDiscovered);
+            dfs_state_map[operand] = kDiscovered;
+          }
+        }
+        dfs_state_map[instr] = kVisited;
+        break;
+      }
+
+      case kVisited:
+        stack.pop_back();
+        InsertOrDie(&cache_, instr, ComputeArrayFor(instr));
+        break;
+    }
+  } while (!stack.empty());
+}
+
+Analysis::Array* IndexedArrayAnalysis::ComputeArrayFor(
+    const HloInstruction* instr) {
+  Array* computed_array;
+  switch (instr->opcode()) {
+    default:
+      computed_array = nullptr;
+      break;
+    case HloOpcode::kConstant:
+      computed_array = ComputeArrayForConstant(instr->literal());
+      break;
+    case HloOpcode::kGather:
+      computed_array = ComputeArrayForGather(
+          instr->shape(), instr->gather_dimension_numbers(),
+          instr->gather_window_bounds(), FindOrDie(cache_, instr->operand(0)),
+          FindOrDie(cache_, instr->operand(1)));
+      break;
+  }
+
+  if (!computed_array) {
+    computed_array = Construct<UnknownArray>(instr);
+  }
+
+  return computed_array;
+}
+
+Analysis::Array* IndexedArrayAnalysis::ComputeArrayForConstant(
+    const Literal& literal) {
+  return Construct<ConstantArray>(&literal);
+}
+
+ScalarIndexedArray* IndexedArrayAnalysis::FoldGatherOfGather(
+    ScalarIndexedArray* source, Array* indices, int64 source_dim,
+    tensorflow::gtl::ArraySlice<int64> output_dims, Shape shape) {
+  // We want to transform Gather(Gather(A, X), Y) => Gather(A, Gather(X, Y)).
+  // `source` is the inner Gather(A, X).
+
+  Array* a = source->source();
+  Array* x = source->indices();
+  Array* y = indices;
+
+  // This bit is slightly tricky, so we do a naive "simulation" of the two
+  // consecutive gather operations to infer what the composed gather should look
+  // like.
+
+  enum class IndexComponent { Ungathered, GatheredFirst, GatheredSecond };
+
+  std::vector<IndexComponent> simulated_index(a->shape().dimensions_size(),
+                                              IndexComponent::Ungathered);
+
+  // Simulate the first gather.
+  simulated_index.erase(simulated_index.begin() + source->source_dim());
+  for (int64 gather_dim : source->output_dims()) {
+    simulated_index.insert(simulated_index.begin() + gather_dim,
+                           IndexComponent::GatheredFirst);
+  }
+
+  // Simulate the second gather.
+  simulated_index.erase(simulated_index.begin() + source_dim);
+  for (int64 output_dim : output_dims) {
+    simulated_index.insert(simulated_index.begin() + output_dim,
+                           IndexComponent::GatheredSecond);
+  }
+
+  int64 source_dim_for_index_array =
+      FindIndex(source->output_dims(), source_dim);
+  CHECK_NE(source_dim_for_index_array, source->output_dims().size());
+
+  std::vector<int64> output_dims_for_index_array;
+  int64 gathered_index_components_seen = 0;
+  for (IndexComponent simulation_dim : simulated_index) {
+    if (simulation_dim == IndexComponent::GatheredSecond) {
+      output_dims_for_index_array.push_back(gathered_index_components_seen);
+    }
+    if (simulation_dim != IndexComponent::Ungathered) {
+      gathered_index_components_seen++;
+    }
+  }
+
+  std::vector<int64> dim_sizes_for_composed_index;
+  std::vector<int64> output_dims_for_new_gather;
+  for (int64 i = 0, e = simulated_index.size(); i < e; i++) {
+    if (simulated_index[i] != IndexComponent::Ungathered) {
+      dim_sizes_for_composed_index.push_back(shape.dimensions(i));
+      output_dims_for_new_gather.push_back(i);
+    }
+  }
+
+  Array* inner_indices = ConstructScalarIndexedArray(
+      x, y, source_dim_for_index_array, output_dims_for_index_array,
+      ShapeUtil::MakeShape(x->shape().element_type(),
+                           dim_sizes_for_composed_index));
+  return ConstructScalarIndexedArray(a, inner_indices, source->source_dim(),
+                                     output_dims_for_new_gather,
+                                     std::move(shape));
+}
+
+Analysis::Array* IndexedArrayAnalysis::ComputeArrayForGather(
+    const Shape& shape, const GatherDimensionNumbers& dim_numbers,
+    tensorflow::gtl::ArraySlice<int64> window_bounds, Array* source,
+    Array* indices) {
+  if (dim_numbers.index_vector_dim() != indices->shape().dimensions_size()) {
+    return nullptr;
+  }
+
+  CHECK_EQ(dim_numbers.gather_dims_to_operand_dims_size(), 1);
+  if (!c_binary_search(dim_numbers.elided_window_dims(),
+                       dim_numbers.gather_dims_to_operand_dims(0))) {
+    return nullptr;
+  }
+
+  int64 source_dim = dim_numbers.gather_dims_to_operand_dims(0);
+  std::vector<int64> output_dims;
+  for (int64 i = 0, e = shape.dimensions_size(); i < e; i++) {
+    if (!c_binary_search(dim_numbers.output_window_dims(), i)) {
+      output_dims.push_back(i);
+    }
+  }
+
+  if (auto* indexed = dynamic_cast<ScalarIndexedArray*>(source)) {
+    auto it = c_find(indexed->output_dims(), source_dim);
+    if (it != indexed->output_dims().end()) {
+      return FoldGatherOfGather(indexed, indices, source_dim, output_dims,
+                                shape);
+    }
+  } else if (auto* constant = dynamic_cast<ConstantArray*>(source)) {
+    return Construct<ScalarIndexedConstantArray>(constant, indices, source_dim,
+                                                 output_dims, shape);
+  }
+
+  return Construct<ScalarIndexedArray>(source, indices, source_dim, output_dims,
+                                       shape);
+}
+
+tensorflow::StringPiece IndexedArrayAnalysisPrinterPass::name() const {
+  return "indexed-array-analysis-printer-pass";
+}
+
+StatusOr<bool> IndexedArrayAnalysisPrinterPass::Run(HloModule* module) {
+  if (!VLOG_IS_ON(2)) {
+    return false;
+  }
+
+  IndexedArrayAnalysis analysis;
+  for (auto* computation : module->MakeNonfusionComputations()) {
+    for (auto* instr : computation->instructions()) {
+      auto* t = analysis.GetArrayFor(instr);
+      if (!dynamic_cast<UnknownArray*>(t) && !dynamic_cast<ConstantArray*>(t)) {
+        VLOG(2) << instr->ToString() << "   ->   " << analysis.ToString(t);
+      }
+    }
+  }
+
+  return false;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.h b/tensorflow/compiler/xla/service/indexed_array_analysis.h
new file mode 100644
index 0000000000000000000000000000000000000000..b132a8f25153d2e86e8aa477fdb851f1c9c8e719
--- /dev/null
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.h
@@ -0,0 +1,298 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INDEXED_ARRAY_ANALYSIS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_INDEXED_ARRAY_ANALYSIS_H_
+
+#include <type_traits>
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace xla {
+
+// IndexedArrayAnalysis decides if an HLO instruction can be rewritten as a
+// gather from another array.  It does this by mapping HLO instructions to
+// instances of IndexedArrayAnalysis::Array, which can be inspected to discover
+// whether said HLO is equivalent to a gather.
+class IndexedArrayAnalysis {
+ public:
+  // IndexedArrayAnalysis maps each HLO instruction to an instance of a Array.
+  // Array really just a sum type of the classes that inherit from it.  The
+  // meaning of each of the subtypes is documented on the subtype declaration.
+  //
+  // Array instances are immutable once created.
+  class Array {
+   public:
+    enum Kind { kUnknown, kConstant, kScalarIndexedConstant, kScalarIndexed };
+
+    virtual Kind kind() const = 0;
+    virtual const Shape& shape() const = 0;
+
+    // Does a checked downcast from `Array` to `T` which must be one of its
+    // subtypes.
+    template <typename T>
+    T* as() {
+      static_assert((std::is_base_of<Array, T>::value),
+                    "target type not derived from source type");
+      // We skip the CHECK and hence the dynamic_cast if RTTI is disabled.
+#if !defined(__GNUC__) || defined(__GXX_RTTI)
+      CHECK_NE(dynamic_cast<T*>(this), nullptr);
+#endif  // !defined(__GNUC__) || defined(__GXX_RTTI)
+
+      return static_cast<T*>(this);
+    }
+
+    virtual ~Array() = default;
+
+    Array& operator=(const Array& other) = delete;
+  };
+
+  // Represents an HLO instruction that was not analyzable by this
+  // IndexedArrayAnalysis.  Instances of UnknownArray just wrap an existing
+  // HloInstruction.
+  class UnknownArray : public Array {
+   public:
+    Kind kind() const override { return kUnknown; }
+    const Shape& shape() const override { return instruction().shape(); }
+    const HloInstruction& instruction() const { return instruction_; }
+
+   private:
+    explicit UnknownArray(const HloInstruction* instr) : instruction_(*instr) {}
+
+    const HloInstruction& instruction_;
+
+    friend class IndexedArrayAnalysis;
+  };
+
+  // Represents a constant value.  This constant value may be present in the HLO
+  // module being analyzed, or it could have been created on the fly by the
+  // analysis.
+  class ConstantArray : public Array {
+   public:
+    Kind kind() const override { return kConstant; }
+    const Shape& shape() const override { return literal()->shape(); }
+    const Literal* literal() const { return literal_; }
+
+   private:
+    explicit ConstantArray(const Literal* literal) : literal_(literal) {}
+    const Literal* literal_;
+
+    friend class IndexedArrayAnalysis;
+  };
+
+  // ---------------------------------------------------------------------------
+  // Indexed Array Overview
+  // ---------------------------------------------------------------------------
+  //
+  // ScalarIndexedArray and ScalarIndexedConstantArray form the core of this
+  // analysis.  ScalarIndexedConstantArray is just a specialization of
+  // ScalarIndexedArray so we will only discuss ScalarIndexedArray in this
+  // overview.
+  //
+  // A ScalarIndexedArray represents an array that can be computed by indexing
+  // into a "source" array using an "indices" tensor.  A simple example is a
+  // gather operation gathering 12 rows out of a [100,100] matrix -- such an
+  // operation will be represented by an instance of a ScalarIndexedArray with
+  // the [100,100] matrix as the "source" array and the [12]-shaped indices
+  // array as the "indices" tensor.  The ScalarIndexedArray operation itself
+  // will be of shape [12,100] (assuming we were gathering with axis=0).
+  //
+  // Gather operations are not the only operation that maps to
+  // ScalarIndexedArray instances (if that were true there would be little point
+  // in having a separate analysis).  We can often infer ScalarIndexedArrays for
+  // other operations too.  For instance, consider:
+  //
+  //   %source = f32[100,100] constant
+  //   %indices = s32[12] ...
+  //   %gather = f32[12,100] ... gather from %source using %indices at axis 0
+  //   %dot = dot(%gather, other_constant) [canonical contracting dims]
+  //
+  // The dot operation itself is also a ScalarIndexedArray with source =
+  // dot(constant, other_constant) and indices = %indices.  A reshape of %gather
+  // to [12,5,20] too is a ScalarIndexedArray with source = an appropriately
+  // reshaped constant and indices = %indices.
+
+  // Represents the result of a gather operation.  This gather operation may
+  // explicitly be present in the HLO module being analyzed, or it could have
+  // been created on the fly by the analysis.
+  //
+  // An instance of ScalarIndexedArray represents a array whose I'th element can
+  // be mapped to the J'th element of the `source` array (where I and J are
+  // multidimensional indices) in this way:
+  //
+  //   I' = remove components at positions `output_dims` from I
+  //   G' = remove components not at positions `output_dims` from I
+  //   T  = indices[G']
+  //   J  = I' with T inserted at position `source_dim`
+  //
+  // For example, if source is of shape [11,13,17,19], indices is of shape
+  // [23,29], output_dims is [0,2] and source_dim is 2 then the output is of
+  // shape [23,11,29,19] and the output index [A,B,C,D,E] is mapped to the input
+  // index [B,D,indices[A,C],E].
+  class ScalarIndexedArray : public Array {
+   public:
+    Kind kind() const override { return kScalarIndexed; }
+    const Shape& shape() const override { return shape_; }
+
+    Array* source() const { return source_; }
+    Array* indices() const { return indices_; }
+    int64 source_dim() const { return source_dim_; }
+    tensorflow::gtl::ArraySlice<int64> output_dims() const {
+      return output_dims_;
+    }
+
+   private:
+    explicit ScalarIndexedArray(Array* source, Array* indices, int64 source_dim,
+                                std::vector<int64> output_dims, Shape shape)
+        : source_(source),
+          indices_(indices),
+          source_dim_(source_dim),
+          output_dims_(std::move(output_dims)),
+          shape_(std::move(shape)) {}
+
+    Array* source_;
+    Array* indices_;
+    int64 source_dim_;
+    std::vector<int64> output_dims_;
+    Shape shape_;
+
+    friend class IndexedArrayAnalysis;
+  };
+
+  // A ScalarIndexedConstantArray is just a ScalarIndexedArray constrained to
+  // have a ConstantArray instance as the source.  This is an ergonomic
+  // concession -- in theory it is possible to just keep ScalarIndexedArray and
+  // check source()->kind().
+  class ScalarIndexedConstantArray : public ScalarIndexedArray {
+   public:
+    Kind kind() const override { return kScalarIndexedConstant; }
+
+    const Literal& literal() const {
+      return *source()->as<ConstantArray>()->literal();
+    }
+
+   private:
+    explicit ScalarIndexedConstantArray(Array* source, Array* indices,
+                                        int64 source_dim,
+                                        std::vector<int64> output_dims,
+                                        Shape shape)
+        : ScalarIndexedArray(source, indices, source_dim,
+                             std::move(output_dims), std::move(shape)) {
+      CHECK(dynamic_cast<ConstantArray*>(source));
+    }
+
+    friend class IndexedArrayAnalysis;
+  };
+
+  // Returns an Array instance for `instr`.  The IndexedArrayAnalysis instance
+  // keeps ownership of the returned Array instance.
+  //
+  // Caching Behavior: IndexedArrayAnalysis has a cache mapping HLO
+  // instructions to IndexedArrayAnalysis::Array instances.  This entire cache
+  // becomes stale and may cause the analysis to return incorrect results if any
+  // transitive operand (stopping at the containing computation) is modified for
+  // any HLO instruction on which GetArrayFor has been invoked.
+  //
+  // NB!  By inspecting the implementation, you may be able to infer a stronger
+  // caching guarantee than what is mentioned above.  Nevertheless, what is
+  // stated above is the contract.
+  Array* GetArrayFor(const HloInstruction* instr);
+
+  // Pretty-prints the expression rooted at `root`.
+  string ToString(Array* root);
+
+ private:
+  // Helper function that ensures that every HLO instruction that is
+  // transitively used by `root` has an entry in `cache_`.
+  void TraverseAndPopulateCache(const HloInstruction* root);
+
+  // Creates an Array instance for `instr` under the assumption that all
+  // operations of `instr` are present in `cache_`.
+  Array* ComputeArrayFor(const HloInstruction* instr);
+
+  Array* ComputeArrayForConstant(const Literal& literal);
+
+  Array* ComputeArrayForGather(const Shape& shape,
+                               const GatherDimensionNumbers& dim_numbers,
+                               tensorflow::gtl::ArraySlice<int64> window_bounds,
+                               Array* source, Array* indices);
+
+  // This tries to fold a ScalarIndexedArray which has another
+  // ScalarIndexedArray as a source into a ScalarIndexedArray that instead has a
+  // ScalarIndexedArray as indices.  If `source` happened to be a
+  // ScalarIndexedConstantArray this can result in an expression that is more
+  // canonical.
+  //
+  // As an example, consider a gather operation, G0, gathering 7 elements from
+  // an array "Arr" of shape [100] resulting in an array of shape [7], and a
+  // second gather operation, G1, which gathers 3 elements out of the result of
+  // G0 resulting in an array of shape [3].  Let the indices uses by G0 be I0
+  // (of shape [7]) and the indices used by G1 be I1 (of shape [3]).  We can
+  // instead rewrite G1 to gather directly from "Arr" with the three indices
+  // from I0 as per I1.  In other words, we can rewrite:
+  //
+  //    G0 = [Arr[i] for i in I0]
+  //    G1 = [G0[i]  for i in I1]
+  //
+  // into
+  //
+  //    I2 = [I0[i]  for i in I1]
+  //    G1 = [Arr[i] for i in I2]
+  ScalarIndexedArray* FoldGatherOfGather(
+      ScalarIndexedArray* source, Array* indices, int64 source_dim,
+      tensorflow::gtl::ArraySlice<int64> output_dims, Shape shape);
+
+  template <typename T, typename... Args>
+  T* Construct(Args&&... args) {
+    T* new_tensor = new T(std::forward<Args>(args)...);
+    owned_tensors_.push_back(std::unique_ptr<T>(new_tensor));
+    return new_tensor;
+  }
+
+  ScalarIndexedArray* ConstructScalarIndexedArray(
+      Array* source, Array* indices, int64 source_dim,
+      std::vector<int64> output_dims, Shape shape) {
+    if (source->kind() == Array::kConstant) {
+      return Construct<ScalarIndexedConstantArray>(source, indices, source_dim,
+                                                   std::move(output_dims),
+                                                   std::move(shape));
+    } else {
+      return Construct<ScalarIndexedArray>(source, indices, source_dim,
+                                           std::move(output_dims),
+                                           std::move(shape));
+    }
+  }
+
+  std::vector<std::unique_ptr<Array>> owned_tensors_;
+  std::vector<std::unique_ptr<Literal>> owned_literals_;
+  tensorflow::gtl::FlatMap<const HloInstruction*, Array*> cache_;
+};
+
+// A pass that prints all non-trivial results returned by IndexedArrayAnalysis.
+// This pass is a no-op if !VLOG_IS_ON(2) so it should be fine to
+// unconditionally add to the regular HLO pass pipeline.
+class IndexedArrayAnalysisPrinterPass : public HloPassInterface {
+ public:
+  tensorflow::StringPiece name() const override;
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INDEXED_ARRAY_ANALYSIS_H_
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2731b7c51a45c4f9b713d99ef3e4623ad2c9c83
--- /dev/null
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
@@ -0,0 +1,191 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
+#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+
+namespace xla {
+namespace {
+class IndexedArrayAnalysisTest : public HloVerifiedTestBase {
+ protected:
+  void AssertArrayForRootExpressionIs(const string& hlo_text,
+                                      const string& root_expression) {
+    IndexedArrayAnalysis indexed_tensor_analysis;
+    ParseAndVerifyModule(hlo_text);
+
+    string result =
+        indexed_tensor_analysis.ToString(indexed_tensor_analysis.GetArrayFor(
+            module().entry_computation()->root_instruction()));
+    LOG(INFO) << result;
+    ASSERT_EQ(result, root_expression);
+  }
+};
+
+TEST_F(IndexedArrayAnalysisTest, SimpleOneToOneGather) {
+  string hlo_text = R"(
+HloModule SimpleGather
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[5] parameter(1)
+  ROOT gather = s32[5,3] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      window_bounds={1,3}
+}
+)";
+
+  AssertArrayForRootExpressionIs(hlo_text,
+                                 "(scalar-indexed %operand %indices 0->[0])");
+}
+
+TEST_F(IndexedArrayAnalysisTest, SimpleOneToOneConstantGather) {
+  string hlo_text = R"(
+HloModule SimpleGather
+
+ENTRY main {
+  operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
+  indices = s32[5] parameter(0)
+  ROOT gather = s32[5,3] gather(operand, indices),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      window_bounds={1,3}
+}
+)";
+
+  AssertArrayForRootExpressionIs(
+      hlo_text, "(scalar-indexed-const (constant s32[3,3]) %indices 0->[0])");
+}
+
+TEST_F(IndexedArrayAnalysisTest, GatherOfGather_OneToOne) {
+  string hlo_text = R"(
+HloModule SimpleGather
+
+ENTRY main {
+  operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
+  indices_a = s32[5] parameter(0)
+  indices_b = s32[2] parameter(1)
+  gather_a = s32[5,3] gather(operand, indices_a),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      window_bounds={1,3}
+  ROOT gather_b = s32[2,3] gather(gather_a, indices_b),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      window_bounds={1,3}
+}
+)";
+
+  AssertArrayForRootExpressionIs(
+      hlo_text,
+      "(scalar-indexed-const (constant s32[3,3]) (scalar-indexed %indices_a "
+      "%indices_b 0->[0]) 0->[0])");
+}
+
+TEST_F(IndexedArrayAnalysisTest, GatherOfGather_ManyToOneWithOneToOne) {
+  string hlo_text = R"(
+HloModule SimpleGather
+
+ENTRY main {
+  operand = s32[3,2] parameter(0)
+  indices_a = s32[5,7] parameter(1)
+  indices_b = s32[2] parameter(2)
+  gather_a = s32[5,3,7] gather(operand, indices_a),
+      output_window_dims={1},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=2,
+      window_bounds={3,1}
+  ROOT gather_b = s32[5,3,2] gather(gather_a, indices_b),
+      output_window_dims={0,1},
+      elided_window_dims={2},
+      gather_dims_to_operand_dims={2},
+      index_vector_dim=1,
+      window_bounds={5,3,1}
+}
+)";
+
+  AssertArrayForRootExpressionIs(hlo_text,
+                                 "(scalar-indexed %operand (scalar-indexed "
+                                 "%indices_a %indices_b 1->[1]) 1->[0,2])");
+}
+
+TEST_F(IndexedArrayAnalysisTest, GatherOfGather_OneToOneWithManyToOne) {
+  string hlo_text = R"(
+HloModule SimpleGather
+
+ENTRY main {
+  operand = s32[3,6] parameter(0)
+  indices_a = s32[2] parameter(1)
+  indices_b = s32[5,7] parameter(2)
+  gather_a = s32[2,6] gather(operand, indices_a),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=1,
+      window_bounds={1,6}
+  ROOT gather_b = s32[5,6,7] gather(gather_a, indices_b),
+      output_window_dims={1},
+      elided_window_dims={0},
+      gather_dims_to_operand_dims={0},
+      index_vector_dim=2,
+      window_bounds={1,6}
+}
+)";
+
+  AssertArrayForRootExpressionIs(hlo_text,
+                                 "(scalar-indexed %operand (scalar-indexed "
+                                 "%indices_a %indices_b 0->[0,1]) 0->[0,2])");
+}
+
+TEST_F(IndexedArrayAnalysisTest, GatherOfGather_ManyToOneWithManyToOne) {
+  string hlo_text = R"(
+HloModule SimpleGather
+
+ENTRY main {
+  operand = s32[3,2] parameter(0)
+  indices_a = s32[5,7] parameter(1)
+  indices_b = s32[4,8] parameter(2)
+  gather_a = s32[5,3,7] gather(operand, indices_a),
+      output_window_dims={1},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=2,
+      window_bounds={3,1}
+  ROOT gather_b = s32[4,5,3,8] gather(gather_a, indices_b),
+      output_window_dims={1,2},
+      elided_window_dims={2},
+      gather_dims_to_operand_dims={2},
+      index_vector_dim=2,
+      window_bounds={5,3,1}
+}
+)";
+
+  AssertArrayForRootExpressionIs(
+      hlo_text,
+      "(scalar-indexed %operand (scalar-indexed %indices_a %indices_b "
+      "1->[0,2]) 1->[0,1,3])");
+}
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/inliner_test.cc b/tensorflow/compiler/xla/service/inliner_test.cc
index 7aa1c7c8358318d02a000d968a2672123400ad6e..d2af261008f40ee83e0676cfc7e67c45f8be1844 100644
--- a/tensorflow/compiler/xla/service/inliner_test.cc
+++ b/tensorflow/compiler/xla/service/inliner_test.cc
@@ -71,7 +71,7 @@ TEST_F(InlinerTest, MapMax) {
   // Verify execution on CPU.
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
   auto expected = Literal::CreateR1<float>({4, 3, 3, 4});
-  LiteralTestUtil::ExpectEqual(*result, *expected);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected));
 }
 
 // Test that `constant` function is changed to `broadcast`.
@@ -105,7 +105,7 @@ TEST_F(InlinerTest, MapConstant) {
   // Verify execution on CPU.
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
   auto expected = Literal::CreateR2<float>({{2, 2, 2, 2}, {2, 2, 2, 2}});
-  LiteralTestUtil::ExpectEqual(*result, *expected);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected));
 }
 
 TEST_F(InlinerTest, MapSubtractOppositeOrder) {
@@ -143,7 +143,7 @@ TEST_F(InlinerTest, MapSubtractOppositeOrder) {
   // Verify execution on CPU.
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
   auto expected = Literal::CreateR1<float>({3, 1, -1, -3});
-  LiteralTestUtil::ExpectEqual(*result, *expected);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected));
 }
 
 
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index dc1a39e9fa9fd3ef5c55bd86309fe23f5ef51dd5..cb6c98c48171a06539499b723a8d8b7aa0ccc96a 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -28,6 +28,25 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
+namespace {
+// These nodes can always be duplicated into consumers, even if
+// InstructionFusion::may_duplicate_ is false.
+//
+// In general these should be nodes that get *cheaper* the more they're
+// duplicated (and fused into consumers).
+//
+// TODO(jlebar): Duplicating instructions when we have a variable called "may
+// duplicate" that's equal to false is not pretty.
+bool IsAlwaysDuplicable(const HloInstruction& instruction) {
+  // We are always willing to duplicate a widening type-conversion instruction
+  // if it means we can fuse the convert into a consumer.  This allows the
+  // consumer to read less memory, which is almost always a performance win.
+  return instruction.opcode() == HloOpcode::kConvert &&
+         ShapeUtil::ByteSizeOf(instruction.operand(0)->shape()) <
+             ShapeUtil::ByteSizeOf(instruction.shape());
+}
+}  // namespace
+
 /*static*/ bool InstructionFusion::IsExpensive(
     const HloInstruction& instruction) {
   switch (instruction.opcode()) {
@@ -101,11 +120,13 @@ namespace xla {
     case HloOpcode::kDivide:
     case HloOpcode::kDot:
     case HloOpcode::kExp:
+    case HloOpcode::kExpm1:
     case HloOpcode::kFft:
     case HloOpcode::kFusion:
     case HloOpcode::kGather:
     case HloOpcode::kHostCompute:
     case HloOpcode::kLog:
+    case HloOpcode::kLog1p:
     case HloOpcode::kMap:
     case HloOpcode::kParameter:
     case HloOpcode::kPower:
@@ -393,12 +414,9 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
   return changed;
 }
 
-HloInstruction* InstructionFusion::Fuse(HloInstruction* producer,
-                                        HloInstruction* consumer) {
+HloInstruction* InstructionFusion::AddFusionInstruction(
+    HloInstruction* producer, HloInstruction* consumer) {
   HloInstruction* fusion_instruction;
-
-  VLOG(2) << "Fusing " << producer->ToString() << " into "
-          << consumer->ToString();
   auto kind = ChooseKind(producer, consumer);
   if (consumer->opcode() == HloOpcode::kFusion) {
     fusion_instruction = consumer;
@@ -410,17 +428,35 @@ HloInstruction* InstructionFusion::Fuse(HloInstruction* producer,
         HloInstruction::CreateFusion(consumer->shape(), kind, consumer));
     TF_CHECK_OK(computation_->ReplaceInstruction(consumer, fusion_instruction));
   }
+  return fusion_instruction;
+}
 
+HloInstruction* InstructionFusion::Fuse(HloInstruction* producer,
+                                        HloInstruction* consumer) {
+  VLOG(2) << "Fusing " << producer->ToString() << " into "
+          << consumer->ToString();
+  HloInstruction* fusion_instruction = AddFusionInstruction(producer, consumer);
   fusion_instruction->FuseInstruction(producer);
   return fusion_instruction;
 }
 
+HloInstruction* InstructionFusion::FuseIntoMultiOutput(
+    HloInstruction* producer, HloInstruction* consumer) {
+  VLOG(2) << "Multi-output fusing " << producer->ToString() << " into "
+          << consumer->ToString();
+  HloInstruction* fusion_instruction = AddFusionInstruction(producer, consumer);
+  fusion_instruction->FuseInstructionIntoMultiOutput(producer);
+  return fusion_instruction;
+}
+
 bool InstructionFusion::ShouldFuse(HloInstruction* consumer,
                                    int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
+
   // Cost condition: don't duplicate expensive instructions.
   if (FusionWouldDuplicate(*producer, *consumer) &&
-      (is_expensive_(*producer) || !may_duplicate_)) {
+      (!may_duplicate_ || is_expensive_(*producer)) &&
+      !IsAlwaysDuplicable(*producer)) {
     return false;
   }
 
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h
index 2ea1fcf937ceaf2cce3f8ed0891399384d93dbd0..c3c2ed0aaa81d6f346ec6e70d9c8b3b923e0a3d2 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/instruction_fusion.h
@@ -70,6 +70,13 @@ class InstructionFusion : public HloPassInterface {
   virtual HloInstruction* Fuse(HloInstruction* producer,
                                HloInstruction* consumer);
 
+  // Creates a new fusion instruction containing `producer` and `consumer`. A
+  // tuple is added as the fusion instruction's root, which consumes from both,
+  // `producer` and `consumer`. This style of fusion is referred to as
+  // multi-output fusion.
+  virtual HloInstruction* FuseIntoMultiOutput(HloInstruction* producer,
+                                              HloInstruction* consumer);
+
   // An "effectively unary" operation is one that has at most one "large"
   // input with the others being negligible in terms of memory usage.
   // We use "has a smaller true rank than the output" as a heuristic
@@ -95,6 +102,9 @@ class InstructionFusion : public HloPassInterface {
   // The set of producers whose consumers we cannot fuse into.
   using DoNotFuseSet = std::unordered_set<HloInstruction*>;
 
+  HloInstruction* AddFusionInstruction(HloInstruction* producer,
+                                       HloInstruction* consumer);
+
   // Whether or not we can fuse producer into consumer on all paths
   // from the producer to the consumer where nodes are HLOs and edges are uses.
   bool CanFuseOnAllPaths(HloInstruction* producer, HloInstruction* consumer,
diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
index e78b99a80cf41318faa1cb709428b8ba0f531944..df109df7877eefe4c337f93cc5a3a7a48e2e76c7 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@@ -21,8 +21,95 @@ limitations under the License.
 
 namespace xla {
 
+namespace op = xla::testing::opcode_matchers;
+
 using InstructionFusionTest = HloTestBase;
 
+// Subclass of InstructionFusion exposing the protected methods Fuse and
+// FuseIntoMultiOutput for testing.
+class InstructionFusionForTesting : public InstructionFusion {
+ public:
+  explicit InstructionFusionForTesting(HloModule* module)
+      : InstructionFusion(InstructionFusion::IsExpensive) {
+    module_ = module;
+    computation_ = module->entry_computation();
+  }
+
+  HloInstruction* Fuse(HloInstruction* producer,
+                       HloInstruction* consumer) override {
+    return InstructionFusion::Fuse(producer, consumer);
+  }
+
+  HloInstruction* FuseIntoMultiOutput(HloInstruction* producer,
+                                      HloInstruction* consumer) override {
+    return InstructionFusion::FuseIntoMultiOutput(producer, consumer);
+  }
+};
+
+TEST_F(InstructionFusionTest, FuseInstructions) {
+  auto module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY entry_computation {
+    p0 = f32[4,3]{1,0} parameter(0)
+    add = f32[4,3]{1,0} add(p0, p0)
+    ROOT sub = f32[4,3]{1,0} subtract(add, p0)
+  })")
+                    .ValueOrDie();
+  HloInstruction* sub = module->entry_computation()->root_instruction();
+  HloInstruction* add = sub->mutable_operand(0);
+  HloInstruction* fusion =
+      InstructionFusionForTesting(module.get()).Fuse(add, sub);
+
+  ASSERT_THAT(fusion, op::Fusion()) << module->ToString();
+  EXPECT_THAT(fusion->fused_expression_root(),
+              op::Subtract(op::Add(), op::Parameter()))
+      << module->ToString();
+}
+
+TEST_F(InstructionFusionTest, FuseIntoFusionInstruction) {
+  auto module = tools::Parse(R"(
+  HloModule test_module
+  fused_computation {
+    p1 = f32[4,3] parameter(0)
+    add = f32[4,3] add(p1, p1)
+  }
+  ENTRY entry_computation {
+    p0 = f32[4,3] parameter(0)
+    abs = f32[4,3] abs(p0)
+    ROOT fusion = f32[4,3] fusion(abs), kind=kLoop, calls=fused_computation
+  })")
+                    .ValueOrDie();
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  HloInstruction* abs = root->mutable_operand(0);
+  HloInstruction* fusion =
+      InstructionFusionForTesting(module.get()).Fuse(abs, root);
+
+  ASSERT_THAT(fusion, op::Fusion()) << module->ToString();
+  EXPECT_THAT(fusion->fused_expression_root(), op::Add(op::Abs(), op::Abs()))
+      << module->ToString();
+}
+
+TEST_F(InstructionFusionTest, FuseInstructionsIntoMultiOutput) {
+  auto module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY entry_computation {
+    p0 = f32[4,3]{1,0} parameter(0)
+    abs = f32[4,3]{1,0} abs(p0)
+    tanh = f32[4,3]{1,0} tanh(abs)
+    ROOT add = f32[4,3]{1,0} add(abs, tanh)
+  })")
+                    .ValueOrDie();
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  HloInstruction* abs = root->mutable_operand(0);
+  HloInstruction* tanh = root->mutable_operand(1);
+  HloInstruction* fusion =
+      InstructionFusionForTesting(module.get()).FuseIntoMultiOutput(abs, tanh);
+
+  ASSERT_THAT(fusion, op::Fusion()) << module->ToString();
+  EXPECT_THAT(fusion->fused_expression_root(), op::Tuple(op::Tanh(), op::Abs()))
+      << module->ToString();
+}
+
 TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfParameterUnfused) {
   HloComputation::Builder builder(TestName());
   auto param0 = builder.AddInstruction(
@@ -90,7 +177,8 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusable) {
   EXPECT_FALSE(
       InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
           .Run(module.get())
-          .ValueOrDie());
+          .ValueOrDie())
+      << module->ToString();
 }
 
 // Counts the number of HLO ops with a given op code in the specified module.
@@ -124,7 +212,7 @@ TEST_F(InstructionFusionTest, FuseCheapNonDuplicatableOps) {
   EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString();
 
   // Make sure the add hasn't been duplicated.
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString();
+  EXPECT_EQ(Count(*module, HloOpcode::kAdd), 1) << module->ToString();
 }
 
 TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) {
@@ -149,7 +237,11 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) {
           .Run(module.get())
           .ValueOrDie())
       << module->ToString();
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString();
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion());
+  EXPECT_THAT(root->fused_expression_root(),
+              op::Subtract(op::Abs(op::Parameter()), op::Parameter()))
+      << module->ToString();
 
   // Make sure the add hasn't been duplicated.
   EXPECT_EQ(Count(*module, HloOpcode::kAdd), 1) << module->ToString();
@@ -242,7 +334,12 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) {
           .Run(module.get())
           .ValueOrDie())
       << module->ToString();
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString();
+  root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion());
+  EXPECT_THAT(root->fused_expression_root(),
+              op::Tuple(op::Subtract(op::Parameter(), op::Parameter()),
+                        op::Subtract(op::Parameter(), op::Parameter())))
+      << module->ToString();
 
   // Make sure we didn't duplicate any adds.
   EXPECT_EQ(Count(*module, HloOpcode::kAdd), 2) << module->ToString();
@@ -291,4 +388,29 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) {
           .ValueOrDie());
 }
 
+TEST_F(InstructionFusionTest,
+       WideningConvertsAreAlwaysDuplicableIntoConsumers) {
+  auto module = tools::Parse(R"(
+  HloModule test_module
+  ENTRY Test {
+    p0 = f16[100] parameter(0)
+    c = f32[100] convert(p0)
+    add = f32[100] add(c, c)
+    ROOT mul = f32[100] multiply(c, c)
+  })")
+                    .ValueOrDie();
+
+  // The convert should be fused into the add and mul, even though may_duplicate
+  // is false, because it's always beneficial to fuse/duplicate widening
+  // converts into consumers.
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/false)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion(op::Parameter()));
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 45505484951abfcee93a62fec7a99e86cbb9150c..524d3234eb4eff9c7d000eca1a0d9f5c4fae90af 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -18,7 +18,6 @@ cc_library(
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/compiler/xla/service/interpreter:platform_id",
         "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor_no_cuda",
     ],
     alwayslink = True,  # Contains per-platform transfer manager registration
 )
@@ -117,6 +116,5 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_headers_lib",
-        "//tensorflow/core:stream_executor_no_cuda",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index eecbbcb93df64b09acb5e009d3db79e51dab0c93..c1666530687f2f8407a9dcb4e271c9d95552a689 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -34,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -45,8 +44,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
 
   pipeline.AddPass<LayoutAssignment>(
-      hlo_module->device_entry_computation_layout());
-
+      hlo_module->mutable_device_entry_computation_layout());
   return pipeline.Run(hlo_module).status();
 }
 
@@ -71,7 +69,8 @@ StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
 
   // Create executable from only the Hlo module.
   std::unique_ptr<Executable> executable =
-      xla::MakeUnique<InterpreterExecutable>(std::move(hlo_module));
+      xla::MakeUnique<InterpreterExecutable>(std::move(hlo_module),
+                                             xla::MakeUnique<HloEvaluator>());
 
   return std::move(executable);
 }
@@ -101,17 +100,14 @@ HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction()
   return InterpreterExecutable::ShapeSizeBytes;
 }
 
-static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
-  return xla::MakeUnique<xla::ComputationPlacer>();
-}
-
 static bool InitModule() {
   xla::Compiler::RegisterCompilerFactory(
       se::interpreter::kXlaInterpreterPlatformId, []() {
         return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
       });
   xla::ComputationPlacer::RegisterComputationPlacer(
-      se::interpreter::kXlaInterpreterPlatformId, &CreateComputationPlacer);
+      se::interpreter::kXlaInterpreterPlatformId,
+      []() { return xla::MakeUnique<xla::ComputationPlacer>(); });
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 61f199bc9e8f4f95a2f097af4abf9395a1e05f64..029e71058a7373b9310c6d9ffdb65f72ca28e5af 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/interpreter/executor.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
@@ -32,16 +31,17 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace xla {
 namespace interpreter {
 
 InterpreterExecutable::InterpreterExecutable(
-    std::unique_ptr<const HloModule> hlo_module)
+    std::unique_ptr<const HloModule> hlo_module,
+    std::unique_ptr<HloEvaluator> evaluator)
     : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,
-                 /*hlo_profile_index_map=*/nullptr) {}
+                 /*hlo_profile_index_map=*/nullptr),
+      evaluator_(std::move(evaluator)) {}
 
 InterpreterExecutable::~InterpreterExecutable() {}
 
@@ -82,10 +82,13 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
   }
 
   // Execute the graph using the HloEvaluator.
-  HloEvaluator evaluator;
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Literal> result_literal,
-      evaluator.Evaluate<std::unique_ptr<Literal>>(*computation, arg_literals));
+  std::unique_ptr<Literal> result_literal;
+  {
+    tensorflow::mutex_lock lock(evaluator_lock_);
+    TF_ASSIGN_OR_RETURN(result_literal,
+                        evaluator_->Evaluate<std::unique_ptr<Literal>>(
+                            *computation, arg_literals));
+  }
 
   // Transform the result literal back into a ShapedBuffer.
   TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index b0b797ca7d6f449a11c662ffba7c2a0a0040e47e..91d8148d26dc8eddbafdaf4870d9efbb73a12816 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
@@ -30,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -40,13 +42,15 @@ namespace interpreter {
 // buffer allocation. Refer to interpreter/README.md for more.
 class InterpreterExecutable : public Executable {
  public:
-  InterpreterExecutable(std::unique_ptr<const HloModule> hlo_module);
+  InterpreterExecutable(std::unique_ptr<const HloModule> hlo_module,
+                        std::unique_ptr<HloEvaluator> evaluator);
   ~InterpreterExecutable() override;
 
   StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
+      HloExecutionProfile* hlo_execution_profile) override
+      LOCKS_EXCLUDED(evaluator_lock_);
 
   StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
@@ -54,6 +58,11 @@ class InterpreterExecutable : public Executable {
 
   static int64 ShapeSizeBytes(const Shape& shape);
 
+ protected:
+  // The interpreter interprets executables with an HloEvaluator.
+  std::unique_ptr<HloEvaluator> evaluator_ PT_GUARDED_BY(evaluator_lock_);
+  mutable tensorflow::mutex evaluator_lock_;
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(InterpreterExecutable);
 };
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc
index 92e069a8c67c1d441ba9d396dee503c9b3bde0df..42c2c28997d5f3b02f1fe4effca164c893e4071d 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/service/interpreter/executor.h"
-#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
 #include "tensorflow/stream_executor/device_options.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/ptr_util.h"
@@ -31,13 +30,13 @@ limitations under the License.
 namespace stream_executor {
 namespace interpreter {
 
-XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {}
+XlaInterpreterPlatform::XlaInterpreterPlatform(const string& name,
+                                               const Platform::Id& id)
+    : name_(name), id_(id) {}
 
 XlaInterpreterPlatform::~XlaInterpreterPlatform() {}
 
-Platform::Id XlaInterpreterPlatform::id() const {
-  return kXlaInterpreterPlatformId;
-}
+Platform::Id XlaInterpreterPlatform::id() const { return id_; }
 
 int XlaInterpreterPlatform::VisibleDeviceCount() const { return 1; }
 
@@ -106,8 +105,6 @@ REGISTER_MODULE_INITIALIZER(
     interpreter_platform,
     stream_executor::interpreter::InitializeXlaInterpreterPlatform());
 
-DECLARE_MODULE_INITIALIZER(multi_platform_manager);
-
 // Note that module initialization sequencing is not supported in the
 // open-source project, so this will be a no-op there.
 REGISTER_MODULE_INITIALIZER_SEQUENCE(interpreter_platform,
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.h b/tensorflow/compiler/xla/service/interpreter/platform.h
index d68c5aa20dda7ac246ed4aa667851e385a604c04..0187f6d473b19f50136e214708e56f833627d9d1 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <string>
 
+#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
 #include "tensorflow/stream_executor/executor_cache.h"
 #include "tensorflow/stream_executor/plugin.h"
 #include "tensorflow/stream_executor/stream_executor.h"
@@ -28,7 +29,8 @@ namespace interpreter {
 
 class XlaInterpreterPlatform : public Platform {
  public:
-  XlaInterpreterPlatform();
+  XlaInterpreterPlatform(const string& name = "Interpreter",
+                         const Platform::Id& id = kXlaInterpreterPlatformId);
   ~XlaInterpreterPlatform() override;
 
   Platform::Id id() const override;
@@ -55,6 +57,8 @@ class XlaInterpreterPlatform : public Platform {
  private:
   // This platform's name.
   string name_;
+  // This platform's id.
+  Platform::Id id_;
 
   // Cache of created StreamExecutors.
   ExecutorCache executor_cache_;
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index cfa7ba5e81ddd003978a2bd763384581c55b5c83..7067b6f86a0fb24fb946ad236bca9bbd48d53722 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -31,10 +31,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -400,9 +402,9 @@ string LayoutConstraints::ToString() const {
 }
 
 Status LayoutAssignment::AddMandatoryConstraints(
-    const ComputationLayout& computation_layout,
-    const ChannelLayoutConstraints* channel_constraints,
-    HloComputation* computation, LayoutConstraints* constraints) {
+    const ComputationLayout* computation_layout,
+    ChannelLayoutConstraints* channel_constraints, HloComputation* computation,
+    LayoutConstraints* constraints) {
   VLOG(3) << "Adding mandatory layout constraints to computation "
           << computation->name();
 
@@ -424,11 +426,16 @@ Status LayoutAssignment::AddMandatoryConstraints(
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
           instruction->outfeed_shape(), instruction, 0));
     } else if (instruction->opcode() == HloOpcode::kParameter) {
-      // Parameter layouts must match the respective layout in
-      // ComputationLayout.
-      shape_with_layout =
-          &computation_layout.parameter_layout(instruction->parameter_number())
-               .shape();
+      if (computation_layout != nullptr) {
+        const ShapeLayout& parameter_layout =
+            computation_layout->parameter_layout(
+                instruction->parameter_number());
+        if (parameter_layout.LayoutIsSet()) {
+          // Parameter layouts must match the respective layout in
+          // ComputationLayout, if there is one.
+          shape_with_layout = &parameter_layout.shape();
+        }
+      }
     }
     if (shape_with_layout != nullptr) {
       TF_RETURN_IF_ERROR(
@@ -493,9 +500,8 @@ Status LayoutAssignment::AddMandatoryConstraints(
       HloComputation* body = instruction->while_body();
       HloComputation* condition = instruction->while_condition();
       const HloInstruction* init = instruction->operand(0);
-      const ComputationLayout& body_layout =
-          FindOrDie(computation_layouts_, body);
-      const ComputationLayout& condition_layout =
+      ComputationLayout& body_layout = FindOrDie(computation_layouts_, body);
+      ComputationLayout& condition_layout =
           FindOrDie(computation_layouts_, condition);
 
       // Check a few invariants irrespective of layout.
@@ -508,26 +514,19 @@ Status LayoutAssignment::AddMandatoryConstraints(
                                    condition_layout.parameter_shape(0)));
       DCHECK(ShapeUtil::Compatible(body_layout.result_shape(), init->shape()));
 
-      // Return error if earlier layout assignment of the embedded computations
-      // has produced conflicting layouts.
-      if (!ShapeUtil::Equal(body_layout.result_shape(),
-                            body_layout.parameter_shape(0))) {
-        return InternalError(
-            "Parameter and result of body computation %s of while instruction "
-            "%s have different layouts: %s vs %s",
-            body->name().c_str(), instruction->name().c_str(),
-            ShapeUtil::HumanString(body_layout.result_shape()).c_str(),
-            ShapeUtil::HumanString(body_layout.parameter_shape(0)).c_str());
+      if (body_layout.result_layout() != body_layout.parameter_layout(0)) {
+        VLOG(2) << "Reset %while body parameter layout: body=" << body->name()
+                << " while=" << instruction->name()
+                << " shape=" << body_layout.result_layout().ToString();
+        *body_layout.mutable_parameter_layout(0) = body_layout.result_layout();
       }
-      if (!ShapeUtil::Equal(body->root_instruction()->shape(),
-                            condition->parameter_instruction(0)->shape())) {
-        return InternalError(
-            "Parameter of condition computation %s of while instruction "
-            "%s does not match body computation %s result: %s vs %s",
-            condition->name().c_str(), instruction->name().c_str(),
-            body->name().c_str(),
-            ShapeUtil::HumanString(condition_layout.parameter_shape(0)).c_str(),
-            ShapeUtil::HumanString(body_layout.result_shape()).c_str());
+      if (condition_layout.parameter_layout(0) !=
+          body_layout.parameter_layout(0)) {
+        VLOG(2) << "Reset %while condition parameter layout: cond="
+                << condition->name() << " while=" << instruction->name()
+                << " shape=" << body_layout.parameter_layout(0).ToString();
+        *condition_layout.mutable_parameter_layout(0) =
+            body_layout.parameter_layout(0);
       }
 
       // Constrain the output and the operand of the while instruction to match
@@ -557,7 +556,20 @@ Status LayoutAssignment::AddMandatoryConstraints(
                                    true_computation_layout.parameter_shape(0)));
       DCHECK(ShapeUtil::Compatible(
           false_operand->shape(), false_computation_layout.parameter_shape(0)));
-
+      if (true_computation_layout.result_layout() !=
+          false_computation_layout.result_layout()) {
+        // We assign layouts in DFS fashion, so the true and false computations
+        // might have negotiated a different layout. But for the conditional
+        // instruction POV the layout must match, so we run again on the false
+        // computation, this time with proper computation layout.
+        VLOG(2) << "Reset %conditional false computation result layout: "
+                   "false_computation="
+                << false_computation->name()
+                << " conditional=" << instruction->name() << " shape="
+                << true_computation_layout.result_layout().ToString();
+        *false_computation_layout.mutable_result_layout() =
+            true_computation_layout.result_layout();
+      }
       TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
           true_computation_layout.result_shape(), instruction));
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
@@ -593,10 +605,14 @@ Status LayoutAssignment::AddMandatoryConstraints(
       }
     }
   }
-
-  // Finally set the result layout to match ComputationLayout.
-  return constraints->SetResultLayout(
-      computation_layout.result_layout().shape());
+  // Finally set the result layout to match ComputationLayout, if there is one.
+  if (computation_layout != nullptr) {
+    const ShapeLayout& result_layout = computation_layout->result_layout();
+    if (result_layout.LayoutIsSet()) {
+      TF_RETURN_IF_ERROR(constraints->SetResultLayout(result_layout.shape()));
+    }
+  }
+  return Status::OK();
 }
 
 namespace {
@@ -760,6 +776,7 @@ StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
     HloInstruction* copy =
         instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
             instruction->shape(), HloOpcode::kCopy, instruction));
+    RegisterAddedCopy(copy);
     SetupCopiedInstruction(*instruction, copy, {});
     LayoutUtil::ClearLayout(copy->mutable_shape());
     TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
@@ -783,13 +800,19 @@ Status LayoutAssignment::CopyOperandIfLayoutsDiffer(
   TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape()));
 
   if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) {
+    VLOG(5) << "Operand " << operand->ToString() << " layout matches in "
+            << instruction->ToString();
     // Operand layout already matches our constraint. Nothing to do.
     return Status::OK();
   }
+  VLOG(4) << "Operand " << operand->ToString() << " layout does not match "
+          << operand_layout.ToString() << " in " << instruction->ToString();
 
   TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy,
                       CreateCopyWithNewLayout(operand_layout.shape(), operand));
 
+  VLOG(4) << "New copy of " << operand->ToString() << " is "
+          << operand_copy->ToString();
   return instruction->ReplaceOperandWith(operand_no, operand_copy);
 }
 
@@ -896,32 +919,31 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) {
       }
     }
   }
-
-  // Finally verify the result layout matches the layout of the entry
+  // Finally verify the result layout, if set, matches the layout of the entry
   // computation root.
-  TF_RET_CHECK(ShapeUtil::Equal(
-      module->entry_computation()->root_instruction()->shape(),
+  const ShapeLayout& result_layout =
       FindOrDie(computation_layouts_, module->entry_computation())
-          .result_layout()
-          .shape()));
-
+          .result_layout();
+  if (result_layout.LayoutIsSet()) {
+    TF_RET_CHECK(ShapeUtil::Equal(
+        module->entry_computation()->root_instruction()->shape(),
+        result_layout.shape()));
+  }
   return Status::OK();
 }
 
 LayoutAssignment::LayoutAssignment(
-    const ComputationLayout& entry_computation_layout,
+    ComputationLayout* entry_computation_layout,
     ChannelLayoutConstraints* channel_constraints)
     : entry_computation_layout_(entry_computation_layout),
       channel_layout_constraints_(channel_constraints) {
-  VLOG(1) << "entry computation layout given to layout assignment: "
-          << entry_computation_layout_.ToString();
+  VLOG(1) << "Entry computation layout given to layout assignment: "
+          << entry_computation_layout_->ToString();
   // Layouts of all parameter instructions must be set.
   for (const ShapeLayout& parameter_layout :
-       entry_computation_layout_.parameter_layouts()) {
+       entry_computation_layout_->parameter_layouts()) {
     CHECK(parameter_layout.LayoutIsSet());
   }
-  // TODO(b/29118294): Choose a better layout if the result layout is not set.
-  CHECK(entry_computation_layout_.result_layout().LayoutIsSet());
 }
 
 std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
@@ -1481,16 +1503,60 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
   return Status::OK();
 }
 
+Status LayoutAssignment::CalculateComputationLayout(
+    HloComputation* computation) {
+  ComputationLayout computation_layout(computation->ComputeProgramShape(),
+                                       /*ignore_layouts=*/false);
+  InsertOrDie(&computation_layouts_, computation, computation_layout);
+  VLOG(2) << "  Calculated ComputationLayout = "
+          << computation_layout.ToString();
+  return Status::OK();
+}
+
+Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) {
+  // Clear existing layouts of the instructions.  All layouts must be assigned
+  // by the LayoutAssignment pass, except for those on infeeds, parameters,
+  // and the computation result. The latter two are specified in
+  // computation_layout, so we only need to keep the existing layouts for
+  // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
+  // layout assignment pass that may accidently use the existing layout.
+  for (HloInstruction* instruction : computation->instructions()) {
+    if (instruction->opcode() == HloOpcode::kBitcast) {
+      // bitcasts are inherently layout sensitive and so a bitcast instruction
+      // present in the IR before layout assignment is a bug.
+      return InternalError(
+          "Unexpected bitcast operation seen during layout assignment: %s.",
+          instruction->ToString().c_str());
+    }
+    if (instruction->opcode() != HloOpcode::kInfeed) {
+      LayoutUtil::ClearLayout(instruction->mutable_shape());
+    }
+  }
+  return Status::OK();
+}
+
 Status LayoutAssignment::RunOnComputation(
-    const ComputationLayout& computation_layout,
+    ComputationLayout* computation_layout,
     const TuplePointsToAnalysis& points_to_analysis,
     HloComputation* computation,
     ChannelLayoutConstraints* channel_constraints) {
-  DCHECK(computation_layout.LayoutIsSet());
-  InsertOrDie(&computation_layouts_, computation, computation_layout);
   VLOG(2) << "LayoutAssignment::RunOnComputation(" << computation->name()
           << ")";
-  VLOG(2) << "  ComputationLayout = " << computation_layout.ToString();
+  TF_RETURN_IF_ERROR(ClearComputationLayouts(computation));
+  if (computation_layout != nullptr) {
+    auto it = computation_layouts_.find(computation);
+    if (it == computation_layouts_.end()) {
+      VLOG(2) << "  New ComputationLayout = " << computation_layout->ToString();
+      computation_layouts_.emplace(computation, *computation_layout);
+    } else {
+      TF_RET_CHECK(computation_layout == &it->second ||
+                   computation_layout == entry_computation_layout_);
+      VLOG(2) << "  Existing ComputationLayout = "
+              << computation_layout->ToString();
+    }
+  } else {
+    VLOG(2) << "  No ComputationLayout specified (will be calculated)";
+  }
 
   // Construct LayoutConstraints with all layout constraints of the computation.
   LayoutConstraints constraints(points_to_analysis, computation);
@@ -1533,12 +1599,19 @@ Status LayoutAssignment::RunOnComputation(
     CHECK_LT(constraints.unconstrained_buffer_ids().size(),
              unconstrained_count);
   }
-
   // All logical buffers should have constraints at this point. All that
   // remains is assign the constraints to the buffers and infer layouts for
   // aliased buffers.
   TF_RETURN_IF_ERROR(AssignLayouts(constraints, computation));
 
+  // If the computation layout wasn't specified, now it is the time to compute
+  // it according to the parameters and root instruction layouts.
+  // This allows the first pass through this API to record the best flowing
+  // layout to parameters and root instruction.
+  if (computation_layout == nullptr) {
+    TF_RETURN_IF_ERROR(CalculateComputationLayout(computation));
+  }
+
   // Record the layouts assigned for any communication ops in
   // channel_constraints so that they are constrained for future modules.
   for (HloInstruction* instruction : computation->instructions()) {
@@ -1553,6 +1626,34 @@ Status LayoutAssignment::RunOnComputation(
   return Status::OK();
 }
 
+Status LayoutAssignment::PropagateComputationLayouts(
+    HloComputation* computation, ComputationLayout* computation_layout) {
+  ComputationLayout computed_computation_layout(
+      computation->ComputeProgramShape(),
+      /*ignore_layouts=*/false);
+  for (int64 i = 0; i < computed_computation_layout.parameter_count(); ++i) {
+    ShapeLayout* param_layout = computation_layout->mutable_parameter_layout(i);
+    if (!param_layout->LayoutIsSet()) {
+      VLOG(4) << "Assigning layout to parameter " << i << " of computation "
+              << computation->name() << ": "
+              << computed_computation_layout.parameter_layout(i).ToString();
+      *param_layout = computed_computation_layout.parameter_layout(i);
+    } else {
+      TF_RET_CHECK(computed_computation_layout.parameter_layout(i) ==
+                   *param_layout);
+    }
+  }
+  ShapeLayout* result_layout = computation_layout->mutable_result_layout();
+  if (!result_layout->LayoutIsSet()) {
+    VLOG(4) << "Assigning result layout of computation " << computation->name()
+            << ": " << computed_computation_layout.result_layout().ToString();
+    *result_layout = computed_computation_layout.result_layout();
+  } else {
+    TF_RET_CHECK(computed_computation_layout.result_layout() == *result_layout);
+  }
+  return Status::OK();
+}
+
 StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   VLOG(2) << "Running layout assignment on module " << module->name();
   XLA_VLOG_LINES(3, module->ToString());
@@ -1561,52 +1662,45 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
                                 "before layout assignment",
                                 module->config().debug_options());
   }
-
-  TF_ASSIGN_OR_RETURN(auto points_to_analysis,
-                      TuplePointsToAnalysis::Run(module));
-
-  // Assign layouts to computations in an order such that a callee computation
-  // is handled before its caller computation. This ensures that the layout of
-  // all callers of a computation will agree.
-  std::list<HloComputation*> computation_post_order =
-      module->MakeComputationPostOrder();
-  for (auto* computation : module->MakeComputationPostOrder()) {
-    if (computation->IsFusionComputation()) {
-      continue;
-    }
-    // Clear existing layouts of the instructions.  All layouts must be assigned
-    // by the LayoutAssignment pass, except for those on infeeds, parameters,
-    // and the computation result. The latter two are specified in
-    // computation_layout, so we only need to keep the existing layouts for
-    // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
-    // layout assignment pass that may accidently use the existing layout.
-    for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() == HloOpcode::kBitcast) {
-        // bitcasts are inherently layout sensitive and so a bitcast instruction
-        // present in the IR before layout assignment is a bug.
-        return InternalError(
-            "Unexpected bitcast operation seen during layout assignment: %s.",
-            instruction->ToString().c_str());
+  TF_RETURN_IF_ERROR(Init());
+
+  // We do two passes. The first one we pass a nullptr ComputationLayout to
+  // the RunOnComputation() calls (for non entry computations), and we register
+  // the ComputationLayout which are naturally flowing in DFS fashion to the
+  // parameters and root instruction.
+  // Walking in DFS mode though, means that we can end up with incorrect layouts
+  // when seen from an outer instruction, which has across-computation
+  // constraints to impose.
+  // For example, the kWhile instruction needs to enforce the same layouts for
+  // the parameters and root of the bosy, as well as the condition parameters.
+  // Similarly, the kConditional instruction needs to enforce the same layouts
+  // for the root of the true and false computations.
+  // So in the first pass, while allowing the layouts to flow to parameters and
+  // root, we also fix up the eventually inconsistent ComputationLayout, which
+  // will be then made mandatory by the second pass.
+  for (int64 i = 0; i < 2; ++i) {
+    TF_RETURN_IF_ERROR(ClearPreviousPassSideEffects(module));
+    TF_ASSIGN_OR_RETURN(auto points_to_analysis,
+                        TuplePointsToAnalysis::Run(module));
+    for (auto* computation : module->MakeComputationPostOrder()) {
+      if (computation->IsFusionComputation()) {
+        continue;
       }
-      if (instruction->opcode() != HloOpcode::kInfeed) {
-        LayoutUtil::ClearLayout(instruction->mutable_shape());
+      if (computation == module->entry_computation()) {
+        TF_RETURN_IF_ERROR(RunOnComputation(
+            entry_computation_layout_, *points_to_analysis,
+            module->entry_computation(), channel_layout_constraints_));
+      } else {
+        ComputationLayout* computation_layout =
+            (i == 0) ? nullptr : &FindOrDie(computation_layouts_, computation);
+        TF_RETURN_IF_ERROR(RunOnComputation(computation_layout,
+                                            *points_to_analysis, computation,
+                                            channel_layout_constraints_));
       }
     }
-    if (computation == module->entry_computation()) {
-      TF_RETURN_IF_ERROR(RunOnComputation(
-          entry_computation_layout_, *points_to_analysis,
-          module->entry_computation(), channel_layout_constraints_));
-    } else {
-      ComputationLayout computation_layout(computation->ComputeProgramShape());
-      // Setting all embedded computations to the default layout is potentially
-      // suboptimal.
-      computation_layout.SetToDefaultLayout();
-      TF_RETURN_IF_ERROR(RunOnComputation(computation_layout,
-                                          *points_to_analysis, computation,
-                                          channel_layout_constraints_));
-    }
   }
-
+  TF_RETURN_IF_ERROR(PropagateComputationLayouts(module->entry_computation(),
+                                                 entry_computation_layout_));
   TF_RETURN_IF_ERROR(CheckLayouts(module));
 
   VLOG(3) << "After layout assignment:";
@@ -1616,9 +1710,54 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
                                 "after layout assignment",
                                 module->config().debug_options());
   }
-
   // All layouts are reset then reassigned by this pass.
   return true;
 }
 
+Status LayoutAssignment::Init() {
+  computation_layouts_.clear();
+  return Status::OK();
+}
+
+Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) {
+  // Clear all the copies which have been added, and all the related
+  // instructions (like GTE and tuples).
+  int64 removed_copies = 0;
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction :
+         computation->MakeInstructionPostOrder()) {
+      if (instruction->opcode() == HloOpcode::kCopy &&
+          added_copies_.count(instruction) > 0) {
+        VLOG(5) << "Removing added copy: " << instruction->ToString();
+        TF_RETURN_IF_ERROR(
+            instruction->ReplaceAllUsesWith(instruction->mutable_operand(0)));
+        TF_RETURN_IF_ERROR(computation->RemoveInstruction(instruction));
+        ++removed_copies;
+      }
+    }
+  }
+  added_copies_.clear();
+  if (removed_copies > 0) {
+    TupleSimplifier tuple_simplifier;
+    HloDCE dce;
+    TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
+    TF_RETURN_IF_ERROR(dce.Run(module).status());
+  }
+  return Status::OK();
+}
+
+Status LayoutAssignment::AddCopyForOperand(HloInstruction* instruction,
+                                           int64 operand_number) {
+  HloInstruction* operand = instruction->mutable_operand(operand_number);
+  if (operand->opcode() != HloOpcode::kCopy || operand->user_count() > 1) {
+    HloInstruction* copy =
+        instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
+            operand->shape(), HloOpcode::kCopy, operand));
+    SetupCopiedInstruction(*operand, copy, {});
+    LayoutUtil::ClearLayout(copy->mutable_shape());
+    TF_RETURN_IF_ERROR(instruction->ReplaceOperandWith(operand_number, copy));
+  }
+  return Status::OK();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 9663a793fdd7d4968700707a1003319e89ea19a3..c287cca0c54ba1bb514bd8d243c137eca99b258f 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -288,7 +289,7 @@ class LayoutAssignment : public HloPassInterface {
   // If channel_constraints is nullptr, no kSend or kRecvs must be contained
   // within any module passed to `Run`.
   explicit LayoutAssignment(
-      const ComputationLayout& entry_computation_layout,
+      ComputationLayout* entry_computation_layout,
       ChannelLayoutConstraints* channel_constraints = nullptr);
   ~LayoutAssignment() override {}
   tensorflow::StringPiece name() const override { return "layout-assignment"; }
@@ -362,12 +363,15 @@ class LayoutAssignment : public HloPassInterface {
       int64 operand_no);
 
  private:
+  // Initializes the layout assignment object for a new Run() call.
+  Status Init();
+
   // Adds constraints which must be satisfied for correctness on all
   // backends. Called once prior to propagating constraints.
-  Status AddMandatoryConstraints(
-      const ComputationLayout& computation_layout,
-      const ChannelLayoutConstraints* channel_constraints,
-      HloComputation* computation, LayoutConstraints* constraints);
+  Status AddMandatoryConstraints(const ComputationLayout* computation_layout,
+                                 ChannelLayoutConstraints* channel_constraints,
+                                 HloComputation* computation,
+                                 LayoutConstraints* constraints);
 
   // This method can be overridden to add backend-specific constraints to the
   // layout of the instructions of a computation. This method is called after
@@ -378,10 +382,12 @@ class LayoutAssignment : public HloPassInterface {
   }
 
   // Construct contraints and assign layouts to all instructions in the
-  // computation satisfying the given ComputationLayout. Layouts constraints are
-  // added, then propagated until all LogicalBuffers in the computation are
-  // constrained.
-  Status RunOnComputation(const ComputationLayout& computation_layout,
+  // computation satisfying the given ComputationLayout, if not nullptr.
+  // Otherwise the ComputationLayout will be calculated by propagating the
+  // computation instruction contraints.
+  // Layouts constraints are added, then propagated until all LogicalBuffers in
+  // the computation are constrained.
+  Status RunOnComputation(ComputationLayout* computation_layout,
                           const TuplePointsToAnalysis& points_to_analysis,
                           HloComputation* computation,
                           ChannelLayoutConstraints* channel_constraints);
@@ -402,7 +408,26 @@ class LayoutAssignment : public HloPassInterface {
   // necessary conditions.
   Status CheckLayouts(HloModule* module);
 
-  const ComputationLayout& entry_computation_layout_;
+  // Computes the ComputationLayout of the given computation based of the
+  // layouts assigned to parameters and root instruction, and inserts it to the
+  // computation_layouts_ map.
+  Status CalculateComputationLayout(HloComputation* computation);
+
+  // Clears all the layouts which can be cleared within a computation.
+  Status ClearComputationLayouts(HloComputation* computation);
+
+  // Clears the side effects of a previous pass, like added copy instructions.
+  Status ClearPreviousPassSideEffects(HloModule* module);
+
+  // Propagates the layouts computed by the layout assignment pass on the given
+  // computation, to the computation layout passed in to this API.
+  // This API propagates missing layout, and also checks that the caller
+  // specified have been respected, by comparing those with the parameters and
+  // root computation instruction.
+  Status PropagateComputationLayouts(HloComputation* computation,
+                                     ComputationLayout* computation_layout);
+
+  ComputationLayout* entry_computation_layout_;
 
  protected:
   // Sets up the copy instruction according to the characteristic (sharding,
@@ -418,21 +443,37 @@ class LayoutAssignment : public HloPassInterface {
   // Creates and returns a copy of the given instruction with a different
   // layout. Tuple-shaped instructions will be deep-copied, and the last Tuple
   // instruction producing the copy is returned.
-  static StatusOr<HloInstruction*> CreateCopyWithNewLayout(
+  StatusOr<HloInstruction*> CreateCopyWithNewLayout(
       const Shape& shape_with_layout, HloInstruction* instruction);
 
   // Creates a copy of the given operand if the operand's layout does not match
   // the given layout. This copy replaces the use in the given instruction.
   // Tuple operands will be deep-copied.
-  static Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
-                                           HloInstruction* instruction,
-                                           int64 operand_no);
+  Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
+                                    HloInstruction* instruction,
+                                    int64 operand_no);
+
+  // Registers a copy instruction added by the layout assignment pass.
+  void RegisterAddedCopy(HloInstruction* copy) {
+    CHECK_EQ(copy->opcode(), HloOpcode::kCopy);
+    added_copies_.insert(copy);
+  }
+
+  // Adds a copy for the operand of an instruction, unless such operand is
+  // already a copy, and has a single user (which is forcibly the instruction
+  // itself).
+  Status AddCopyForOperand(HloInstruction* instruction, int64 operand_number);
 
   // Map containing the layouts of all computations assigned so
   // far. Computations are handled in a topological sort where computations are
   // handled before their caller instructions so the layouts of caller
   // instructions can be set to match the computation.
   std::map<HloComputation*, ComputationLayout> computation_layouts_;
+
+  // Every copy added to the module by the layout assignment pass is registered
+  // here.
+  tensorflow::gtl::FlatSet<HloInstruction*> added_copies_;
+
   ChannelLayoutConstraints* channel_layout_constraints_;
 };
 
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 7e1bb11eaada0e62b82c50903c9848f0a3a8307b..7508013199a82267efc0e1426cb5989d5fe844a0 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -53,7 +53,7 @@ class LayoutAssignmentTest : public HloTestBase {
  protected:
   void AssignLayouts(HloModule* module,
                      ComputationLayout* entry_computation_layout) {
-    LayoutAssignment layout_assignment(*entry_computation_layout);
+    LayoutAssignment layout_assignment(entry_computation_layout);
     EXPECT_IS_OK(layout_assignment.Run(module).status());
   }
 };
@@ -285,7 +285,7 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
   TF_CHECK_OK(computation_layout.mutable_result_layout()->CopyLayoutFromShape(
       result_shape));
 
-  LayoutAssignment layout_assignment(computation_layout);
+  LayoutAssignment layout_assignment(&computation_layout);
   AssignLayouts(module.get(), &computation_layout);
 
   // Layout assignment should have deep copied the result of the computation to
@@ -488,7 +488,7 @@ class OperandsMustBeTheSameLayoutAssignment : public LayoutAssignment {
  public:
   explicit OperandsMustBeTheSameLayoutAssignment(
       ComputationLayout* entry_computation_layout)
-      : LayoutAssignment(*entry_computation_layout) {}
+      : LayoutAssignment(entry_computation_layout) {}
 
  protected:
   Status PropagateBufferConstraint(
@@ -660,13 +660,12 @@ TEST_F(LayoutAssignmentTest, TransposeWithinFusionDoesNotCrash) {
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
 
-  EXPECT_EQ(
-      ::tensorflow::Status::OK(),
-      backend()
-          .compiler()
-          ->RunBackend(std::move(module), backend().default_stream_executor(),
-                       /*device_allocator=*/nullptr)
-          .status());
+  EXPECT_EQ(Status::OK(), backend()
+                              .compiler()
+                              ->RunBackend(std::move(module),
+                                           backend().default_stream_executor(),
+                                           /*device_allocator=*/nullptr)
+                              .status());
 }
 
 // A GTE inside of a fusion node inherits the layout of its operand (which
@@ -808,7 +807,7 @@ TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) {
 
   ComputationLayout computation_layout(
       module->entry_computation()->ComputeProgramShape());
-  LayoutAssignment layout_assignment(computation_layout);
+  LayoutAssignment layout_assignment(&computation_layout);
   Status error_status = layout_assignment.Run(module.get()).status();
   EXPECT_FALSE(error_status.ok());
   EXPECT_THAT(
diff --git a/tensorflow/compiler/xla/service/liveness_util.cc b/tensorflow/compiler/xla/service/liveness_util.cc
deleted file mode 100644
index 68c99256a246edcf43a8358f667fc4458b9b4fea..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/liveness_util.cc
+++ /dev/null
@@ -1,379 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/liveness_util.h"
-
-#include <algorithm>
-#include <utility>
-#include <vector>
-
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
-
-namespace xla {
-
-bool DoesNotUseOperandBuffer(const HloInstruction* operand,
-                             const ShapeIndex& index,
-                             const HloInstruction* user,
-                             const TuplePointsToAnalysis& points_to_analysis) {
-  CHECK(user->IsUserOf(operand))
-      << "user: " << user->ToString() << " operand: " << operand->ToString();
-  if (user->opcode() == HloOpcode::kGetTupleElement && !index.empty()) {
-    // GetTupleElement instructions only access the top-level buffer of their
-    // operand.
-    return true;
-  } else if (user->opcode() == HloOpcode::kFusion &&
-             user->fusion_kind() == HloInstruction::FusionKind::kLoop) {
-    // Find fusion parameter associated with 'operand'.
-    auto it = std::find_if(
-        user->fused_parameters().begin(), user->fused_parameters().end(),
-        [=](HloInstruction* fused_param) {
-          return user->operand(fused_param->parameter_number()) == operand;
-        });
-    CHECK(it != user->fused_parameters().end());
-    // Iterate through all users of all buffer aliases of the buffer in the
-    // points-to set of fusion parameter at 'index'.
-    // Return false if any uses are detected at 'index', returns true otherwise.
-    const LogicalBuffer* buffer =
-        points_to_analysis.GetBufferDefinedAt(*it, index).ValueOrDie();
-    for (const BufferAlias& alias :
-         points_to_analysis.GetBufferAliases(*buffer)) {
-      for (HloInstruction* alias_user : alias.instruction()->users()) {
-        if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(),
-                                    alias_user, points_to_analysis)) {
-          continue;
-        }
-        // Return false: use detected at 'buffer' -> 'alias' -> 'alias_user'.
-        return false;
-      }
-    }
-    // Return true: found no uses of 'operand' at 'index' in 'user'.
-    return true;
-  }
-  return false;
-}
-
-bool DoesNotUseOperandBuffer(const HloInstruction* operand,
-                             const ShapeIndex& index,
-                             const HloInstruction* user,
-                             const HloDataflowAnalysis& dataflow) {
-  CHECK(user->IsUserOf(operand))
-      << "user: " << user->ToString() << " operand: " << operand->ToString();
-  if (user->opcode() == HloOpcode::kFusion &&
-      user->fusion_kind() == HloInstruction::FusionKind::kLoop) {
-    // Find fusion parameter associated with 'operand'.
-    HloInstruction* fusion_param =
-        user->fused_parameter(user->operand_index(operand));
-    // Iterate through all users of all uses of the fusion parameter value.
-    // Return false if any uses are detected, returns true otherwise.
-    const HloValue& value = dataflow.GetValueDefinedAt(fusion_param, index);
-    return value.uses().empty();
-  } else {
-    // Return false if no value at 'operand' and 'index' is used at 'user'.
-    for (const HloValue* value :
-         dataflow.GetValueSet(operand, index).values()) {
-      for (const HloUse& use : value->uses()) {
-        if (use.instruction == user) {
-          return false;
-        }
-      }
-    }
-  }
-
-  return true;
-}
-
-namespace {
-
-// Returns all uses of all aliases of 'instruction' at 'index' in 'uses'.
-// Each use in 'uses' is a pair (HloInstruction* user, int64 operand_index)
-// where 'user' is a user of an alias of 'instruction' at 'index', and
-// 'operand_index' is the operand index at which the alias appears in the
-// operand list of 'user'.
-std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex(
-    HloInstruction* instruction, const ShapeIndex& index,
-    const TuplePointsToAnalysis& points_to_analysis) {
-  std::vector<std::pair<HloInstruction*, int64>> uses;
-  const PointsToSet::BufferList& points_to =
-      points_to_analysis.GetPointsToSet(instruction).element(index);
-  for (const LogicalBuffer* buffer : points_to) {
-    for (const BufferAlias& alias :
-         points_to_analysis.GetBufferAliases(*buffer)) {
-      for (HloInstruction* alias_user : alias.instruction()->users()) {
-        if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(),
-                                    alias_user, points_to_analysis)) {
-          continue;
-        }
-        for (int64 op_idx : alias_user->OperandIndices(alias.instruction())) {
-          uses.emplace_back(alias_user, op_idx);
-        }
-      }
-    }
-  }
-  return uses;
-}
-
-// Returns true if there is exactly one use of 'operand' at 'operand_index'
-// in 'fusion.fused_instructions', where the singleton use is the fused
-// root at operand index 'use_operand_index'. Returns false otherwise.
-//
-// REQUIRES: 'fusion' opcode is a kFusion instruction.
-bool HasUniqueFusedUseOfOperandAt(
-    HloInstruction* operand, const ShapeIndex& operand_index,
-    HloInstruction* fusion, const int64 use_operand_index,
-    const TuplePointsToAnalysis& points_to_analysis) {
-  CHECK_EQ(HloOpcode::kFusion, fusion->opcode());
-  // Check that 'operand' is unique in the operand list of 'fusion'.
-  if (fusion->OperandIndices(operand).size() > 1) {
-    return false;
-  }
-  // Find fusion parameter associated with 'operand'.
-  const auto& fused_params = fusion->fused_parameters();
-  auto fused_param_it = std::find_if(
-      fused_params.begin(), fused_params.end(),
-      [&](HloInstruction* fused_param) {
-        return fusion->operand(fused_param->parameter_number()) == operand;
-      });
-  if (fused_param_it == fused_params.end()) {
-    return false;
-  }
-  auto* fused_param = *fused_param_it;
-  // Get all uses of 'operand' at 'index' from 'fusion.fused_instructions'.
-  auto fused_param_uses = GetAllUsesOfInstructionAtIndex(
-      fused_param, operand_index, points_to_analysis);
-  // Return true iff there is exactly one use of 'operand' at 'index', and
-  // this singleton use is the fused root (at index in 'use_operand_indices').
-  return fused_param_uses.size() == 1 &&
-         fused_param_uses[0].first == fusion->fused_expression_root() &&
-         fused_param_uses[0].second == use_operand_index;
-}
-
-}  // namespace
-
-// User and operand can share buffers iff both instructions emit the same shape
-// and layout, and 'user' meets one of the following qualifications:
-//
-// (1) Is element-wise. Or...
-// (2) Is a loop fusion instruction where the only use of 'operand' at 'index'
-//     in the set 'user.fused_instructions' is a DynamicUpdateSlice fused root
-//     at operand 0. Or...
-// (3) Is a kDot -> kAdd (or fused kTransposeDot -> kAdd) output fusion
-//     instruction where the only use of 'operand' at 'index' in the set
-//     'user.fused_instructions' is a kAdd fused root at operand 0 or 1. Or...
-// (4) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index
-//     0.
-//
-// (2) and (3) can only be determined if points-to analysis is available.
-bool CanShareOperandBufferWithUser(
-    HloInstruction* operand, const ShapeIndex& operand_index,
-    HloInstruction* user, const ShapeIndex& user_index,
-    const TuplePointsToAnalysis& points_to_analysis) {
-  CHECK(user->IsUserOf(operand))
-      << "user: " << user->ToString() << " operand: " << operand->ToString();
-  const Shape& operand_subshape =
-      ShapeUtil::GetSubshape(operand->shape(), operand_index);
-  const Shape& user_subshape =
-      ShapeUtil::GetSubshape(user->shape(), user_index);
-  // Check that operand and user emit the same shape and layout.
-  if (!ShapeUtil::Equal(operand_subshape, user_subshape)) {
-    return false;
-  }
-  if (user->opcode() == HloOpcode::kFusion) {
-    if (user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
-        user->fused_expression_root()->opcode() ==
-            HloOpcode::kDynamicUpdateSlice) {
-      // Loop fusion with kDynamicUpdateSlice fused root.
-      //
-      // Returns true iff there is exactly one use of 'operand' at shape index
-      // 'operand_index', and this singleton use is the fused root at operand
-      // index 0.
-      return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0,
-                                          points_to_analysis);
-    } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
-               user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
-      // Output fusion with kAdd fused root.
-
-      // Check if one operand of kAdd fused root is either kDot, or nested
-      // kFusion of kind kTransposeDot.
-      auto* add = user->fused_expression_root();
-      auto add_operand_it =
-          std::find_if(add->operands().begin(), add->operands().end(),
-                       [&](HloInstruction* operand) {
-                         return operand->opcode() == HloOpcode::kConvolution ||
-                                operand->opcode() == HloOpcode::kDot ||
-                                (operand->opcode() == HloOpcode::kFusion &&
-                                 operand->fusion_kind() ==
-                                     HloInstruction::FusionKind::kTransposeDot);
-                       });
-      if (add_operand_it == add->operands().end()) {
-        return false;
-      }
-      auto* matched_add_operand = *add_operand_it;
-      // Calculate operand index of 'add' operand which was not matched above.
-      const int64 other_add_operand_index =
-          matched_add_operand == add->operand(0) ? 1 : 0;
-      // Returns true iff there is exactly one use of 'operand' at shape index
-      // 'operand_index', and this singleton use is the fused root (at operand
-      // index 'other_add_operand_index').
-      return HasUniqueFusedUseOfOperandAt(operand, operand_index, user,
-                                          other_add_operand_index,
-                                          points_to_analysis);
-    }
-  }
-  if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
-      user->opcode() == HloOpcode::kWhile) {
-    // We eliminated other users in BufferLiveness::live_range_strictly_before,
-    // so here we just need to check that the use is at operand index 0.
-    std::vector<int64> operand_indices = user->OperandIndices(operand);
-    return operand_indices.size() == 1 && operand_indices[0] == 0;
-  }
-  if (user->opcode() == HloOpcode::kCall) {
-    // TODO(b/62548313): Remove when buffer assignment is module scoped and
-    // does not assign buffers to calls.
-    // Find called computation parameter associated with 'operand'.
-    const std::vector<int64> operand_indices = user->OperandIndices(operand);
-    if (operand_indices.size() > 1) {
-      return false;
-    }
-    CHECK_EQ(1, operand_indices.size());
-    auto* param = user->to_apply()->parameter_instruction(operand_indices[0]);
-    // Get all uses of 'operand' at 'index' in called computation.
-    auto param_uses = GetAllUsesOfInstructionAtIndex(param, operand_index,
-                                                     points_to_analysis);
-
-    // Return true iff:
-    // *) There exists exactly one use of 'operand' in called computation.
-    // *) The unique use is by the root instruction of called computation.
-    //    (Note: we check the root of the called computation, because the
-    //     root result buffer is required to alias with the Call result buffer).
-    // *) The root instruction of the called computation is element-wise on
-    //    'operand'.
-    auto* callee_root = user->to_apply()->root_instruction();
-    return param_uses.size() == 1 && param_uses[0].first == callee_root &&
-           callee_root->IsElementwiseOnOperand(param_uses[0].second);
-  }
-  // Check if 'user' is element-wise.
-  return user->IsElementwise();
-}
-
-bool CanShareOperandBufferWithUser(HloInstruction* operand,
-                                   const ShapeIndex& operand_index,
-                                   HloInstruction* user,
-                                   const ShapeIndex& user_index,
-                                   const HloDataflowAnalysis& dataflow) {
-  CHECK(user->IsUserOf(operand))
-      << "user: " << user->ToString() << " operand: " << operand->ToString();
-  const Shape& operand_subshape =
-      ShapeUtil::GetSubshape(operand->shape(), operand_index);
-  const Shape& user_subshape =
-      ShapeUtil::GetSubshape(user->shape(), user_index);
-  // Check that operand and user emit the same shape and layout.
-  if (!ShapeUtil::Equal(operand_subshape, user_subshape)) {
-    return false;
-  }
-
-  if (user->opcode() == HloOpcode::kFusion) {
-    // Get the parameter associated with 'operand';
-    HloInstruction* fusion_param =
-        user->fused_parameter(user->operand_index(operand));
-
-    const HloValue& value =
-        dataflow.GetValueDefinedAt(fusion_param, operand_index);
-    if (value.uses().size() != 1) {
-      return false;
-    }
-    const HloUse& use = value.uses()[0];
-
-    if (user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
-        user->fused_expression_root()->opcode() ==
-            HloOpcode::kDynamicUpdateSlice) {
-      // Loop fusion with kDynamicUpdateSlice fused root.
-      //
-      // Returns true iff there is exactly one use of 'operand' at shape index
-      // 'operand_index', and this singleton use is the fused root at operand
-      // index 0.
-      return use.instruction == user->fused_expression_root() &&
-             use.operand_number == 0;
-    } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
-               user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
-      // Output fusion with kAdd fused root.
-
-      // Check if one operand of kAdd fused root is either kDot, or nested
-      // kFusion of kind kTransposeDot.
-      auto* add = user->fused_expression_root();
-      auto add_operand_it =
-          std::find_if(add->operands().begin(), add->operands().end(),
-                       [&](HloInstruction* operand) {
-                         return operand->opcode() == HloOpcode::kConvolution ||
-                                operand->opcode() == HloOpcode::kDot ||
-                                (operand->opcode() == HloOpcode::kFusion &&
-                                 operand->fusion_kind() ==
-                                     HloInstruction::FusionKind::kTransposeDot);
-                       });
-      if (add_operand_it == add->operands().end()) {
-        return false;
-      }
-      auto* matched_add_operand = *add_operand_it;
-      // Calculate operand index of 'add' operand which was not matched above.
-      const int64 other_add_operand_index =
-          matched_add_operand == add->operand(0) ? 1 : 0;
-      // Returns true iff there is exactly one use of 'operand' at shape index
-      // 'operand_index', and this singleton use is the fused root (at operand
-      // index 'other_add_operand_index').
-      return use.instruction == user->fused_expression_root() &&
-             use.operand_number == other_add_operand_index;
-    }
-  }
-  if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
-      user->opcode() == HloOpcode::kWhile) {
-    // We eliminated other users in BufferLiveness::live_range_strictly_before,
-    // so here we just need to check that the use is at operand index 0.
-    std::vector<int64> operand_indices = user->OperandIndices(operand);
-    return operand_indices.size() == 1 && operand_indices[0] == 0;
-  }
-  if (user->opcode() == HloOpcode::kCall) {
-    // Get all uses of value defined by 'operand' at 'operand_index'.
-    const auto& uses =
-        dataflow.GetValueDefinedAt(operand, operand_index).uses();
-    // Return true iff:
-    // *) There exists two uses of 'operand'.
-    // *) One use is by 'user' (caller).
-    // *) One use is by root instruction of called computation (callee root).
-    //    (Note: we check the root of the called computation, because the
-    //     root result buffer is required to alias with the Call result buffer).
-    // *) The root instruction of the called computation is element-wise on
-    //    'operand'.
-    const bool found_caller_use =
-        std::find_if(uses.begin(), uses.end(), [user](const HloUse& use) {
-          return use.instruction == user;
-        }) != uses.end();
-    auto* callee_root = user->to_apply()->root_instruction();
-    const bool found_elementwise_callee_use =
-        std::find_if(
-            uses.begin(), uses.end(), [callee_root](const HloUse& use) {
-              return use.instruction == callee_root &&
-                     callee_root->IsElementwiseOnOperand(use.operand_number);
-            }) != uses.end();
-    return uses.size() == 2 && found_caller_use && found_elementwise_callee_use;
-  }
-  // Check if 'user' is element-wise.
-  return user->IsElementwise();
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/liveness_util.h b/tensorflow/compiler/xla/service/liveness_util.h
deleted file mode 100644
index 28ef991880039de73cc158a67ef2a5f78fc90e6d..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/liveness_util.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// A collection of utilities on the HLO graph.
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LIVENESS_UTIL_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_LIVENESS_UTIL_H_
-
-#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/types.h"
-
-namespace xla {
-
-// Returns true if 'user' cannot possibly use the buffer at 'index' in
-// 'operand'. Returns false otherwise.
-//
-// REQUIRES: 'operand' is an operand of 'user'.
-//
-// TODO(b/65835246): Remove TuplePointsToAnalysis overload when all users have
-// moved over to the dataflow overload.
-bool DoesNotUseOperandBuffer(const HloInstruction* operand,
-                             const ShapeIndex& index,
-                             const HloInstruction* user,
-                             const TuplePointsToAnalysis& points_to_analysis);
-bool DoesNotUseOperandBuffer(const HloInstruction* operand,
-                             const ShapeIndex& index,
-                             const HloInstruction* user,
-                             const HloDataflowAnalysis& dataflow);
-
-// Returns true if 'user' (at 'user_index') can share a buffer with its operand
-// 'operand' (at 'operand_index'). Returns false otherwise.
-//
-// REQUIRES: 'operand' is an operand of 'user'.
-//
-// TODO(b/65835246): Remove TuplePointsToAnalysis overload when all users have
-// moved over to the dataflow overload.
-bool CanShareOperandBufferWithUser(
-    HloInstruction* operand, const ShapeIndex& operand_index,
-    HloInstruction* user, const ShapeIndex& user_index,
-    const TuplePointsToAnalysis& points_to_analysis);
-bool CanShareOperandBufferWithUser(HloInstruction* operand,
-                                   const ShapeIndex& operand_index,
-                                   HloInstruction* user,
-                                   const ShapeIndex& user_index,
-                                   const HloDataflowAnalysis& dataflow);
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_LIVENESS_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/liveness_util_test.cc b/tensorflow/compiler/xla/service/liveness_util_test.cc
deleted file mode 100644
index f8b309488eeb5391b1cad5db760934ec1f7e3521..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/liveness_util_test.cc
+++ /dev/null
@@ -1,463 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/liveness_util.h"
-
-#include <memory>
-
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
-#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-
-namespace xla {
-namespace {
-
-class PointsToAnalysisTestBase : public HloTestBase {
- protected:
-  void BuildModule(std::unique_ptr<HloComputation> computation) {
-    module_ = CreateNewModule();
-    computation_ = module_->AddEntryComputation(std::move(computation));
-  }
-
-  void RunAnalysis() {
-    CHECK_NOTNULL(module_.get());
-    points_to_analysis_ =
-        TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
-    dataflow_analysis_ = HloDataflowAnalysis::Run(*module_).ConsumeValueOrDie();
-  }
-
-  void BuildModuleAndRunAnalysis(std::unique_ptr<HloComputation> computation) {
-    BuildModule(std::move(computation));
-    RunAnalysis();
-  }
-
-  std::unique_ptr<HloModule> module_;
-  HloComputation* computation_ = nullptr;
-  std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_;
-  std::unique_ptr<HloDataflowAnalysis> dataflow_analysis_;
-};
-
-class DoesNotUseOperandBufferTest : public PointsToAnalysisTestBase {};
-
-TEST_F(DoesNotUseOperandBufferTest, GetTupleElement) {
-  auto builder = HloComputation::Builder(TestName());
-
-  Shape elem_shape = ShapeUtil::MakeShape(F32, {8});
-  auto tuple = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeTupleShape({elem_shape, elem_shape}), "tuple"));
-  auto gte0 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(elem_shape, tuple, 0));
-  auto gte1 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(elem_shape, tuple, 1));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(elem_shape, HloOpcode::kAdd, gte0, gte1));
-
-  BuildModuleAndRunAnalysis(builder.Build());
-
-  // GetTupleElement instructions only access the top-level buffer of their
-  // operand.
-  EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {0}, gte0, *points_to_analysis_));
-  EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {1}, gte1, *points_to_analysis_));
-  EXPECT_FALSE(DoesNotUseOperandBuffer(tuple, {}, gte0, *points_to_analysis_));
-  EXPECT_FALSE(DoesNotUseOperandBuffer(tuple, {}, gte1, *points_to_analysis_));
-
-  EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {0}, gte0, *dataflow_analysis_));
-  EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {1}, gte1, *dataflow_analysis_));
-  EXPECT_FALSE(DoesNotUseOperandBuffer(tuple, {}, gte0, *dataflow_analysis_));
-  EXPECT_FALSE(DoesNotUseOperandBuffer(tuple, {}, gte1, *dataflow_analysis_));
-}
-
-TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) {
-  auto builder = HloComputation::Builder(TestName());
-
-  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
-  auto tuple = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple"));
-  auto gte0 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(data_shape, tuple, 0));
-  auto gte1 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(data_shape, tuple, 1));
-
-  // Create a DynamicUpdateSlice instruction of tuple element 1.
-  auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
-  auto update = builder.AddInstruction(HloInstruction::CreateConstant(
-      Literal::CreateR1<float>({2.f, 2.f, 2.f})));
-  auto dynamic_update_slice =
-      builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
-  builder.AddInstruction(
-      HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
-
-  BuildModule(builder.Build());
-  auto fusion = computation_->CreateFusionInstruction(
-      {dynamic_update_slice, starts, update, gte1},
-      HloInstruction::FusionKind::kLoop);
-  RunAnalysis();
-
-  // The fusion instruction never uses tuple element 0, but does use element 1.
-  EXPECT_TRUE(
-      DoesNotUseOperandBuffer(tuple, {0}, fusion, *points_to_analysis_));
-  EXPECT_FALSE(
-      DoesNotUseOperandBuffer(tuple, {1}, fusion, *points_to_analysis_));
-
-  EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {0}, fusion, *dataflow_analysis_));
-  EXPECT_FALSE(
-      DoesNotUseOperandBuffer(tuple, {1}, fusion, *dataflow_analysis_));
-}
-
-class CanShareOperandBufferWithUserTest : public PointsToAnalysisTestBase {};
-
-TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) {
-  auto builder = HloComputation::Builder(TestName());
-
-  Shape shape = ShapeUtil::MakeShape(F32, {8});
-  auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, shape, "param"));
-  auto exp = builder.AddInstruction(
-      HloInstruction::CreateUnary(shape, HloOpcode::kExp, param));
-  auto log = builder.AddInstruction(
-      HloInstruction::CreateUnary(shape, HloOpcode::kLog, exp));
-
-  BuildModuleAndRunAnalysis(builder.Build());
-
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(param, {}, exp, {}, *points_to_analysis_));
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(exp, {}, log, {}, *points_to_analysis_));
-
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(param, {}, exp, {}, *dataflow_analysis_));
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(exp, {}, log, {}, *dataflow_analysis_));
-}
-
-TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) {
-  auto builder = HloComputation::Builder(TestName());
-
-  Shape in_shape = ShapeUtil::MakeShape(F32, {8});
-  Shape out_shape = ShapeUtil::MakeShape(PRED, {8});
-  auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, in_shape, "param0"));
-  auto param1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, in_shape, "param1"));
-  auto result = builder.AddInstruction(
-      HloInstruction::CreateBinary(out_shape, HloOpcode::kEq, param0, param1));
-
-  BuildModuleAndRunAnalysis(builder.Build());
-
-  EXPECT_FALSE(CanShareOperandBufferWithUser(param0, {}, result, {},
-                                             *points_to_analysis_));
-  EXPECT_FALSE(CanShareOperandBufferWithUser(param1, {}, result, {},
-                                             *points_to_analysis_));
-
-  EXPECT_FALSE(CanShareOperandBufferWithUser(param0, {}, result, {},
-                                             *dataflow_analysis_));
-  EXPECT_FALSE(CanShareOperandBufferWithUser(param1, {}, result, {},
-                                             *dataflow_analysis_));
-}
-
-TEST_F(CanShareOperandBufferWithUserTest, CopyShares) {
-  auto builder = HloComputation::Builder(TestName());
-
-  Shape shape = ShapeUtil::MakeShape(F32, {8});
-  auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, shape, "param"));
-  auto exp = builder.AddInstruction(
-      HloInstruction::CreateUnary(shape, HloOpcode::kExp, param));
-  auto copy = builder.AddInstruction(
-      HloInstruction::CreateUnary(shape, HloOpcode::kCopy, exp));
-
-  BuildModuleAndRunAnalysis(builder.Build());
-
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(param, {}, exp, {}, *points_to_analysis_));
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(exp, {}, copy, {}, *points_to_analysis_));
-
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(param, {}, exp, {}, *dataflow_analysis_));
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(exp, {}, copy, {}, *dataflow_analysis_));
-}
-
-TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
-  auto builder = HloComputation::Builder(TestName());
-
-  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
-  auto tuple = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple"));
-  auto gte0 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(data_shape, tuple, 0));
-  auto gte1 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(data_shape, tuple, 1));
-
-  // Create a DynamicUpdateSlice instruction of tuple element 1.
-  auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
-  auto update = builder.AddInstruction(HloInstruction::CreateConstant(
-      Literal::CreateR1<float>({2.f, 2.f, 2.f})));
-  auto dynamic_update_slice =
-      builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
-  builder.AddInstruction(
-      HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
-
-  BuildModule(builder.Build());
-  auto fusion = computation_->CreateFusionInstruction(
-      {dynamic_update_slice, starts, update, gte1},
-      HloInstruction::FusionKind::kLoop);
-  RunAnalysis();
-
-  // The fusion instruction can share with tuple element 1.
-  EXPECT_FALSE(CanShareOperandBufferWithUser(tuple, {0}, fusion, {},
-                                             *points_to_analysis_));
-  EXPECT_TRUE(CanShareOperandBufferWithUser(tuple, {1}, fusion, {},
-                                            *points_to_analysis_));
-
-  EXPECT_FALSE(CanShareOperandBufferWithUser(tuple, {0}, fusion, {},
-                                             *dataflow_analysis_));
-  EXPECT_TRUE(CanShareOperandBufferWithUser(tuple, {1}, fusion, {},
-                                            *dataflow_analysis_));
-}
-
-TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
-  auto builder = HloComputation::Builder(TestName());
-
-  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
-  Shape update_shape = ShapeUtil::MakeShape(F32, {4});
-  Shape starts_shape = ShapeUtil::MakeShape(S32, {1});
-  auto data = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, data_shape, "data"));
-  auto update = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, update_shape, "update"));
-  auto starts = builder.AddInstruction(
-      HloInstruction::CreateParameter(2, starts_shape, "starts"));
-  auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      data_shape, data, update, starts));
-
-  BuildModuleAndRunAnalysis(builder.Build());
-
-  // The DynamicUpdateSlice instruction can share with the data operand, but not
-  // with update or starts.
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(data, {}, dus, {}, *points_to_analysis_));
-  EXPECT_FALSE(
-      CanShareOperandBufferWithUser(update, {}, dus, {}, *points_to_analysis_));
-  EXPECT_FALSE(
-      CanShareOperandBufferWithUser(starts, {}, dus, {}, *points_to_analysis_));
-
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(data, {}, dus, {}, *dataflow_analysis_));
-  EXPECT_FALSE(
-      CanShareOperandBufferWithUser(update, {}, dus, {}, *dataflow_analysis_));
-  EXPECT_FALSE(
-      CanShareOperandBufferWithUser(starts, {}, dus, {}, *dataflow_analysis_));
-}
-
-TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
-  auto builder = HloComputation::Builder(TestName());
-  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
-
-  auto a = builder.AddInstruction(HloInstruction::CreateConstant(
-      Literal::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
-  auto b = builder.AddInstruction(HloInstruction::CreateConstant(
-      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
-
-  DotDimensionNumbers dot_dnums;
-  dot_dnums.add_lhs_contracting_dimensions(1);
-  dot_dnums.add_rhs_contracting_dimensions(0);
-  auto dot = builder.AddInstruction(
-      HloInstruction::CreateDot(data_shape, a, b, dot_dnums));
-
-  auto one = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
-  auto add_operand = builder.AddInstruction(
-      HloInstruction::CreateBroadcast(data_shape, one, {1}));
-
-  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
-      data_shape, HloOpcode::kAdd, dot, add_operand));
-
-  BuildModule(builder.Build());
-  auto fusion = computation_->CreateFusionInstruction(
-      {add, dot}, HloInstruction::FusionKind::kOutput);
-  RunAnalysis();
-
-  // Output fused dot add should be able to share buffer with 'add_operand'.
-  EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {},
-                                            *points_to_analysis_));
-
-  EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {},
-                                            *dataflow_analysis_));
-}
-
-TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) {
-  auto builder = HloComputation::Builder(TestName());
-  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
-
-  auto a = builder.AddInstruction(HloInstruction::CreateConstant(
-      Literal::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
-  auto b = builder.AddInstruction(HloInstruction::CreateConstant(
-      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
-  auto b_t = builder.AddInstruction(
-      HloInstruction::CreateTranspose(data_shape, b, {1, 0}));
-
-  DotDimensionNumbers dot_dnums;
-  dot_dnums.add_lhs_contracting_dimensions(1);
-  dot_dnums.add_rhs_contracting_dimensions(0);
-  auto dot = builder.AddInstruction(
-      HloInstruction::CreateDot(data_shape, a, b_t, dot_dnums));
-
-  auto one = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
-  auto add_operand = builder.AddInstruction(
-      HloInstruction::CreateBroadcast(data_shape, one, {1}));
-
-  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
-      data_shape, HloOpcode::kAdd, dot, add_operand));
-
-  BuildModule(builder.Build());
-
-  auto nested_fusion = computation_->CreateFusionInstruction(
-      {dot, b_t}, HloInstruction::FusionKind::kTransposeDot);
-
-  auto fusion = computation_->CreateFusionInstruction(
-      {add, nested_fusion}, HloInstruction::FusionKind::kOutput);
-  RunAnalysis();
-
-  // Output fused transpose-dot-add should be share buffer with 'add_operand'.
-  EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {},
-                                            *points_to_analysis_));
-
-  EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {},
-                                            *dataflow_analysis_));
-}
-
-TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) {
-  auto builder = HloComputation::Builder(TestName());
-  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
-
-  auto one = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
-  auto operand = builder.AddInstruction(
-      HloInstruction::CreateBroadcast(data_shape, one, {1}));
-
-  auto reverse = builder.AddInstruction(
-      HloInstruction::CreateReverse(data_shape, operand, {0, 1}));
-
-  auto two = builder.AddInstruction(HloInstruction::CreateConstant(
-      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
-
-  auto add = builder.AddInstruction(
-      HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two));
-
-  BuildModule(builder.Build());
-  auto fusion = computation_->CreateFusionInstruction(
-      {add, two, reverse}, HloInstruction::FusionKind::kOutput);
-  RunAnalysis();
-
-  // Output fused operand->reverse->add cannot alias operand buffer 'operand'.
-  EXPECT_FALSE(CanShareOperandBufferWithUser(operand, {}, fusion, {},
-                                             *points_to_analysis_));
-
-  EXPECT_FALSE(CanShareOperandBufferWithUser(operand, {}, fusion, {},
-                                             *dataflow_analysis_));
-}
-
-TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
-  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
-
-  auto make_cond = [this, &data_shape]() {
-    auto builder = HloComputation::Builder(TestName() + ".Cond");
-    auto data = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, data_shape, "data"));
-    builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq, data, data));
-    return builder.Build();
-  };
-
-  auto make_body = [this, &data_shape]() {
-    auto builder = HloComputation::Builder(TestName() + ".Body");
-    auto data = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, data_shape, "data"));
-    builder.AddInstruction(
-        HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, data, data));
-    return builder.Build();
-  };
-
-  module_ = CreateNewModule();
-  HloComputation* cond_computation =
-      module_->AddEmbeddedComputation(make_cond());
-  HloComputation* body_computation =
-      module_->AddEmbeddedComputation(make_body());
-
-  auto builder = HloComputation::Builder(TestName());
-  auto data = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, data_shape, "data"));
-  auto whil = builder.AddInstruction(HloInstruction::CreateWhile(
-      data_shape, cond_computation, body_computation, data));
-  computation_ = module_->AddEntryComputation(builder.Build());
-
-  RunAnalysis();
-
-  // The While instruction can share with the data operand.
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(data, {}, whil, {}, *points_to_analysis_));
-
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(data, {}, whil, {}, *dataflow_analysis_));
-}
-
-// Tests that Call can alias operand buffer if the only use of the operand
-// in the called computation is an elementwise instruction.
-TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) {
-  Shape shape = ShapeUtil::MakeShape(F32, {8});
-  // Build sub-computation with fusion root.
-  auto sub_builder = HloComputation::Builder(TestName() + "_sub");
-  auto sub_param = sub_builder.AddInstruction(
-      HloInstruction::CreateParameter(0, shape, "sub_param"));
-  auto one = sub_builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
-  auto ones = sub_builder.AddInstruction(
-      HloInstruction::CreateBroadcast(shape, one, {1}));
-  auto add = sub_builder.AddInstruction(
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones));
-
-  module_ = CreateNewModule();
-  auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build());
-  sub_computation->CreateFusionInstruction({add, ones},
-                                           HloInstruction::FusionKind::kLoop);
-
-  // Build entry-computation with kCall which calls 'sub_computation'.
-  auto builder = HloComputation::Builder(TestName());
-
-  auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, shape, "param"));
-  auto reverse =
-      builder.AddInstruction(HloInstruction::CreateReverse(shape, param, {0}));
-  auto call = builder.AddInstruction(
-      HloInstruction::CreateCall(shape, {reverse}, sub_computation));
-  computation_ = module_->AddEntryComputation(builder.Build());
-
-  RunAnalysis();
-
-  EXPECT_TRUE(CanShareOperandBufferWithUser(reverse, {}, call, {},
-                                            *points_to_analysis_));
-  EXPECT_TRUE(CanShareOperandBufferWithUser(reverse, {}, call, {},
-                                            *dataflow_analysis_));
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
index bc683a1880b010d57e83aa6e9ffa95fda299e1a0..f172b1d87c870270436f7301ed200b47d08431a7 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
@@ -151,7 +151,7 @@ Status FusedIrEmitter::HandleTuple(HloInstruction* tuple) {
 
 Status FusedIrEmitter::FinishVisit(HloInstruction* root) {
   fused_root_ = root;
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 FusedIrEmitter::Generator FusedIrEmitter::GetRootGenerator() const {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index 3312a888443233139841ce7a5e3173f907605e1d..7323abeb2077154f82828bcda3e90eb45a67138a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -333,18 +333,7 @@ llvm::Value* IrArray::EmitArrayElementAddress(
   }
   CHECK_EQ(index.size(), ShapeUtil::Rank(*shape_));
 
-  std::vector<llvm::Value*> actual_index;
-  bool is_implicit_broadcast = false;
-  // We perform broadcasting when the operand shape has dimension(s) of size
-  // 1. In this case we fix the index value for that dimension to zero. This
-  // effectively broadcasts along this dimension.
-  for (int64 i = 0; i < index.size(); ++i) {
-    auto dim = shape_->dimensions(i);
-    actual_index.push_back(dim == 1 ? ir_builder->getInt64(0) : index[i]);
-    is_implicit_broadcast |= dim == 1;
-  }
-
-  if (!is_implicit_broadcast && index.LinearValidOnShape(*shape_)) {
+  if (index.LinearValidOnShape(*shape_)) {
     llvm::Module* module =
         ir_builder->GetInsertBlock()->getParent()->getParent();
     return ir_builder->CreateInBoundsGEP(
@@ -354,6 +343,15 @@ llvm::Value* IrArray::EmitArrayElementAddress(
         {index.linear()}, llvm_ir::AsStringRef(name));
   }
 
+  std::vector<llvm::Value*> actual_index;
+  for (int64 i = 0; i < index.size(); ++i) {
+    // When dimension i is of size 1, LLVM optimization is able to replace
+    // index[i] with 0. However, setting index[i] to 0 here still allows LLVM to
+    // produce better code in some cases.
+    auto dim = shape_->dimensions(i);
+    actual_index.push_back(dim == 1 ? ir_builder->getInt64(0) : index[i]);
+  }
+
   // "base_ptr_" has the type of "<ir_type_for_its_shape>*"
   // (e.g. [3 x [2 x float]]*). Therefore, the address of the indexed element
   // should be computed by
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 1c00b2aabd182da72e78d2c9c01cbe70cfd8e33c..64b935bbf1fb9033cd2e1259b4639cd3780be711 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -100,6 +100,15 @@ class KernelSupportLibrary {
         [&](llvm::Value* indvar, llvm::Value*) { for_body_generator(indvar); });
   }
 
+  void For(
+      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+      int64 step,
+      const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
+    For(name, start, end, ir_builder_->getInt64(step),
+        /*peel_first_iteration=*/false,
+        [&](llvm::Value* indvar, llvm::Value*) { for_body_generator(indvar); });
+  }
+
   void For(
       tensorflow::StringPiece name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index 3978acc132f34b8b195d3772ccf71d0d467984db..0728ccfff7b85e3751f33bc5272a5f22d4e5411a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -39,14 +39,13 @@ LoopEmitter::LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape,
 LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
                          const IrArray& target_array,
                          llvm::IRBuilder<>* ir_builder)
-    : body_emitter_([=](const llvm_ir::IrArray::Index array_index)
-                        -> ::tensorflow::Status {
+    : body_emitter_([=](const llvm_ir::IrArray::Index array_index) -> Status {
         // Convert target_element_generator to a BodyEmitter.
         TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
                             target_element_generator(array_index));
         target_array.EmitWriteArrayElement(array_index, target_element,
                                            ir_builder);
-        return tensorflow::Status::OK();
+        return Status::OK();
       }),
       shape_(target_array.GetShape()),
       ir_builder_(ir_builder) {}
@@ -124,7 +123,7 @@ std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
   return {array_index};
 }
 
-tensorflow::Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name) {
+Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name) {
   for (const IrArray::Index& array_index :
        EmitIndexAndSetExitBasicBlock(loop_name)) {
     TF_RETURN_IF_ERROR(body_emitter_(array_index));
@@ -135,7 +134,7 @@ tensorflow::Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name) {
   if (exit_bb_ != nullptr) {
     ir_builder_->SetInsertPoint(exit_bb_);
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
index 9ff497aecd0bc964c929205c7fd410cca87d9b77..b70d28ecd3033eb26629718e50ce48f39b162273 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -38,8 +38,7 @@ using ElementGenerator =
 // Emits a loop for every element in the given shape.
 class LoopEmitter {
  public:
-  using BodyEmitter =
-      std::function<tensorflow::Status(const IrArray::Index& index)>;
+  using BodyEmitter = std::function<Status(const IrArray::Index& index)>;
 
   LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape,
               llvm::IRBuilder<>* ir_builder);
@@ -72,7 +71,7 @@ class LoopEmitter {
       tensorflow::StringPiece loop_name);
 
   // Emits a complete loop nest for every element in the given shape.
-  tensorflow::Status EmitLoop(tensorflow::StringPiece loop_name = "");
+  Status EmitLoop(tensorflow::StringPiece loop_name = "");
 
  protected:
   // An IR emitter that generates the loop body.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ops.cc b/tensorflow/compiler/xla/service/llvm_ir/ops.cc
index 34899b7400464e4f4f97d301f35ed3b7b083bca1..dacc54742c0897bbd92315f1e33a484aae56bb7f 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ops.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ops.cc
@@ -49,22 +49,41 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
   for (int64 i = 0; i < rank; ++i) {
     IrArray::Index dim_index({ir_builder->getInt64(i)});
     TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index));
+    llvm::Value* output_dim_size = llvm::ConstantInt::get(
+        start_index[i]->getType(), output_shape.dimensions(i));
+    llvm::Value* update_dim_size = llvm::ConstantInt::get(
+        start_index[i]->getType(), update_shape.dimensions(i));
+
+    // Clamp the start index so that the update region fits in the operand.
+    // start_index = clamp(start_index, 0, output_dim_size - update_dim_size)
+
+    // TODO(b/74360564): This is implementation defined behavior, but is
+    // currently respected by all implementations. Change this if we ever decide
+    // to oficially document different behavior.
+    llvm::Value* max_bound =
+        ir_builder->CreateSub(output_dim_size, update_dim_size);
+    llvm::Value* zero = llvm::ConstantInt::get(start_index[i]->getType(), 0);
+    start_index[i] = ir_builder->CreateSelect(
+        ir_builder->CreateICmp(llvm::ICmpInst::ICMP_SGE, zero, start_index[i]),
+        zero, start_index[i]);
+
+    start_index[i] = ir_builder->CreateSelect(
+        ir_builder->CreateICmp(llvm::ICmpInst::ICMP_SLE, max_bound,
+                               start_index[i]),
+        max_bound, start_index[i]);
   }
 
   auto loop_body_emitter = [&](const IrArray::Index& update_index) -> Status {
     // Calculate output_index, where we'll write the value from update.  For
     // each dimension,
     //
-    //   output_index[dim] = (start_index[dim] + update_index[dim]) % dim_size.
+    //   output_index[dim] = start_index[dim] + update_index[dim]
     //
     IrArray::Index output_index(rank);
     for (int64 i = 0; i < rank; ++i) {
-      llvm::Value* dim_size = llvm::ConstantInt::get(
-          update_index[i]->getType(), output_shape.dimensions(i));
-      llvm::Value* start_index0 = ir_builder->CreateZExtOrBitCast(
+      llvm::Value* start_index0 = ir_builder->CreateSExtOrBitCast(
           start_index[i], update_index[i]->getType());
-      output_index[i] = ir_builder->CreateURem(
-          ir_builder->CreateAdd(start_index0, update_index[i]), dim_size);
+      output_index[i] = ir_builder->CreateAdd(start_index0, update_index[i]);
     }
 
     // Do output[output_index] = update[update_index].
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index f74bcb0b79355c8e69890487266cbc5f2a4500be..3a6a7c25f4b727c7112dbcbcb4f3d892679a0011 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -53,7 +53,7 @@ NameUniquer::NameUniquer(const string& separator) {
 }
 
 string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
-  string root = GetSanitizedName(prefix.empty() ? "name" : prefix.ToString());
+  string root = GetSanitizedName(prefix.empty() ? "name" : std::string(prefix));
 
   // Strip away numeric suffix (if any). Only recognize separator if it is in
   // the middle of the name.
diff --git a/tensorflow/compiler/xla/service/owning_device_memory.cc b/tensorflow/compiler/xla/service/owning_device_memory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c115bc097f3b1dd810654745b835a977955718c3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/owning_device_memory.cc
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/owning_device_memory.h"
+
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+
+namespace xla {
+
+void OwningDeviceMemory::Free() {
+  CHECK(allocator_ != nullptr)
+      << "Can't call Free() on an inactive (i.e. moved from, Forget()'ten, "
+         "or Free()'ed) instance.";
+  auto status = allocator_->Deallocate(device_ordinal_, mem_);
+  if (!status.ok()) {
+    LOG(WARNING) << "Deallocating buffer " << mem_.opaque() << " failed.";
+  }
+
+  allocator_ = nullptr;
+  mem_ = se::DeviceMemoryBase();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/owning_device_memory.h b/tensorflow/compiler/xla/service/owning_device_memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..9cf071f0d9d09dfbf74b15e73caaf542714ec8d5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/owning_device_memory.h
@@ -0,0 +1,131 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
+
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+
+// Break circular dependency between this file and device_memory_allocator.h.
+class DeviceMemoryAllocator;
+
+// Owning pointer for memory on a device.
+//
+// OwningDeviceMemory is an owning pointer like std::unique_ptr, but it can
+// point to memory that resides on a "device" (e.g. a GPU).  When an
+// OwningDeviceMemory goes out of scope, it frees the memory it owns.
+//
+// We say that an instance of OwningDeviceMemory is "active" if it currently
+// owns a (possibly empty) slice of memory on the device.  Moving, Forget()'ing,
+// Free()'ing, and other actions can deactive an active object.
+//
+// Note that we can't simply use stream_executor::ScopedDeviceMemory instead of
+// OwningDeviceMemory, because ScopedDeviceMemory frees its pointer via a
+// StreamExecutor.  This class needs to free via a xla::DeviceMemoryAllocator.
+class OwningDeviceMemory {
+ public:
+  OwningDeviceMemory() : device_ordinal_(-1), allocator_(nullptr) {}
+
+  explicit OwningDeviceMemory(se::DeviceMemoryBase mem, int device_ordinal,
+                              DeviceMemoryAllocator* allocator)
+      : mem_(mem), device_ordinal_(device_ordinal), allocator_(allocator) {
+    CHECK(allocator != nullptr) << "allocator cannot be null.";
+  }
+
+  OwningDeviceMemory(OwningDeviceMemory&& other)
+      : mem_(other.mem_),
+        device_ordinal_(other.device_ordinal_),
+        allocator_(other.allocator_) {
+    other.mem_ = se::DeviceMemoryBase();
+    other.allocator_ = nullptr;
+  }
+
+  OwningDeviceMemory& operator=(OwningDeviceMemory&& other) {
+    if (allocator_ != nullptr) {
+      Free();
+    }
+    mem_ = other.mem_;
+    device_ordinal_ = other.device_ordinal_;
+    allocator_ = other.allocator_;
+
+    other.mem_ = se::DeviceMemoryBase();
+    other.allocator_ = nullptr;
+    return *this;
+  }
+
+  // Deactivates this instance if it's active.  Nop if it's not active.
+  OwningDeviceMemory& operator=(std::nullptr_t) {
+    if (allocator_ != nullptr) {
+      Free();
+    }
+    return *this;
+  }
+
+  ~OwningDeviceMemory() {
+    if (allocator_ != nullptr) {
+      Free();
+    }
+  }
+
+  // The returned allocator is nonnull iff this object is active.
+  DeviceMemoryAllocator* allocator() const { return allocator_; }
+
+  int device_ordinal() const { return device_ordinal_; }
+
+  // Gets the device memory pointer.
+  const void* opaque() const { return mem_.opaque(); }
+  void* opaque() { return mem_.opaque(); }
+
+  uint64 size() const { return mem_.size(); }
+
+  // Determines whether this wraps a null pointer.
+  //
+  // !is_null() is sufficient but not necessary to imply `this` is active.
+  bool is_null() const { return mem_.is_null(); }
+
+  se::DeviceMemoryBase AsDeviceMemoryBase() {
+    return se::DeviceMemoryBase(opaque(), size(), /*is_sub_buffer=*/false);
+  }
+
+  // Returns the wrapped DeviceMemoryBase without freeing it, and deactivates
+  // this object.  Precondition: `this` is active.
+  TF_MUST_USE_RESULT se::DeviceMemoryBase Forget() {
+    CHECK(allocator_ != nullptr)
+        << "Can't call Forget() on an inactive (i.e. moved from, Forget()'ten, "
+           "or Free()'ed) instance.";
+    allocator_ = nullptr;
+    se::DeviceMemoryBase mem(mem_);
+    mem_ = se::DeviceMemoryBase();
+    return mem;
+  }
+
+  // Frees the wrapped DeviceMemoryBase and deactivates this object.
+  // Precondition: `this` is active.
+  void Free();
+
+ private:
+  se::DeviceMemoryBase mem_;
+  int device_ordinal_;
+  DeviceMemoryAllocator* allocator_;  // Null if this object is inactive.
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index 586f6ef7a9c4f17f69340e77be17aec2f677a791..d3bc47e61e0e75fa2ef181988700f88cec9c1d76 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -702,6 +702,30 @@ class HloInstructionPatternOperandImpl {
   HloInstructionPattern<OperandType, OperandImpl> operand_;
 };
 
+// An HloInstructionPattern implementation that matches only if the instruction
+// is a fusion node with a particular kind.
+template <typename Previous>
+class HloInstructionPatternFusionKindImpl {
+ public:
+  explicit constexpr HloInstructionPatternFusionKindImpl(
+      const Previous& previous, ::xla::HloInstruction::FusionKind kind)
+      : previous_(previous), kind_(kind) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && inst->opcode() == HloOpcode::kFusion &&
+           inst->fusion_kind() == kind_;
+  }
+
+  bool Match(::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && inst->opcode() == HloOpcode::kFusion &&
+           inst->fusion_kind() == kind_;
+  }
+
+ private:
+  Previous previous_;
+  ::xla::HloInstruction::FusionKind kind_;
+};
+
 // A pattern that matches HloInstructions.
 template <typename HloInstructionType, typename Impl>
 class HloInstructionPattern {
@@ -807,6 +831,16 @@ class HloInstructionPattern {
         matched_inst_);
   }
 
+  // Modifies the pattern to match only if the instruction is a fusion node with
+  // the given kind.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternFusionKindImpl<Impl>>
+  WithFusionKind(HloInstruction::FusionKind kind) const {
+    return HloInstructionPattern<HloInstructionType,
+                                 HloInstructionPatternFusionKindImpl<Impl>>(
+        HloInstructionPatternFusionKindImpl<Impl>(impl_, kind), matched_inst_);
+  }
+
  private:
   Impl impl_;
   HloInstructionType** matched_inst_;
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
index c88157c312524fb273e6df368d2ef61d679d1d8b..204e8c99209fa95adb868a676bb9e5144fed432c 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -170,5 +170,28 @@ TEST(PatternMatcherTest, TupleShape) {
       Match(&tuple_shape, match::Shape().WithSubshape({0, 0}, match::Shape())));
 }
 
+TEST(PatternMatcherTest, FusionKind) {
+  constexpr char kModuleStr[] = R"(
+    HloModule test_module
+
+    fused_computation {
+      ROOT fp0 = f32[] parameter(0)
+    }
+
+    ENTRY while.v11 {
+      p0 = f32[] parameter(0)
+      ROOT fusion = f32[] fusion(p0), kind=kLoop, calls=fused_computation
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, tools::Parse(kModuleStr));
+
+  auto* root = hlo_module->entry_computation()->root_instruction();
+  EXPECT_TRUE(Match(
+      root, match::Op().WithFusionKind(HloInstruction::FusionKind::kLoop)));
+  EXPECT_FALSE(Match(
+      root, match::Op().WithFusionKind(HloInstruction::FusionKind::kInput)));
+  EXPECT_FALSE(Match(root->operand(0), match::Op().WithFusionKind(
+                                           HloInstruction::FusionKind::kLoop)));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 495f8801ba82ecbcf9f6e5db5507ef8785c752d6..cb0f76ebe4d445059fdf37ebf559bef851a57104 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -64,7 +64,7 @@ namespace {
 
 // Records the arguments used to invoke a computation in a SessionModule
 // proto.
-tensorflow::Status RecordArguments(
+Status RecordArguments(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     se::StreamExecutor* executor, TransferManager* transfer_manager,
     SessionModule* module) {
@@ -75,24 +75,22 @@ tensorflow::Status RecordArguments(
         transfer_manager->TransferLiteralFromDevice(executor, *argument));
     *module->add_arguments() = literal->ToProto();
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 // Records the result of a computation in a SessionModule proto.
-tensorflow::Status RecordResult(const ShapedBuffer& result,
-                                se::StreamExecutor* executor,
-                                TransferManager* transfer_manager,
-                                SessionModule* module) {
+Status RecordResult(const ShapedBuffer& result, se::StreamExecutor* executor,
+                    TransferManager* transfer_manager, SessionModule* module) {
   module->clear_result();
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Literal> literal,
       transfer_manager->TransferLiteralFromDevice(executor, result));
   *module->mutable_result() = literal->ToProto();
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 // Records the arguments used to invoke a computation in an HloSnapshot proto.
-tensorflow::Status RecordArguments(
+Status RecordArguments(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     se::StreamExecutor* executor, TransferManager* transfer_manager,
     HloSnapshot* module) {
@@ -103,20 +101,18 @@ tensorflow::Status RecordArguments(
         transfer_manager->TransferLiteralFromDevice(executor, *argument));
     *module->add_arguments() = literal->ToProto();
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 // Records the result of a computation in a HloSnapshot proto.
-tensorflow::Status RecordResult(const ShapedBuffer& result,
-                                se::StreamExecutor* executor,
-                                TransferManager* transfer_manager,
-                                HloSnapshot* module) {
+Status RecordResult(const ShapedBuffer& result, se::StreamExecutor* executor,
+                    TransferManager* transfer_manager, HloSnapshot* module) {
   module->clear_result();
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Literal> literal,
       transfer_manager->TransferLiteralFromDevice(executor, result));
   *module->mutable_result() = literal->ToProto();
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace
@@ -199,8 +195,8 @@ Service::Service(const ServiceOptions& options,
   }
 }
 
-tensorflow::Status Service::Computation(const ComputationRequest* arg,
-                                        ComputationResponse* result) {
+Status Service::Computation(const ComputationRequest* arg,
+                            ComputationResponse* result) {
   if (arg->name().empty()) {
     return InvalidArgument("computation request needs a name");
   }
@@ -210,24 +206,23 @@ tensorflow::Status Service::Computation(const ComputationRequest* arg,
   VLOG(1) << Printf("Created new computation %s on service %p, name %s",
                     result->computation().ShortDebugString().c_str(), this,
                     arg->name().c_str());
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::CreateChannelHandle(
-    const CreateChannelHandleRequest* arg,
-    CreateChannelHandleResponse* result) {
+Status Service::CreateChannelHandle(const CreateChannelHandleRequest* arg,
+                                    CreateChannelHandleResponse* result) {
   *result->mutable_channel() = channel_tracker_.NewChannel();
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::Unregister(const UnregisterRequest* arg,
-                                       UnregisterResponse* result) {
+Status Service::Unregister(const UnregisterRequest* arg,
+                           UnregisterResponse* result) {
   return allocation_tracker_.Unregister(arg->data());
 }
 
 // Deconstructs a previously-allocated global handle.
-tensorflow::Status Service::DeconstructTuple(const DeconstructTupleRequest* arg,
-                                             DeconstructTupleResponse* result) {
+Status Service::DeconstructTuple(const DeconstructTupleRequest* arg,
+                                 DeconstructTupleResponse* result) {
   TF_ASSIGN_OR_RETURN(
       std::vector<GlobalDataHandle> elements,
       allocation_tracker_.DeconstructTuple(arg->tuple_handle()));
@@ -235,11 +230,11 @@ tensorflow::Status Service::DeconstructTuple(const DeconstructTupleRequest* arg,
   for (auto& element : elements) {
     *result->add_element_handles() = element;
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::ValidateResultShapeWithLayout(
-    const Shape& shape_with_layout, const Shape& result_shape) const {
+Status Service::ValidateResultShapeWithLayout(const Shape& shape_with_layout,
+                                              const Shape& result_shape) const {
   if (!ShapeUtil::Compatible(shape_with_layout, result_shape)) {
     return InvalidArgument(
         "Shape used to set computation result layout %s is not compatible "
@@ -345,6 +340,9 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     // If the result layout is not set, then choose the default.
     // TODO(b/29118294): Allow the compiler to choose a better layout in this
     // case.
+    // TODO(b/78356948): We are forcing the default layout here. We should fix
+    // clients which expect a default layout, to be explicit about it, by
+    // passing the proper ExecutionOptions with shape_with_output_layout set.
     host_computation_layout->mutable_result_layout()->SetToDefaultLayout();
     device_computation_layout->mutable_result_layout()->SetToDefaultLayout();
   }
@@ -511,7 +509,7 @@ Status Service::ValidateEntryComputationLayout(HloModule* module) {
       module->device_entry_computation_layout().result_shape(),
       execute_backend_->transfer_manager()->HostShapeToDeviceShape(
           module->host_entry_computation_layout().result_shape())));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
@@ -801,8 +799,8 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
                                                        result_tag);
 }
 
-tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
-                                           SetReturnValueResponse* results) {
+Status Service::SetReturnValue(const SetReturnValueRequest* arg,
+                               SetReturnValueResponse* results) {
   TF_ASSIGN_OR_RETURN(UserComputation * computation,
                       computation_tracker_.Resolve(arg->computation()));
   return computation->SetReturnValue(arg->operand());
@@ -849,8 +847,8 @@ StatusOr<std::vector<std::vector<const ShapedBuffer*>>> Service::GetArguments(
   return replicated_arguments;
 }
 
-tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
-                                            ExecuteParallelResponse* result) {
+Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
+                                ExecuteParallelResponse* result) {
   VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
 
   std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
@@ -957,11 +955,11 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
   }
 
   VLOG(1) << "successfully completed 'execute-parallel' request";
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::ExecuteGraphParallel(
-    const ExecuteGraphParallelRequest* arg, ExecuteParallelResponse* result) {
+Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
+                                     ExecuteParallelResponse* result) {
   VLOG(1) << "running execute-graph-parallel request";
 
   std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
@@ -1058,11 +1056,11 @@ tensorflow::Status Service::ExecuteGraphParallel(
   }
 
   VLOG(1) << "successfully completed 'execute-graph-parallel' request";
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
-                                             GetDeviceHandlesResponse* result) {
+Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
+                                 GetDeviceHandlesResponse* result) {
   const int64 available_device_count = execute_backend_->device_count();
   const int64 replica_count = options_.number_of_replicas();
   if (replica_count <= 0) {
@@ -1082,11 +1080,11 @@ tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
     *result->add_device_handles() = device_handle;
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::ExecuteOneToN(const ExecuteRequest* arg,
-                                          ExecuteResponse* result) {
+Status Service::ExecuteOneToN(const ExecuteRequest* arg,
+                              ExecuteResponse* result) {
   ExecuteParallelRequest parallel_arg;
   *parallel_arg.add_requests() = *arg;
   ExecuteParallelResponse parallel_result;
@@ -1094,8 +1092,8 @@ tensorflow::Status Service::ExecuteOneToN(const ExecuteRequest* arg,
   return PickParallelResponse(parallel_result, result);
 }
 
-tensorflow::Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg,
-                                          ExecuteResponse* result) {
+Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg,
+                              ExecuteResponse* result) {
   ExecuteGraphParallelRequest parallel_arg;
   *parallel_arg.add_requests() = *arg;
   ExecuteParallelResponse parallel_result;
@@ -1103,7 +1101,7 @@ tensorflow::Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg,
   return PickParallelResponse(parallel_result, result);
 }
 
-tensorflow::Status Service::PickParallelResponse(
+Status Service::PickParallelResponse(
     const ExecuteParallelResponse& parallel_result, ExecuteResponse* result) {
   // The "result device" selection is a bit hacky, but better than assuming it
   // is device 0. We have b/76035356 for restructuring the client API to clean
@@ -1126,8 +1124,7 @@ tensorflow::Status Service::PickParallelResponse(
   return Status::OK();
 }
 
-tensorflow::Status Service::Execute(const ExecuteRequest* arg,
-                                    ExecuteResponse* result) {
+Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) {
   VLOG(1) << "running execute request: " << arg->ShortDebugString();
 
   TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
@@ -1198,7 +1195,7 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
   }
 
   VLOG(1) << "successfully completed 'execute' request";
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
@@ -1243,8 +1240,8 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
   return std::move(executable);
 }
 
-tensorflow::Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
-                                         ExecuteResponse* result) {
+Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
+                             ExecuteResponse* result) {
   VLOG(1) << "running execute-graph request";
 
   if (!arg->has_computation()) {
@@ -1303,11 +1300,11 @@ tensorflow::Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
   }
 
   VLOG(1) << "successfully completed 'execute-graph' request";
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
-                                         ExecuteAsyncResponse* result) {
+Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
+                             ExecuteAsyncResponse* result) {
   VLOG(1) << "running execute-async request: " << arg->ShortDebugString();
 
   TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
@@ -1383,11 +1380,11 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
   streams.clear();
 
   VLOG(1) << "successfully completed 'execute-async' request";
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::WaitForExecution(const WaitForExecutionRequest* arg,
-                                             WaitForExecutionResponse* result) {
+Status Service::WaitForExecution(const WaitForExecutionRequest* arg,
+                                 WaitForExecutionResponse* result) {
   TF_ASSIGN_OR_RETURN(const auto execution,
                       execution_tracker_.Resolve(arg->execution()));
 
@@ -1398,11 +1395,11 @@ tensorflow::Status Service::WaitForExecution(const WaitForExecutionRequest* arg,
 
   TF_RETURN_IF_ERROR(execution_tracker_.Unregister(arg->execution()));
   VLOG(1) << "successfully completed 'wait-for-execution' request";
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::TransferToClient(const TransferToClientRequest* arg,
-                                             TransferToClientResponse* result) {
+Status Service::TransferToClient(const TransferToClientRequest* arg,
+                                 TransferToClientResponse* result) {
   TF_ASSIGN_OR_RETURN(const ShapedBuffer* shaped_buffer,
                       allocation_tracker_.ResolveForReplica(arg->data(), 0));
 
@@ -1432,7 +1429,7 @@ tensorflow::Status Service::TransferToClient(const TransferToClientRequest* arg,
     *result->mutable_literal() =
         result_literal->Relayout(*return_shape)->ToProto();
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 namespace {
@@ -1450,8 +1447,8 @@ std::unique_ptr<ShapedBuffer> CloneShapedBufferOnDevice(
 
 }  // namespace
 
-tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
-                                             TransferToServerResponse* result) {
+Status Service::TransferToServer(const TransferToServerRequest* arg,
+                                 TransferToServerResponse* result) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
                       Literal::CreateFromProto(arg->literal()));
   const Shape& shape = literal->shape();
@@ -1484,11 +1481,11 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
                           StrCat("TransferToServer literal of shape ",
                                  ShapeUtil::HumanString(shape))));
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg,
-                                             TransferToInfeedResponse* result) {
+Status Service::TransferToInfeed(const TransferToInfeedRequest* arg,
+                                 TransferToInfeedResponse* result) {
   const int64 replica_count = options_.number_of_replicas();
   if (arg->replica_id() < 0 || arg->replica_id() >= replica_count) {
     return FailedPrecondition(
@@ -1517,9 +1514,8 @@ tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg,
       executor, *literal);
 }
 
-tensorflow::Status Service::TransferFromOutfeed(
-    const TransferFromOutfeedRequest* arg,
-    TransferFromOutfeedResponse* result) {
+Status Service::TransferFromOutfeed(const TransferFromOutfeedRequest* arg,
+                                    TransferFromOutfeedResponse* result) {
   const int64 replica_count = options_.number_of_replicas();
   if (arg->replica_id() < 0 || arg->replica_id() >= replica_count) {
     return FailedPrecondition(
@@ -1545,16 +1541,16 @@ tensorflow::Status Service::TransferFromOutfeed(
       execute_backend_->transfer_manager()->TransferLiteralFromOutfeed(
           executor, arg->shape_with_layout(), &literal));
   *result->mutable_literal() = literal.ToProto();
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::ResetDevice(const ResetDeviceRequest* arg,
-                                        ResetDeviceResponse* result) {
+Status Service::ResetDevice(const ResetDeviceRequest* arg,
+                            ResetDeviceResponse* result) {
   return execute_backend_->ResetDevices();
 }
 
-tensorflow::Status Service::IsConstant(const IsConstantRequest* arg,
-                                       IsConstantResponse* result) {
+Status Service::IsConstant(const IsConstantRequest* arg,
+                           IsConstantResponse* result) {
   TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
                       computation_tracker_.Resolve(arg->computation()));
 
@@ -1570,11 +1566,11 @@ tensorflow::Status Service::IsConstant(const IsConstantRequest* arg,
       user_computation->IsConstant(arg->operand(), arg->num_parameters()));
 
   result->set_is_constant(is_constant);
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
-                                            ComputeConstantResponse* result) {
+Status Service::ComputeConstant(const ComputeConstantRequest* arg,
+                                ComputeConstantResponse* result) {
   TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
                       computation_tracker_.Resolve(arg->computation()));
 
@@ -1661,11 +1657,11 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
   }
   *result->mutable_literal() = result_literal->ToProto();
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::ComputeConstantGraph(
-    const ComputeConstantGraphRequest* arg, ComputeConstantResponse* result) {
+Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
+                                     ComputeConstantResponse* result) {
   if (!arg->has_computation()) {
     return InvalidArgument("computations may not be empty");
   }
@@ -1703,20 +1699,18 @@ tensorflow::Status Service::ComputeConstantGraph(
   }
   *result->mutable_literal() = result_literal->ToProto();
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::GetShape(const GetShapeRequest* arg,
-                                     GetShapeResponse* result) {
+Status Service::GetShape(const GetShapeRequest* arg, GetShapeResponse* result) {
   TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer,
                       allocation_tracker_.ResolveForReplica(arg->data(), 0));
   *result->mutable_shape() = buffer->on_host_shape();
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::GetComputationShape(
-    const GetComputationShapeRequest* arg,
-    GetComputationShapeResponse* result) {
+Status Service::GetComputationShape(const GetComputationShapeRequest* arg,
+                                    GetComputationShapeResponse* result) {
   TF_ASSIGN_OR_RETURN(UserComputation * computation,
                       computation_tracker_.Resolve(arg->computation()));
 
@@ -1726,21 +1720,21 @@ tensorflow::Status Service::GetComputationShape(
   TF_ASSIGN_OR_RETURN(auto program_shape, computation->ComputeProgramShape(
                                               versioned_handle.version));
   *result->mutable_program_shape() = *program_shape;
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::GetLocalShape(const GetLocalShapeRequest* arg,
-                                          GetLocalShapeResponse* result) {
+Status Service::GetLocalShape(const GetLocalShapeRequest* arg,
+                              GetLocalShapeResponse* result) {
   TF_ASSIGN_OR_RETURN(UserComputation * computation,
                       computation_tracker_.Resolve(arg->computation()));
 
   TF_ASSIGN_OR_RETURN(*result->mutable_shape(),
                       computation->GetShape(arg->operand()));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::GetComputationStats(
-    const ComputationStatsRequest* arg, ComputationStatsResponse* result) {
+Status Service::GetComputationStats(const ComputationStatsRequest* arg,
+                                    ComputationStatsResponse* result) {
   TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
                       computation_tracker_.Resolve(arg->computation()));
 
@@ -1766,10 +1760,10 @@ tensorflow::Status Service::GetComputationStats(
   stats.set_flop_count(analysis.flop_count());
   stats.set_transcendental_count(analysis.transcendental_count());
   *result->mutable_stats() = stats;
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::GetComputationGraphStats(
+Status Service::GetComputationGraphStats(
     const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) {
   if (!arg->has_computation()) {
     return InvalidArgument("Computations may not be empty.");
@@ -1796,11 +1790,11 @@ tensorflow::Status Service::GetComputationGraphStats(
   stats.set_flop_count(analysis.flop_count());
   stats.set_transcendental_count(analysis.transcendental_count());
   *result->mutable_stats() = stats;
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 template <typename RequestT, typename ResponseT>
-tensorflow::Status Service::AddInstruction(
+Status Service::AddInstruction(
     const RequestT* arg, ResponseT* result,
     const std::function<StatusOr<ComputationDataHandle>(UserComputation*)>&
         adder) {
@@ -1808,10 +1802,10 @@ tensorflow::Status Service::AddInstruction(
                       computation_tracker_.Resolve(arg->computation()));
 
   TF_ASSIGN_OR_RETURN(*result->mutable_output(), adder(computation));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
+Status Service::Op(const OpRequest* arg, OpResponse* result) {
   TF_ASSIGN_OR_RETURN(UserComputation * computation,
                       computation_tracker_.Resolve(arg->computation()));
   StatusOr<ComputationDataHandle> handle_status;
@@ -2033,27 +2027,26 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
   if (arg->has_sharding()) {
     TF_RETURN_IF_ERROR(computation->SetOpSharding(handle, arg->sharding()));
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::SnapshotComputation(
-    const SnapshotComputationRequest* arg,
-    SnapshotComputationResponse* result) {
+Status Service::SnapshotComputation(const SnapshotComputationRequest* arg,
+                                    SnapshotComputationResponse* result) {
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<SessionModule> module,
       computation_tracker_.SnapshotComputation(arg->computation()));
 
   result->set_allocated_module(module.release());
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status Service::LoadComputationSnapshot(
+Status Service::LoadComputationSnapshot(
     const LoadComputationSnapshotRequest* arg,
     LoadComputationSnapshotResponse* result) {
   TF_ASSIGN_OR_RETURN(*result->mutable_computation(),
                       computation_tracker_.LoadSessionModule(arg->module()));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 DeviceHandle Service::SingleComputationDeviceHandle() const {
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index f84fe407e05da371da66ba33efd6e8165198cf2c..81fbd41957887aec763e1cfe165ad0d1d2ac2269 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -85,55 +85,52 @@ class Service : public ServiceInterface {
 
   // Creates a new computation with the given name.
   // A unique ComputationHandle is returned.
-  tensorflow::Status Computation(const ComputationRequest* arg,
-                                 ComputationResponse* result) override;
+  Status Computation(const ComputationRequest* arg,
+                     ComputationResponse* result) override;
 
   // Unregisters a previously-allocated global handle.
   //
   // If the handle given is not currently allocated, a NOT_FOUND status is
   // returned.
-  tensorflow::Status Unregister(const UnregisterRequest* arg,
-                                UnregisterResponse* result) override;
+  Status Unregister(const UnregisterRequest* arg,
+                    UnregisterResponse* result) override;
 
   // Deconstructs a tuple. Returns a newly created GlobalDataHandle for each
   // element in the tuple.
-  tensorflow::Status DeconstructTuple(
-      const DeconstructTupleRequest* arg,
-      DeconstructTupleResponse* result) override;
+  Status DeconstructTuple(const DeconstructTupleRequest* arg,
+                          DeconstructTupleResponse* result) override;
 
   // Modifies the provided computation so that subsequent executions
   // will compute the provided ComputationDataHandle, rather than the
   // last expression enqueued on that Computation.
-  tensorflow::Status SetReturnValue(const SetReturnValueRequest* arg,
-                                    SetReturnValueResponse* results) override;
+  Status SetReturnValue(const SetReturnValueRequest* arg,
+                        SetReturnValueResponse* results) override;
 
   // Executes a computation with the provided global data passed as
   // immutable arguments. Returns global data output and execution timing.
-  tensorflow::Status Execute(const ExecuteRequest* arg,
-                             ExecuteResponse* result) override;
+  Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override;
 
   // Executes a computation with the provided global data passed as
   // immutable arguments. The request contains the whole computation graph.
   // Returns global data output and execution timing.
   //
   // TODO(b/74197823): This is a part of a NOT YET ready refactor.
-  tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* arg,
-                                  ExecuteResponse* result) override;
+  Status ExecuteGraph(const ExecuteGraphRequest* arg,
+                      ExecuteResponse* result) override;
 
   // Executes one or more computations in parallel with the provided global data
   // passed as immutable arguments. Returns global data output for each
   // computation.
-  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
-                                     ExecuteParallelResponse* result) override;
+  Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                         ExecuteParallelResponse* result) override;
 
   // Executes one or more computations in parallel with the provided global data
   // passed as immutable arguments. Returns global data output for each
   // computation.
   //
   // TODO(b/74197823): This is a part of a NOT YET ready refactor.
-  tensorflow::Status ExecuteGraphParallel(
-      const ExecuteGraphParallelRequest* arg,
-      ExecuteParallelResponse* result) override;
+  Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
+                              ExecuteParallelResponse* result) override;
 
   // Requests one or more device handles from the target.
   //
@@ -143,9 +140,8 @@ class Service : public ServiceInterface {
   // the first set of replicas, and the next R devices to the second set of
   // replicas, etc. Each returned device handle represents the device with the
   // replica id 0.
-  tensorflow::Status GetDeviceHandles(
-      const GetDeviceHandlesRequest* arg,
-      GetDeviceHandlesResponse* result) override;
+  Status GetDeviceHandles(const GetDeviceHandlesRequest* arg,
+                          GetDeviceHandlesResponse* result) override;
 
   // Asynchronously executes a computation with provided arguments. Invokes
   // the provided computation with the provided global data passed as
@@ -154,38 +150,33 @@ class Service : public ServiceInterface {
   // (Note: The corresponding function in xla::Client was removed as part of
   // b/64116060, in an attempt to simplify our API.  We're keeping this around
   // for now in case we want to expose this to clients in a different way.)
-  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
-                                  ExecuteAsyncResponse* result) override;
+  Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                      ExecuteAsyncResponse* result) override;
 
   // Waits until the specified execution is complete and returns the result.
   // Calling this API multiple times with the same execution handle returns the
   // method with an error since the execution handle is destroyed after the
   // first call.
-  tensorflow::Status WaitForExecution(
-      const WaitForExecutionRequest* arg,
-      WaitForExecutionResponse* result) override;
+  Status WaitForExecution(const WaitForExecutionRequest* arg,
+                          WaitForExecutionResponse* result) override;
 
   // Requests that global data be transferred to the client in literal form.
-  tensorflow::Status TransferToClient(
-      const TransferToClientRequest* arg,
-      TransferToClientResponse* result) override;
+  Status TransferToClient(const TransferToClientRequest* arg,
+                          TransferToClientResponse* result) override;
 
   // Transfers data from a literal provided by the client, into device memory.
-  tensorflow::Status TransferToServer(
-      const TransferToServerRequest* arg,
-      TransferToServerResponse* result) override;
+  Status TransferToServer(const TransferToServerRequest* arg,
+                          TransferToServerResponse* result) override;
 
   // Transfers data from a literal provided by the client, into the Infeed
   // buffer of the device.
-  tensorflow::Status TransferToInfeed(
-      const TransferToInfeedRequest* arg,
-      TransferToInfeedResponse* result) override;
+  Status TransferToInfeed(const TransferToInfeedRequest* arg,
+                          TransferToInfeedResponse* result) override;
 
   // Transfers data from the Outfeed othe device to the literal provided by the
   // client.
-  tensorflow::Status TransferFromOutfeed(
-      const TransferFromOutfeedRequest* arg,
-      TransferFromOutfeedResponse* result) override;
+  Status TransferFromOutfeed(const TransferFromOutfeedRequest* arg,
+                             TransferFromOutfeedResponse* result) override;
 
   // Resets devices, clearing all existing state on all the devices associated
   // with this service (including memory allocated on the devices).
@@ -196,71 +187,65 @@ class Service : public ServiceInterface {
   // ResetDevice should be called before an Execution that expect the device to
   // be in the reset state. For example, if the prior Execution modifies device
   // state (e.g., architectural state) that the next Execution depends on.
-  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
-                                 ResetDeviceResponse* result) override;
+  Status ResetDevice(const ResetDeviceRequest* arg,
+                     ResetDeviceResponse* result) override;
 
   // Tests if an expression is a compile-time constant.
-  tensorflow::Status IsConstant(const IsConstantRequest* arg,
-                                IsConstantResponse* result) override;
+  Status IsConstant(const IsConstantRequest* arg,
+                    IsConstantResponse* result) override;
 
   // Computes the value of a constant expression.
-  tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg,
-                                     ComputeConstantResponse* result) override;
-  tensorflow::Status ComputeConstantGraph(
-      const ComputeConstantGraphRequest* arg,
-      ComputeConstantResponse* result) override;
+  Status ComputeConstant(const ComputeConstantRequest* arg,
+                         ComputeConstantResponse* result) override;
+  Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
+                              ComputeConstantResponse* result) override;
 
   // Returns the shape (with layout) of an array associated with a given data
   // handle.
-  tensorflow::Status GetShape(const GetShapeRequest* arg,
-                              GetShapeResponse* result) override;
+  Status GetShape(const GetShapeRequest* arg,
+                  GetShapeResponse* result) override;
 
   // Returns the program shape of the computation associated with the given
   // handle.
-  tensorflow::Status GetComputationShape(
-      const GetComputationShapeRequest* arg,
-      GetComputationShapeResponse* result) override;
+  Status GetComputationShape(const GetComputationShapeRequest* arg,
+                             GetComputationShapeResponse* result) override;
 
   /////
   // Computation-oriented methods.
 
   // Enqueues an Op on the computation.
-  tensorflow::Status Op(const OpRequest* arg, OpResponse* result) override;
+  Status Op(const OpRequest* arg, OpResponse* result) override;
 
   // Retrieves the inferred shape for a value within a computation.
-  tensorflow::Status GetLocalShape(const GetLocalShapeRequest* arg,
-                                   GetLocalShapeResponse* result) override;
+  Status GetLocalShape(const GetLocalShapeRequest* arg,
+                       GetLocalShapeResponse* result) override;
 
   // Retrieves the statistics of a computation.
-  tensorflow::Status GetComputationStats(
-      const ComputationStatsRequest* arg,
-      ComputationStatsResponse* result) override;
+  Status GetComputationStats(const ComputationStatsRequest* arg,
+                             ComputationStatsResponse* result) override;
 
   // Retrieves the statistics of a computation.
   //
   // TODO(b/74197823): This is a part of a NOT YET ready refactor.
-  tensorflow::Status GetComputationGraphStats(
-      const ComputationGraphStatsRequest* arg,
-      ComputationStatsResponse* result) override;
+  Status GetComputationGraphStats(const ComputationGraphStatsRequest* arg,
+                                  ComputationStatsResponse* result) override;
 
   // Snapshots the current state of a computation handle into a serializable
   // protocol buffer form, so it can be loaded via
   // LoadComputationSnapshot.
-  tensorflow::Status SnapshotComputation(
-      const SnapshotComputationRequest* arg,
-      SnapshotComputationResponse* result) override;
+  Status SnapshotComputation(const SnapshotComputationRequest* arg,
+                             SnapshotComputationResponse* result) override;
 
   // Loads a computation from a serialized protocol buffer created via
   // SnapshotComputation.
-  tensorflow::Status LoadComputationSnapshot(
+  Status LoadComputationSnapshot(
       const LoadComputationSnapshotRequest* arg,
       LoadComputationSnapshotResponse* result) override;
 
   // Creates a unique channel handle that can be used for Send/Recv
   // instructions.
-  tensorflow::Status CreateChannelHandle(
-      const CreateChannelHandleRequest* arg,
-      CreateChannelHandleResponse* result) override;
+  Status CreateChannelHandle(const CreateChannelHandleRequest* arg,
+                             CreateChannelHandleResponse* result) override;
 
   // Returns the ComputationTracker of the current service instance.
   // Only used in unit tests to access user computations from client.
@@ -389,7 +374,7 @@ class Service : public ServiceInterface {
 
   // Convenience function for adding a function to a user computation.
   template <typename RequestT, typename ResponseT>
-  tensorflow::Status AddInstruction(
+  Status AddInstruction(
       const RequestT* arg, ResponseT* result,
       const std::function<StatusOr<ComputationDataHandle>(UserComputation*)>&
           adder);
@@ -397,16 +382,14 @@ class Service : public ServiceInterface {
   // Executes a single computation which has more than one target device.
   // The N devices are expected to all return an empty tuple, but one, which
   // will be the result of this computation.
-  tensorflow::Status ExecuteOneToN(const ExecuteRequest* arg,
-                                   ExecuteResponse* result);
-  tensorflow::Status ExecuteOneToN(const ExecuteGraphRequest* arg,
-                                   ExecuteResponse* result);
+  Status ExecuteOneToN(const ExecuteRequest* arg, ExecuteResponse* result);
+  Status ExecuteOneToN(const ExecuteGraphRequest* arg, ExecuteResponse* result);
 
   // Convenience function which checks whether the given shape_with_layout
   // (presumably passed by the client to set the result layout) is valid for the
   // given computation result shape.
-  tensorflow::Status ValidateResultShapeWithLayout(
-      const Shape& shape_with_layout, const Shape& result_shape) const;
+  Status ValidateResultShapeWithLayout(const Shape& shape_with_layout,
+                                       const Shape& result_shape) const;
 
   // Returns the stream executors assigned to the replicas represented by the
   // given device handle. Each device_handle is a virtual replicated device that
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 48b2922e77b78719e5d3469cbaa4fc15969de91b..3500978bdd808f0c7684d14a05636d90105aa594 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -58,6 +58,8 @@ UnaryOperation OpcodeToUnaryOperation(HloOpcode opcode) {
       return UNOP_COS;
     case HloOpcode::kExp:
       return UNOP_EXP;
+    case HloOpcode::kExpm1:
+      return UNOP_EXPM1;
     case HloOpcode::kFloor:
       return UNOP_FLOOR;
     case HloOpcode::kImag:
@@ -66,6 +68,8 @@ UnaryOperation OpcodeToUnaryOperation(HloOpcode opcode) {
       return UNOP_IS_FINITE;
     case HloOpcode::kLog:
       return UNOP_LOG;
+    case HloOpcode::kLog1p:
+      return UNOP_LOG1P;
     case HloOpcode::kNot:
       return UNOP_NOT;
     case HloOpcode::kNegate:
@@ -168,24 +172,24 @@ bool AllUnique(tensorflow::gtl::ArraySlice<int64> slice) {
   return std::set<int64>(slice.begin(), slice.end()).size() == slice.size();
 }
 
-tensorflow::Status ExpectNotTupleOrOpaque(const Shape& shape,
-                                          tensorflow::StringPiece op_type) {
+Status ExpectNotTupleOrOpaque(const Shape& shape,
+                              tensorflow::StringPiece op_type) {
   if (ShapeUtil::IsTuple(shape)) {
     return InvalidArgument("Expected non-tuple argument for %s, but got %s.",
-                           op_type.ToString().c_str(),
+                           std::string(op_type).c_str(),
                            ShapeUtil::HumanString(shape).c_str());
   } else if (ShapeUtil::IsOpaque(shape)) {
     return InvalidArgument("Expected non-opaque argument for %s, but got %s.",
-                           op_type.ToString().c_str(),
+                           std::string(op_type).c_str(),
                            ShapeUtil::HumanString(shape).c_str());
   } else {
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
 }
 
-tensorflow::Status VerifyReducerShape(const ProgramShape& reducer_shape,
-                                      const Shape& init_value_shape,
-                                      const PrimitiveType& input_element_type) {
+Status VerifyReducerShape(const ProgramShape& reducer_shape,
+                          const Shape& init_value_shape,
+                          const PrimitiveType& input_element_type) {
   if (reducer_shape.parameters_size() != 2) {
     return InvalidArgument(
         "Reduction function must take 2 parameters, but "
@@ -245,7 +249,7 @@ tensorflow::Status VerifyReducerShape(const ProgramShape& reducer_shape,
         ShapeUtil::HumanString(accumulator_shape).c_str());
   }
 
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
@@ -337,7 +341,9 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     case UNOP_COS:
     case UNOP_SIN:
     case UNOP_EXP:
+    case UNOP_EXPM1:
     case UNOP_LOG:
+    case UNOP_LOG1P:
     case UNOP_TANH:
       if (!ShapeUtil::ElementIsFloating(arg) &&
           !ShapeUtil::ElementIsComplex(arg)) {
@@ -1212,11 +1218,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       scale_shape, "scale input of batch norm training"));
 
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(operand_shape) ==
-               tensorflow::Status::OK());
+               Status::OK());
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(offset_shape) ==
-               tensorflow::Status::OK());
+               Status::OK());
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(scale_shape) ==
-               tensorflow::Status::OK());
+               Status::OK());
 
   if (feature_index >= ShapeUtil::Rank(operand_shape)) {
     return InvalidArgument(
@@ -1318,15 +1324,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       scale_shape, "scale input of batch norm inference"));
 
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(operand_shape) ==
-               tensorflow::Status::OK());
+               Status::OK());
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(offset_shape) ==
-               tensorflow::Status::OK());
+               Status::OK());
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(scale_shape) ==
-               tensorflow::Status::OK());
+               Status::OK());
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(mean_shape) ==
-               tensorflow::Status::OK());
+               Status::OK());
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(variance_shape) ==
-               tensorflow::Status::OK());
+               Status::OK());
 
   if (feature_index >= ShapeUtil::Rank(operand_shape)) {
     return InvalidArgument(
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index fb3b5f06dad67b4305aed0305c9f6441e666db53..7d7dcac10b65933d1c81b8aca77465932694bfdb 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 
-#include <set>
 #include <string>
 #include <utility>
 
@@ -25,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -123,6 +123,8 @@ ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s)
 }
 
 ScopedShapedBuffer& ScopedShapedBuffer::operator=(ScopedShapedBuffer&& s) {
+  Deallocate();
+
   *static_cast<ShapedBuffer*>(this) = std::move(static_cast<ShapedBuffer&>(s));
   allocator_ = s.allocator_;
   // Null out s.allocator_ so it doesn't try to free anything in its destructor.
@@ -130,7 +132,15 @@ ScopedShapedBuffer& ScopedShapedBuffer::operator=(ScopedShapedBuffer&& s) {
   return *this;
 }
 
-ScopedShapedBuffer::~ScopedShapedBuffer() {
+ScopedShapedBuffer::~ScopedShapedBuffer() { Deallocate(); }
+
+ShapedBuffer ScopedShapedBuffer::release() {
+  ShapedBuffer shaped_buffer(static_cast<ShapedBuffer&&>(*this));
+  buffers_ = ShapeTree<se::DeviceMemoryBase>();
+  return shaped_buffer;
+}
+
+void ScopedShapedBuffer::Deallocate() {
   // allocator_ will be null if we were moved-from.
   if (allocator_ == nullptr) {
     return;
@@ -138,22 +148,14 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
   // Deallocate all non-null buffers. A buffer may appear in more than one spot
   // in the shape (eg, a tuple with a repeated element) so keep track of what
   // has been deallocated.
-  std::set<void*> deallocated_opaques;
+  tensorflow::gtl::FlatSet<void*> deallocated_ptrs;
   for (auto& pair : buffers_) {
     se::DeviceMemoryBase& memory_base = pair.second;
     if (!memory_base.is_null() &&
-        deallocated_opaques.count(memory_base.opaque()) == 0) {
-      deallocated_opaques.insert(memory_base.opaque());
-      TF_CHECK_OK(
-          this->allocator_->Deallocate(this->device_ordinal(), &memory_base));
+        deallocated_ptrs.insert(memory_base.opaque()).second) {
+      TF_CHECK_OK(allocator_->Deallocate(device_ordinal(), memory_base));
     }
   }
 }
 
-ShapedBuffer ScopedShapedBuffer::release() {
-  ShapedBuffer shaped_buffer(static_cast<ShapedBuffer&&>(*this));
-  buffers_ = ShapeTree<se::DeviceMemoryBase>();
-  return shaped_buffer;
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index e10fca9e9466c018f6cb4da2f5618e4db4977307..905a7e82e621f2bf4588b71be5dbab20f892cafe 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -148,13 +148,29 @@ class ScopedShapedBuffer : public ShapedBuffer {
   // ScopedShapedBuffer.
   DeviceMemoryAllocator* memory_allocator() const { return allocator_; }
 
-  // Releases all device memory owned by this ScopedShapedBuffer and returns the
-  // device memory pointers in the form of a ShapedBuffer. The returned
-  // ShapedBuffer takes over the memory from the ScopedShapedBuffer. The
-  // resulting ScopedShapedBuffer can only be destroyed.
-  ShapedBuffer release();
+  // Sets the device memory buffer at the given index.
+  //
+  // If the given buffer's device memory is non-null, its device_ordinal and
+  // allocator must match those in `this`.
+  void set_buffer(OwningDeviceMemory buffer, const ShapeIndex& index) {
+    if (!buffer.is_null()) {
+      CHECK_EQ(buffer.device_ordinal(), device_ordinal());
+      CHECK_EQ(buffer.allocator(), allocator_);
+      *buffers_.mutable_element(index) = buffer.Forget();
+    } else {
+      *buffers_.mutable_element(index) = se::DeviceMemoryBase();
+    }
+  }
+
+  // Like unique_ptr::release(), creates and returns a regular ShapedBuffer from
+  // this ScopedShapedBuffer, without freeing any of the associated memory.
+  //
+  // It's the caller's job to ensure that the memory contained therein is freed.
+  TF_MUST_USE_RESULT ShapedBuffer release();
 
  protected:
+  void Deallocate();
+
   DeviceMemoryAllocator* allocator_;
 };
 
diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0fc243667911651c788e3c1e5f1d39d86170f1ad
--- /dev/null
+++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
@@ -0,0 +1,110 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/shaped_buffer.h"
+
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace xla {
+namespace {
+
+TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) {
+  TF_ASSERT_OK_AND_ASSIGN(auto platforms,
+                          xla::PlatformUtil::GetSupportedPlatforms());
+  ASSERT_FALSE(platforms.empty());
+  auto* platform = platforms[0];
+  TF_ASSERT_OK_AND_ASSIGN(auto executors,
+                          xla::PlatformUtil::GetStreamExecutors(platform));
+  xla::StreamExecutorMemoryAllocator allocator(platform, executors);
+  const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {});
+  const int kDeviceOrdinal = 0;
+  auto scoped_buffer = tensorflow::MakeUnique<xla::ScopedShapedBuffer>(
+      shape, shape, &allocator, kDeviceOrdinal);
+  std::unique_ptr<xla::ShapedBuffer> buffer = std::move(scoped_buffer);
+  buffer = nullptr;
+}
+
+class TestAllocator : public DeviceMemoryAllocator {
+ public:
+  TestAllocator()
+      : DeviceMemoryAllocator(PlatformUtil::GetDefaultPlatform().ValueOrDie()) {
+  }
+
+  ~TestAllocator() override {
+    if (!allocations_.empty()) {
+      ADD_FAILURE() << "Some allocations not freed!";
+    }
+  }
+
+  // Pull in two-arg overload of Allocate.
+  using DeviceMemoryAllocator::Allocate;
+
+  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
+                                        bool /*retry_on_failure*/) override {
+    // By contract, we must return null if size == 0.
+    if (size == 0) {
+      return OwningDeviceMemory();
+    }
+    void* buf = malloc(size);
+    allocations_.insert({device_ordinal, buf});
+    return OwningDeviceMemory(se::DeviceMemoryBase(buf, size), device_ordinal,
+                              this);
+  }
+
+  Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override {
+    if (mem.is_null()) {
+      return Status::OK();
+    }
+
+    auto it = allocations_.find({device_ordinal, mem.opaque()});
+    if (it == allocations_.end()) {
+      ADD_FAILURE() << "Allocation not found (double free?)";
+    } else {
+      free(mem.opaque());
+      allocations_.erase(it);
+    }
+    return Status::OK();
+  }
+
+  bool AllowsAsynchronousDeallocation() const override { return false; }
+
+ private:
+  std::set<std::pair</*device_ordinal*/ int64, void*>> allocations_;
+};
+
+TEST(ScopedShapedBufferTest, TestMoveAssignmentOperator) {
+  Shape s = ShapeUtil::MakeShape(F32, {1});
+  TestAllocator allocator;
+  ScopedShapedBuffer sb1(s, s, &allocator, /*device_ordinal=*/0);
+  sb1.set_buffer(
+      allocator.Allocate(/*device_ordinal=*/0, /*size=*/42).ValueOrDie(),
+      /*index=*/{});
+
+  ScopedShapedBuffer sb2(s, s, &allocator, /*device_ordinal=*/1);
+  sb2.set_buffer(
+      allocator.Allocate(/*device_ordinal=*/1, /*size=*/10).ValueOrDie(),
+      /*index=*/{});
+
+  sb1 = std::move(sb2);
+
+  // TestAllocator's destructor checks that all memory was freed.
+}
+
+}  // anonymous namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index 8b71a415091f028b3167cddb2583754e72ba17c8..c4d01562c4e32225ebb984d8fcd93ec3fa86e403 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -37,7 +37,7 @@ TransferManager::GetPlatformTransferManagers() {
 }
 
 Status TransferManager::TransferArrayToDevice(
-    se::StreamExecutor* executor, const Literal& literal,
+    se::StreamExecutor* executor, const LiteralSlice& literal,
     const se::DeviceMemoryBase& dest) {
   const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
   TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape))
@@ -196,9 +196,11 @@ StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer(
     const ShapeIndex& index = pair.first;
     se::DeviceMemoryBase& memory_base = pair.second;
     const Shape& subshape = ShapeUtil::GetSubshape(on_device_shape, index);
-    TF_ASSIGN_OR_RETURN(memory_base,
+    TF_ASSIGN_OR_RETURN(auto memory,
                         allocator->Allocate(shaped_buffer.device_ordinal(),
                                             GetByteSizeRequirement(subshape)));
+    // Move the allocated buffer into the ScopedShapedBuffer, which owns it.
+    memory_base = memory.Forget();
   }
 
   return std::move(shaped_buffer);
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index d82b4f0f81b5da38c1caf80bddefa0d3f7842463..43a8092b06fba0e2495bce0ee1a309c85a908273 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -65,14 +65,14 @@ class TransferManager {
   // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible,
   // but need not have the same layout
   virtual Status TransferLiteralToDevice(se::StreamExecutor* executor,
-                                         const Literal& literal,
+                                         const LiteralSlice& literal,
                                          const ShapedBuffer& device_buffer) = 0;
 
   // Convenience methods for transferring an array to or from the device at a
   // known address. This avoids having to construct a ShapedBuffer just to
   // transfer an array at a known address.
   Status TransferArrayToDevice(se::StreamExecutor* executor,
-                               const Literal& literal,
+                               const LiteralSlice& literal,
                                const se::DeviceMemoryBase& dest);
   StatusOr<std::unique_ptr<Literal>> TransferArrayFromDevice(
       se::StreamExecutor* executor, const Shape& shape,
@@ -81,7 +81,7 @@ class TransferManager {
   // Transfers the given literal into the Infeed interface of the device,
   // using the given executor.
   virtual Status TransferLiteralToInfeed(se::StreamExecutor* executor,
-                                         const Literal& literal) = 0;
+                                         const LiteralSlice& literal) = 0;
 
   // Transfers the given literal from the Outfeed interface of the device,
   // using the given executor.
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index 3efd38ce0daa3e3f3398b32463019df6cd10a009..ba16dc640e2d2974eab4fc8b134a6e33c03e3b85 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -35,7 +35,8 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoDot(
     const HloInstruction& dot,
     const TransposeFolding::TransposableGemmOperandsFn&
         transposable_gemm_operands) {
-  if (HloOpcode::kDot != dot.opcode()) {
+  if (HloOpcode::kDot != dot.opcode() ||
+      dot.dot_dimension_numbers().lhs_batch_dimensions_size() != 0) {
     return {};
   }
 
@@ -44,6 +45,8 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoDot(
     auto& operand = *dot.operand(i);
     if (operand.IsRank2Transpose()) {
       operand_set.push_back(i);
+    } else if (ShapeUtil::Rank(operand.shape()) != 2) {
+      return {};
     }
   }
 
@@ -74,23 +77,39 @@ using InstructionOperandsPair =
 
 // Folds the operands of `dot` that are foldable transposes. `computation` is
 // the parent HLO computation of `dot`.
-//
-// Returns whether the module is changed.
-bool FoldTransposeIntoDot(InstructionOperandsPair pair) {
-  auto* dot = pair.first;
-  std::vector<HloInstruction*> instructions_to_fuse(1, dot);
-  for (const int64 operand_index : pair.second) {
-    instructions_to_fuse.push_back(dot->mutable_operand(operand_index));
-  }
-
-  // Early-exit if no operands are foldable.
-  if (instructions_to_fuse.size() == 1) {
-    return false;
+Status FoldTransposeIntoDot(InstructionOperandsPair pair) {
+  HloInstruction* dot = pair.first;
+
+  DotDimensionNumbers new_dim_numbers = dot->dot_dimension_numbers();
+  HloInstruction* new_lhs = dot->mutable_operand(0);
+  HloInstruction* new_rhs = dot->mutable_operand(1);
+
+  CHECK_EQ(new_dim_numbers.lhs_batch_dimensions_size(), 0);
+  CHECK_EQ(new_dim_numbers.rhs_batch_dimensions_size(), 0);
+  CHECK_EQ(new_dim_numbers.lhs_contracting_dimensions_size(), 1);
+  CHECK_EQ(new_dim_numbers.rhs_contracting_dimensions_size(), 1);
+
+  for (int64 operand_index : pair.second) {
+    // We've checked that there aren't any batch dimensions and that the inputs
+    // are rank 2, and shape inference guarantees that there is exactly one
+    // contracting dimension.
+    if (operand_index == 0) {
+      CHECK_EQ(new_lhs->opcode(), HloOpcode::kTranspose);
+      new_dim_numbers.set_lhs_contracting_dimensions(
+          0, 1 - new_dim_numbers.lhs_contracting_dimensions(0));
+      new_lhs = new_lhs->mutable_operand(0);
+    } else {
+      CHECK_EQ(operand_index, 1);
+      CHECK_EQ(new_rhs->opcode(), HloOpcode::kTranspose);
+      new_dim_numbers.set_rhs_contracting_dimensions(
+          0, 1 - new_dim_numbers.rhs_contracting_dimensions(0));
+      new_rhs = new_rhs->mutable_operand(0);
+    }
   }
 
-  dot->parent()->CreateFusionInstruction(
-      instructions_to_fuse, HloInstruction::FusionKind::kTransposeDot);
-  return true;
+  std::unique_ptr<HloInstruction> new_dot = HloInstruction::CreateDot(
+      dot->shape(), new_lhs, new_rhs, new_dim_numbers);
+  return dot->parent()->ReplaceWithNewInstruction(dot, std::move(new_dot));
 }
 
 // Folds the operands of `convolution` that are foldable transposes.
@@ -196,7 +215,7 @@ StatusOr<bool> TransposeFolding::Run(HloModule* module) {
             std::make_pair(instruction, operand_indices));
       }
     }
-    return tensorflow::Status::OK();
+    return Status::OK();
   };
 
   for (auto* comp : module->MakeNonfusionComputations()) {
@@ -205,7 +224,8 @@ StatusOr<bool> TransposeFolding::Run(HloModule* module) {
 
   bool changed = false;
   for (InstructionOperandsPair& pair : foldable_dots) {
-    changed |= FoldTransposeIntoDot(pair);
+    TF_RETURN_IF_ERROR(FoldTransposeIntoDot(pair));
+    changed = true;
   }
   for (InstructionOperandsPair& pair : foldable_convolutions) {
     changed |= FoldTransposeIntoConvolution(pair);
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 0319109f7fc54c6abfe7627bcaff747bade90f41..f73f1227aaf1630a9e7c43bb508732c5518ef929 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -31,9 +32,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 
@@ -54,83 +58,102 @@ class TransposeFoldingTest : public HloTestBase {
 };
 
 TEST_F(TransposeFoldingTest, FoldDotTranspose) {
-  auto builder = HloComputation::Builder("entry_computation");
-  HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
-      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3}),
-      /*name=*/"x"));
-  HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
-      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3}),
-      /*name=*/"y"));
-  HloInstruction* transpose_y =
-      builder.AddInstruction(HloInstruction::CreateTranspose(
-          ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0}));
-  DotDimensionNumbers dot_dnums;
-  dot_dnums.add_lhs_contracting_dimensions(1);
-  dot_dnums.add_rhs_contracting_dimensions(0);
-  HloInstruction* dot = builder.AddInstruction(
-      HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x,
-                                /*rhs=*/transpose_y, dot_dnums));
+  string hlo_string = R"(
+HloModule FoldDotTranspose
+
+ENTRY entry_computation {
+  x = f32[2,3]{1,0} parameter(0)
+  y = f32[2,3]{1,0} parameter(1)
+  transpose = f32[3,2]{1,0} transpose(y), dimensions={1,0}
+  ROOT dot = f32[2,2]{1,0} dot(x, transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
 
-  auto module = CreateNewModule("test_module");
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build(dot));
   FoldTranspose(module.get());
 
-  // Instructions after folding: x, y, and the fusion.
-  std::unordered_set<HloInstruction*> instruction_set(
-      entry_computation->instructions().begin(),
-      entry_computation->instructions().end());
-  CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation.";
-  CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation.";
-  CHECK_EQ(1, instruction_set.size())
-      << "entry_computation should contain exactly 3 instructions.";
-  HloInstruction* fusion = *instruction_set.begin();
-  EXPECT_EQ(HloOpcode::kFusion, fusion->opcode());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Dot(op::Parameter(0), op::Parameter(1),
+                      /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/1));
+}
+
+TEST_F(TransposeFoldingTest, DontFoldTransposeOfBatchDim) {
+  string hlo_string = R"(
+HloModule FoldDotTranspose
 
-  // The fusion instruction should contain two parameters, one transpose and
-  // one dot.
-  EXPECT_EQ(4, fusion->fused_instruction_count());
+ENTRY entry_computation {
+  x = f32[2,3] parameter(0)
+  y = f32[3,2] parameter(1)
+  transpose = f32[2,3] transpose(y), dimensions={1,0}
+  ROOT dot = f32[2] dot(x, transpose), lhs_batch_dims={0}, rhs_batch_dims={0}, lhs_contracting_dims={1}, rhs_contracting_dims={1}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+
+  TransposeFolding transpose_folding(
+      [](const HloInstruction& dot,
+         const TransposeFolding::OperandIndices& candidate_operands) {
+        return candidate_operands;
+      },
+      [](const HloInstruction& convolution,
+         const TransposeFolding::OperandIndices& candidate_operands) {
+        return candidate_operands;
+      });
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, transpose_folding.Run(module.get()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(TransposeFoldingTest, DontFoldTransposeOfRank1Dot) {
+  string hlo_string = R"(
+HloModule FoldDotTranspose
+
+ENTRY entry_computation {
+  x = f32[3] parameter(0)
+  y = f32[3,2] parameter(1)
+  transpose = f32[2,3] transpose(y), dimensions={1,0}
+  ROOT dot = f32[2] dot(x, transpose), lhs_batch_dims={}, rhs_batch_dims={0}, lhs_contracting_dims={0}, rhs_contracting_dims={1}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+
+  TransposeFolding transpose_folding(
+      [](const HloInstruction& dot,
+         const TransposeFolding::OperandIndices& candidate_operands) {
+        return candidate_operands;
+      },
+      [](const HloInstruction& convolution,
+         const TransposeFolding::OperandIndices& candidate_operands) {
+        return candidate_operands;
+      });
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, transpose_folding.Run(module.get()));
+  EXPECT_FALSE(changed);
 }
 
 TEST_F(TransposeFoldingTest, FoldDotTransposeConstant) {
-  auto builder = HloComputation::Builder("entry_computation");
-  // 2x1
-  HloInstruction* const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR2<float>({{1}, {2}})));
-  // 3x2
-  HloInstruction* const1 =
-      builder.AddInstruction(HloInstruction::CreateConstant(
-          Literal::CreateR2<float>({{1, 2}, {3, 4}, {5, 6}})));
-  HloInstruction* transpose0 =
-      builder.AddInstruction(HloInstruction::CreateTranspose(
-          ShapeUtil::MakeShape(F32, {1, 2}), const0, {1, 0}));
-  HloInstruction* transpose1 =
-      builder.AddInstruction(HloInstruction::CreateTranspose(
-          ShapeUtil::MakeShape(F32, {2, 3}), const1, {1, 0}));
-  DotDimensionNumbers dot_dnums;
-  dot_dnums.add_lhs_contracting_dimensions(1);
-  dot_dnums.add_rhs_contracting_dimensions(0);
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
-      ShapeUtil::MakeShape(F32, {1, 3}),
-      /*lhs=*/transpose0, /*rhs=*/transpose1, dot_dnums));
+  string hlo_string = R"(
+HloModule FoldDotTransposeConstant
+
+ENTRY entry_computation {
+  constant = f32[2,1]{1,0} constant(f32[2,1] { { 1 }, { 2 } })
+  transpose = f32[1,2]{1,0} transpose(constant), dimensions={1,0}
+  constant.1 = f32[3,2]{1,0} constant(f32[3,2] { { 1, 2 }, { 3, 4 }, { 5, 6 } })
+  transpose.1 = f32[2,3]{1,0} transpose(constant.1), dimensions={1,0}
+  ROOT dot = f32[1,3]{1,0} dot(transpose, transpose.1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
 
-  auto module = CreateNewModule("test_module");
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build(dot));
   FoldTranspose(module.get());
 
-  for (auto* instruction : entry_computation->instructions()) {
-    if (instruction->opcode() == HloOpcode::kFusion) {
-      CHECK_EQ(2, instruction->operand_count());
-      EXPECT_EQ(const0, instruction->operand(0));
-      EXPECT_EQ(const1, instruction->operand(1));
-    }
-  }
-
-  // The created fusion instruction should contain two parameters, two
-  // transposes (one for each parameter) and one dot.
-  EXPECT_EQ(5,
-            entry_computation->root_instruction()->fused_instruction_count());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Dot(op::Constant(), op::Constant(),
+                      /*lhs_contracting_dim=*/0, /*rhs_contracting_dim=*/1));
 }
 
 TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) {
@@ -164,50 +187,32 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) {
   EXPECT_EQ(6, callee_computation->instruction_count());
 }
 
-TEST_F(TransposeFoldingTest, FoldDotTransposeInWhile) {
-  auto builder = HloComputation::Builder("entry_computation");
-  HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
-      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3}),
-      /*name=*/"x"));
-  HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
-      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3}),
-      /*name=*/"y"));
-  HloInstruction* transpose_y =
-      builder.AddInstruction(HloInstruction::CreateTranspose(
-          ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0}));
-  DotDimensionNumbers dot_dnums;
-  dot_dnums.add_lhs_contracting_dimensions(1);
-  dot_dnums.add_rhs_contracting_dimensions(0);
-  HloInstruction* dot = builder.AddInstruction(
-      HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x,
-                                /*rhs=*/transpose_y, dot_dnums));
-
-  auto module = CreateNewModule("test_module");
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build(dot));
+TEST_F(TransposeFoldingTest, FoldDotTransposeInCall) {
+  string hlo_string = R"(
+HloModule FoldDotTransposeInCall
 
-  HloInstruction* call = module->OutlineExpressionFromComputation(
-      {transpose_y, dot}, "outlined", entry_computation);
+callee {
+  name.0 = f32[2,3]{1,0} parameter(0)
+  name.1 = f32[2,3]{1,0} parameter(1)
+  transpose.clone = f32[3,2]{1,0} transpose(name.0), dimensions={1,0}
+  ROOT dot.clone = f32[2,2]{1,0} dot(name.1, transpose.clone), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
 
+ENTRY entry_computation {
+  y = f32[2,3]{1,0} parameter(1)
+  x = f32[2,3]{1,0} parameter(0)
+  ROOT call = f32[2,2]{1,0} call(y, x), to_apply=callee
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
   FoldTranspose(module.get());
 
-  // Instructions after folding: x, y, and the fusion.
-  std::unordered_set<HloInstruction*> instruction_set(
-      entry_computation->instructions().begin(),
-      entry_computation->instructions().end());
-  CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation.";
-  CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation.";
-  CHECK_EQ(1, instruction_set.erase(call))
-      << "call is not in entry_computation.";
-  CHECK(instruction_set.empty())
-      << "entry_computation should contain exactly 3 instructions.";
-  HloInstruction* fusion =
-      call->called_computations().front()->root_instruction();
-  EXPECT_EQ(HloOpcode::kFusion, fusion->opcode());
-
-  // The fusion instruction should contain two parameters, one transpose and
-  // one dot.
-  EXPECT_EQ(4, fusion->fused_instruction_count());
+  const HloComputation* callee = module->GetComputationWithName("callee");
+  ASSERT_NE(callee, nullptr);
+  EXPECT_THAT(callee->root_instruction(),
+              op::Dot(op::Parameter(1), op::Parameter(0),
+                      /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/1));
 }
 
 // Test that a two dimension swap of the kernel gets folded into convolution.
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 657a8fe09ae9df906d695f7f49df72500d611792..8cb654493ca82dc702b2c1e7a4284f4f31d1e5f9 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -588,4 +588,201 @@ void TuplePointsToAnalysis::InstructionToString(
   });
 }
 
+bool TuplePointsToAnalysis::DoesNotUseOperandBuffer(
+    const HloInstruction* operand, const ShapeIndex& index,
+    const HloInstruction* user) const {
+  CHECK(user->IsUserOf(operand))
+      << "user: " << user->ToString() << " operand: " << operand->ToString();
+  if (user->opcode() == HloOpcode::kGetTupleElement && !index.empty()) {
+    // GetTupleElement instructions only access the top-level buffer of their
+    // operand.
+    return true;
+  } else if (user->opcode() == HloOpcode::kFusion &&
+             user->fusion_kind() == HloInstruction::FusionKind::kLoop) {
+    // Find fusion parameter associated with 'operand'.
+    auto it = std::find_if(
+        user->fused_parameters().begin(), user->fused_parameters().end(),
+        [=](HloInstruction* fused_param) {
+          return user->operand(fused_param->parameter_number()) == operand;
+        });
+    CHECK(it != user->fused_parameters().end());
+    // Iterate through all users of all buffer aliases of the buffer in the
+    // points-to set of fusion parameter at 'index'.
+    // Return false if any uses are detected at 'index', returns true otherwise.
+    const LogicalBuffer* buffer = GetBufferDefinedAt(*it, index).ValueOrDie();
+    for (const BufferAlias& alias : GetBufferAliases(*buffer)) {
+      for (HloInstruction* alias_user : alias.instruction()->users()) {
+        if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(),
+                                    alias_user)) {
+          continue;
+        }
+        // Return false: use detected at 'buffer' -> 'alias' -> 'alias_user'.
+        return false;
+      }
+    }
+    // Return true: found no uses of 'operand' at 'index' in 'user'.
+    return true;
+  }
+  return false;
+}
+
+// Returns all uses of all aliases of 'instruction' at 'index' in 'uses'.
+// Each use in 'uses' is a pair (HloInstruction* user, int64 operand_index)
+// where 'user' is a user of an alias of 'instruction' at 'index', and
+// 'operand_index' is the operand index at which the alias appears in the
+// operand list of 'user'.
+std::vector<std::pair<HloInstruction*, int64>>
+TuplePointsToAnalysis::GetAllUsesOfInstructionAtIndex(
+    HloInstruction* instruction, const ShapeIndex& index) const {
+  std::vector<std::pair<HloInstruction*, int64>> uses;
+  const PointsToSet::BufferList& points_to =
+      GetPointsToSet(instruction).element(index);
+  for (const LogicalBuffer* buffer : points_to) {
+    for (const BufferAlias& alias : GetBufferAliases(*buffer)) {
+      for (HloInstruction* alias_user : alias.instruction()->users()) {
+        if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(),
+                                    alias_user)) {
+          continue;
+        }
+        for (int64 op_idx : alias_user->OperandIndices(alias.instruction())) {
+          uses.emplace_back(alias_user, op_idx);
+        }
+      }
+    }
+  }
+  return uses;
+}
+
+// Returns true if there is exactly one use of 'operand' at 'operand_index'
+// in 'fusion.fused_instructions', where the singleton use is the fused
+// root at operand index 'use_operand_index'. Returns false otherwise.
+//
+// REQUIRES: 'fusion' opcode is a kFusion instruction.
+bool TuplePointsToAnalysis::HasUniqueFusedUseOfOperandAt(
+    HloInstruction* operand, const ShapeIndex& operand_index,
+    HloInstruction* fusion, const int64 use_operand_index) const {
+  CHECK_EQ(HloOpcode::kFusion, fusion->opcode());
+  // Check that 'operand' is unique in the operand list of 'fusion'.
+  if (fusion->OperandIndices(operand).size() > 1) {
+    return false;
+  }
+  // Find fusion parameter associated with 'operand'.
+  const auto& fused_params = fusion->fused_parameters();
+  auto fused_param_it = std::find_if(
+      fused_params.begin(), fused_params.end(),
+      [&](HloInstruction* fused_param) {
+        return fusion->operand(fused_param->parameter_number()) == operand;
+      });
+  if (fused_param_it == fused_params.end()) {
+    return false;
+  }
+  auto* fused_param = *fused_param_it;
+  // Get all uses of 'operand' at 'index' from 'fusion.fused_instructions'.
+  auto fused_param_uses =
+      GetAllUsesOfInstructionAtIndex(fused_param, operand_index);
+  // Return true iff there is exactly one use of 'operand' at 'index', and
+  // this singleton use is the fused root (at index in 'use_operand_indices').
+  return fused_param_uses.size() == 1 &&
+         fused_param_uses[0].first == fusion->fused_expression_root() &&
+         fused_param_uses[0].second == use_operand_index;
+}
+
+// User and operand can share buffers iff both instructions emit the same shape
+// and layout, and 'user' meets one of the following qualifications:
+//
+// (1) Is element-wise. Or...
+// (2) Is a loop fusion instruction where the only use of 'operand' at 'index'
+//     in the set 'user.fused_instructions' is a DynamicUpdateSlice fused root
+//     at operand 0. Or...
+// (3) Is a kDot -> kAdd output fusion instruction where the only use of
+//     'operand' at 'index' in the set 'user.fused_instructions' is a kAdd fused
+//     root at operand 0 or 1. Or...
+// (4) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index
+//     0.
+//
+// (2) and (3) can only be determined if points-to analysis is available.
+bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
+    HloInstruction* operand, const ShapeIndex& operand_index,
+    HloInstruction* user, const ShapeIndex& user_index) const {
+  CHECK(user->IsUserOf(operand))
+      << "user: " << user->ToString() << " operand: " << operand->ToString();
+  const Shape& operand_subshape =
+      ShapeUtil::GetSubshape(operand->shape(), operand_index);
+  const Shape& user_subshape =
+      ShapeUtil::GetSubshape(user->shape(), user_index);
+  // Check that operand and user emit the same shape and layout.
+  if (!ShapeUtil::Equal(operand_subshape, user_subshape)) {
+    return false;
+  }
+  if (user->opcode() == HloOpcode::kFusion) {
+    if (user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
+        user->fused_expression_root()->opcode() ==
+            HloOpcode::kDynamicUpdateSlice) {
+      // Loop fusion with kDynamicUpdateSlice fused root.
+      //
+      // Returns true iff there is exactly one use of 'operand' at shape index
+      // 'operand_index', and this singleton use is the fused root at operand
+      // index 0.
+      return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0);
+    } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
+               user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
+      // Output fusion with kAdd fused root.
+
+      // Check if one operand of kAdd fused root is kDot or kConvolution.
+      auto* add = user->fused_expression_root();
+      auto add_operand_it =
+          std::find_if(add->operands().begin(), add->operands().end(),
+                       [&](HloInstruction* operand) {
+                         return operand->opcode() == HloOpcode::kConvolution ||
+                                operand->opcode() == HloOpcode::kDot;
+                       });
+      if (add_operand_it == add->operands().end()) {
+        return false;
+      }
+      auto* matched_add_operand = *add_operand_it;
+      // Calculate operand index of 'add' operand which was not matched above.
+      const int64 other_add_operand_index =
+          matched_add_operand == add->operand(0) ? 1 : 0;
+      // Returns true iff there is exactly one use of 'operand' at shape index
+      // 'operand_index', and this singleton use is the fused root (at operand
+      // index 'other_add_operand_index').
+      return HasUniqueFusedUseOfOperandAt(operand, operand_index, user,
+                                          other_add_operand_index);
+    }
+  }
+  if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
+      user->opcode() == HloOpcode::kWhile) {
+    // We eliminated other users in BufferLiveness::live_range_strictly_before,
+    // so here we just need to check that the use is at operand index 0.
+    std::vector<int64> operand_indices = user->OperandIndices(operand);
+    return operand_indices.size() == 1 && operand_indices[0] == 0;
+  }
+  if (user->opcode() == HloOpcode::kCall) {
+    // TODO(b/62548313): Remove when buffer assignment is module scoped and
+    // does not assign buffers to calls.
+    // Find called computation parameter associated with 'operand'.
+    const std::vector<int64> operand_indices = user->OperandIndices(operand);
+    if (operand_indices.size() > 1) {
+      return false;
+    }
+    CHECK_EQ(1, operand_indices.size());
+    auto* param = user->to_apply()->parameter_instruction(operand_indices[0]);
+    // Get all uses of 'operand' at 'index' in called computation.
+    auto param_uses = GetAllUsesOfInstructionAtIndex(param, operand_index);
+
+    // Return true iff:
+    // *) There exists exactly one use of 'operand' in called computation.
+    // *) The unique use is by the root instruction of called computation.
+    //    (Note: we check the root of the called computation, because the
+    //     root result buffer is required to alias with the Call result buffer).
+    // *) The root instruction of the called computation is element-wise on
+    //    'operand'.
+    auto* callee_root = user->to_apply()->root_instruction();
+    return param_uses.size() == 1 && param_uses[0].first == callee_root &&
+           callee_root->IsElementwiseOnOperand(param_uses[0].second);
+  }
+  // Check if 'user' is element-wise.
+  return user->IsElementwise();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index c3743b150168ebcf1051050dc511e50c43108c4f..1ac713013650d807b15e33565e6d2dec406a5d13 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -256,6 +256,23 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
 
   string ToString() const;
 
+  // Returns true if 'user' cannot possibly use the buffer at 'index' in
+  // 'operand'. Returns false otherwise.
+  //
+  // REQUIRES: 'operand' is an operand of 'user'.
+  bool DoesNotUseOperandBuffer(const HloInstruction* operand,
+                               const ShapeIndex& index,
+                               const HloInstruction* user) const;
+
+  // Returns true if 'user' (at 'user_index') can share a buffer with its
+  // operand 'operand' (at 'operand_index'). Returns false otherwise.
+  //
+  // REQUIRES: 'operand' is an operand of 'user'.
+  bool CanShareOperandBufferWithUser(HloInstruction* operand,
+                                     const ShapeIndex& operand_index,
+                                     HloInstruction* user,
+                                     const ShapeIndex& user_index) const;
+
  private:
   explicit TuplePointsToAnalysis(
       const HloModule* module,
@@ -310,6 +327,13 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
     return &per_instruction_[id];
   }
 
+  std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex(
+      HloInstruction* instruction, const ShapeIndex& index) const;
+  bool HasUniqueFusedUseOfOperandAt(HloInstruction* operand,
+                                    const ShapeIndex& operand_index,
+                                    HloInstruction* fusion,
+                                    const int64 use_operand_index) const;
+
   // The module this analysis is performed on.
   const HloModule* module_;
 
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index dec446d4dac650ba43992f7870764eedc80cb2cf..f558316b05b168a6f100e8ef69adfd9dbc023102 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -805,5 +805,348 @@ TEST_F(FusionPointsToAnalysisTest, FusionParam0TwoUsers) {
   Run(/*add_additional_gte0_user=*/true);
 }
 
+class PointsToAnalysisTestBase : public HloTestBase {
+ protected:
+  void BuildModule(std::unique_ptr<HloComputation> computation) {
+    module_ = CreateNewModule();
+    computation_ = module_->AddEntryComputation(std::move(computation));
+  }
+
+  void RunAnalysis() {
+    CHECK_NOTNULL(module_.get());
+    points_to_analysis_ =
+        TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
+  }
+
+  void BuildModuleAndRunAnalysis(std::unique_ptr<HloComputation> computation) {
+    BuildModule(std::move(computation));
+    RunAnalysis();
+  }
+
+  std::unique_ptr<HloModule> module_;
+  HloComputation* computation_ = nullptr;
+  std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_;
+};
+
+class DoesNotUseOperandBufferTest : public PointsToAnalysisTestBase {};
+
+TEST_F(DoesNotUseOperandBufferTest, GetTupleElement) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape elem_shape = ShapeUtil::MakeShape(F32, {8});
+  auto tuple = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({elem_shape, elem_shape}), "tuple"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(elem_shape, tuple, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(elem_shape, tuple, 1));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(elem_shape, HloOpcode::kAdd, gte0, gte1));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  // GetTupleElement instructions only access the top-level buffer of their
+  // operand.
+  EXPECT_TRUE(points_to_analysis_->DoesNotUseOperandBuffer(tuple, {0}, gte0));
+  EXPECT_TRUE(points_to_analysis_->DoesNotUseOperandBuffer(tuple, {1}, gte1));
+  EXPECT_FALSE(points_to_analysis_->DoesNotUseOperandBuffer(tuple, {}, gte0));
+  EXPECT_FALSE(points_to_analysis_->DoesNotUseOperandBuffer(tuple, {}, gte1));
+}
+
+TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+  auto tuple = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 1));
+
+  // Create a DynamicUpdateSlice instruction of tuple element 1.
+  auto starts = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
+  auto update = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR1<float>({2.f, 2.f, 2.f})));
+  auto dynamic_update_slice =
+      builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+          data_shape, gte1, update, starts));
+  builder.AddInstruction(
+      HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {dynamic_update_slice, starts, update, gte1},
+      HloInstruction::FusionKind::kLoop);
+  RunAnalysis();
+
+  // The fusion instruction never uses tuple element 0, but does use element 1.
+  EXPECT_TRUE(points_to_analysis_->DoesNotUseOperandBuffer(tuple, {0}, fusion));
+  EXPECT_FALSE(
+      points_to_analysis_->DoesNotUseOperandBuffer(tuple, {1}, fusion));
+}
+
+class CanShareOperandBufferWithUserTest : public PointsToAnalysisTestBase {};
+
+TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape shape = ShapeUtil::MakeShape(F32, {8});
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kExp, param));
+  auto log = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kLog, exp));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  EXPECT_TRUE(
+      points_to_analysis_->CanShareOperandBufferWithUser(param, {}, exp, {}));
+  EXPECT_TRUE(
+      points_to_analysis_->CanShareOperandBufferWithUser(exp, {}, log, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape in_shape = ShapeUtil::MakeShape(F32, {8});
+  Shape out_shape = ShapeUtil::MakeShape(PRED, {8});
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, in_shape, "param0"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, in_shape, "param1"));
+  auto result = builder.AddInstruction(
+      HloInstruction::CreateBinary(out_shape, HloOpcode::kEq, param0, param1));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(param0, {},
+                                                                  result, {}));
+  EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(param1, {},
+                                                                  result, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, CopyShares) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape shape = ShapeUtil::MakeShape(F32, {8});
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kExp, param));
+  auto copy = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kCopy, exp));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  EXPECT_TRUE(
+      points_to_analysis_->CanShareOperandBufferWithUser(param, {}, exp, {}));
+  EXPECT_TRUE(
+      points_to_analysis_->CanShareOperandBufferWithUser(exp, {}, copy, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+  auto tuple = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 1));
+
+  // Create a DynamicUpdateSlice instruction of tuple element 1.
+  auto starts = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
+  auto update = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR1<float>({2.f, 2.f, 2.f})));
+  auto dynamic_update_slice =
+      builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+          data_shape, gte1, update, starts));
+  builder.AddInstruction(
+      HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {dynamic_update_slice, starts, update, gte1},
+      HloInstruction::FusionKind::kLoop);
+  RunAnalysis();
+
+  // The fusion instruction can share with tuple element 1.
+  EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(tuple, {0},
+                                                                  fusion, {}));
+  EXPECT_TRUE(points_to_analysis_->CanShareOperandBufferWithUser(tuple, {1},
+                                                                 fusion, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+  Shape update_shape = ShapeUtil::MakeShape(F32, {4});
+  Shape starts_shape = ShapeUtil::MakeShape(S32, {1});
+  auto data = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape, "data"));
+  auto update = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, update_shape, "update"));
+  auto starts = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, starts_shape, "starts"));
+  auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+      data_shape, data, update, starts));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  // The DynamicUpdateSlice instruction can share with the data operand, but not
+  // with update or starts.
+  EXPECT_TRUE(
+      points_to_analysis_->CanShareOperandBufferWithUser(data, {}, dus, {}));
+  EXPECT_FALSE(
+      points_to_analysis_->CanShareOperandBufferWithUser(update, {}, dus, {}));
+  EXPECT_FALSE(
+      points_to_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto a = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
+  auto b = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateDot(data_shape, a, b, dot_dnums));
+
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto add_operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      data_shape, HloOpcode::kAdd, dot, add_operand));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {add, dot}, HloInstruction::FusionKind::kOutput);
+  RunAnalysis();
+
+  // Output fused dot add should be able to share buffer with 'add_operand'.
+  EXPECT_TRUE(points_to_analysis_->CanShareOperandBufferWithUser(
+      add_operand, {}, fusion, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+
+  auto reverse = builder.AddInstruction(
+      HloInstruction::CreateReverse(data_shape, operand, {0, 1}));
+
+  auto two = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {add, two, reverse}, HloInstruction::FusionKind::kOutput);
+  RunAnalysis();
+
+  // Output fused operand->reverse->add cannot alias operand buffer 'operand'.
+  EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(operand, {},
+                                                                  fusion, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+
+  auto make_cond = [this, &data_shape]() {
+    auto builder = HloComputation::Builder(TestName() + ".Cond");
+    auto data = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, data_shape, "data"));
+    builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq, data, data));
+    return builder.Build();
+  };
+
+  auto make_body = [this, &data_shape]() {
+    auto builder = HloComputation::Builder(TestName() + ".Body");
+    auto data = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, data_shape, "data"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, data, data));
+    return builder.Build();
+  };
+
+  module_ = CreateNewModule();
+  HloComputation* cond_computation =
+      module_->AddEmbeddedComputation(make_cond());
+  HloComputation* body_computation =
+      module_->AddEmbeddedComputation(make_body());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto data = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape, "data"));
+  auto whil = builder.AddInstruction(HloInstruction::CreateWhile(
+      data_shape, cond_computation, body_computation, data));
+  computation_ = module_->AddEntryComputation(builder.Build());
+
+  RunAnalysis();
+
+  // The While instruction can share with the data operand.
+  EXPECT_TRUE(
+      points_to_analysis_->CanShareOperandBufferWithUser(data, {}, whil, {}));
+}
+
+// Tests that Call can alias operand buffer if the only use of the operand
+// in the called computation is an elementwise instruction.
+TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) {
+  Shape shape = ShapeUtil::MakeShape(F32, {8});
+  // Build sub-computation with fusion root.
+  auto sub_builder = HloComputation::Builder(TestName() + "_sub");
+  auto sub_param = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "sub_param"));
+  auto one = sub_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto ones = sub_builder.AddInstruction(
+      HloInstruction::CreateBroadcast(shape, one, {1}));
+  auto add = sub_builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones));
+
+  module_ = CreateNewModule();
+  auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build());
+  sub_computation->CreateFusionInstruction({add, ones},
+                                           HloInstruction::FusionKind::kLoop);
+
+  // Build entry-computation with kCall which calls 'sub_computation'.
+  auto builder = HloComputation::Builder(TestName());
+
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto reverse =
+      builder.AddInstruction(HloInstruction::CreateReverse(shape, param, {0}));
+  auto call = builder.AddInstruction(
+      HloInstruction::CreateCall(shape, {reverse}, sub_computation));
+  computation_ = module_->AddEntryComputation(builder.Build());
+
+  RunAnalysis();
+
+  EXPECT_TRUE(points_to_analysis_->CanShareOperandBufferWithUser(reverse, {},
+                                                                 call, {}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc
index 113c2e2bd9f73a2b0c783103d7f2da9534bc97c3..d668855084a884518b338cdf396a9330b9f43a2b 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier.cc
+++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc
@@ -69,6 +69,7 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
       //       Tuple
       //
       HloInstruction* top_tuple = nullptr;
+      HloInstruction* first_gte = nullptr;
       bool can_simplify = true;
       for (int64 operand_number = 0;
            operand_number < instruction->operand_count(); ++operand_number) {
@@ -78,11 +79,17 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
           can_simplify = false;
           break;
         }
-
+        if (first_gte == nullptr) {
+          first_gte = operand;
+        } else if (!first_gte->has_compatible_sharding(operand)) {
+          can_simplify = false;
+          break;
+        }
         if (top_tuple == nullptr) {
           top_tuple = operand->mutable_operand(0);
           if (!ShapeUtil::Compatible(top_tuple->shape(),
-                                     instruction->shape())) {
+                                     instruction->shape()) ||
+              !instruction->has_compatible_sharding(top_tuple)) {
             can_simplify = false;
             break;
           }
@@ -108,15 +115,17 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
       //          |
       //         GTE
       if (instruction->operand(0)->opcode() == HloOpcode::kTuple) {
-        changed = true;
         HloInstruction* element_source =
             instruction->mutable_operand(0)->mutable_operand(
                 instruction->tuple_index());
-        TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source));
-        for (HloInstruction* user : element_source->users()) {
-          if (user->opcode() == HloOpcode::kTuple ||
-              user->opcode() == HloOpcode::kGetTupleElement) {
-            worklist.push(user);
+        if (instruction->has_compatible_sharding(element_source)) {
+          changed = true;
+          TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source));
+          for (HloInstruction* user : element_source->users()) {
+            if (user->opcode() == HloOpcode::kTuple ||
+                user->opcode() == HloOpcode::kGetTupleElement) {
+              worklist.push(user);
+            }
           }
         }
       }
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 0f16a592b68e20f5dbd1e4655ad5720ecce5a7bd..9e62d0acfb98946f1e693fc0310098b4ec99750b 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -55,6 +55,8 @@ HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) {
       return HloOpcode::kCos;
     case UNOP_EXP:
       return HloOpcode::kExp;
+    case UNOP_EXPM1:
+      return HloOpcode::kExpm1;
     case UNOP_FLOOR:
       return HloOpcode::kFloor;
     case UNOP_IMAG:
@@ -63,6 +65,8 @@ HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) {
       return HloOpcode::kIsFinite;
     case UNOP_LOG:
       return HloOpcode::kLog;
+    case UNOP_LOG1P:
+      return HloOpcode::kLog1p;
     case UNOP_NOT:
       return HloOpcode::kNot;
     case UNOP_NEGATE:
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index 5b44c26b7c7b082556d9533cf3b3b1b98e5e4b09..141347a792c23a2c542d7b564ab76c118409865d 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -16,8 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERFACE_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_INTERFACE_H_
 
+#include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
-#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 
@@ -31,99 +32,93 @@ class ServiceInterface {
   virtual ~ServiceInterface() = default;
 
   // TODO(b/31824348): Convert to use StatusOr.
-  virtual tensorflow::Status TransferToClient(
-      const TransferToClientRequest* arg, TransferToClientResponse* result) = 0;
+  virtual Status TransferToClient(const TransferToClientRequest* arg,
+                                  TransferToClientResponse* result) = 0;
 
-  virtual tensorflow::Status TransferToServer(
-      const TransferToServerRequest* arg, TransferToServerResponse* result) = 0;
+  virtual Status TransferToServer(const TransferToServerRequest* arg,
+                                  TransferToServerResponse* result) = 0;
 
-  virtual tensorflow::Status TransferToInfeed(
-      const TransferToInfeedRequest* arg, TransferToInfeedResponse* result) = 0;
+  virtual Status TransferToInfeed(const TransferToInfeedRequest* arg,
+                                  TransferToInfeedResponse* result) = 0;
 
-  virtual tensorflow::Status TransferFromOutfeed(
-      const TransferFromOutfeedRequest* arg,
-      TransferFromOutfeedResponse* result) = 0;
+  virtual Status TransferFromOutfeed(const TransferFromOutfeedRequest* arg,
+                                     TransferFromOutfeedResponse* result) = 0;
 
-  virtual tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
-                                         ResetDeviceResponse* result) = 0;
+  virtual Status ResetDevice(const ResetDeviceRequest* arg,
+                             ResetDeviceResponse* result) = 0;
 
-  virtual tensorflow::Status LoadComputationSnapshot(
+  virtual Status LoadComputationSnapshot(
       const LoadComputationSnapshotRequest* request,
       LoadComputationSnapshotResponse* result) = 0;
 
-  virtual tensorflow::Status Execute(const ExecuteRequest* arg,
-                                     ExecuteResponse* result) = 0;
+  virtual Status Execute(const ExecuteRequest* arg,
+                         ExecuteResponse* result) = 0;
 
-  virtual tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* arg,
-                                          ExecuteResponse* result) = 0;
+  virtual Status ExecuteGraph(const ExecuteGraphRequest* arg,
+                              ExecuteResponse* result) = 0;
 
-  virtual tensorflow::Status ExecuteParallel(
-      const ExecuteParallelRequest* arg, ExecuteParallelResponse* result) = 0;
+  virtual Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                                 ExecuteParallelResponse* result) = 0;
 
-  virtual tensorflow::Status ExecuteGraphParallel(
-      const ExecuteGraphParallelRequest* arg,
-      ExecuteParallelResponse* result) = 0;
+  virtual Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
+                                      ExecuteParallelResponse* result) = 0;
 
-  virtual tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
-                                          ExecuteAsyncResponse* result) = 0;
+  virtual Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                              ExecuteAsyncResponse* result) = 0;
 
-  virtual tensorflow::Status WaitForExecution(
-      const WaitForExecutionRequest* arg, WaitForExecutionResponse* result) = 0;
+  virtual Status WaitForExecution(const WaitForExecutionRequest* arg,
+                                  WaitForExecutionResponse* result) = 0;
 
-  virtual tensorflow::Status DeconstructTuple(
-      const DeconstructTupleRequest* arg, DeconstructTupleResponse* result) = 0;
+  virtual Status DeconstructTuple(const DeconstructTupleRequest* arg,
+                                  DeconstructTupleResponse* result) = 0;
 
-  virtual tensorflow::Status GetComputationStats(
-      const ComputationStatsRequest* arg, ComputationStatsResponse* result) = 0;
+  virtual Status GetComputationStats(const ComputationStatsRequest* arg,
+                                     ComputationStatsResponse* result) = 0;
 
-  virtual tensorflow::Status GetComputationGraphStats(
+  virtual Status GetComputationGraphStats(
       const ComputationGraphStatsRequest* arg,
       ComputationStatsResponse* result) = 0;
 
-  virtual tensorflow::Status GetComputationShape(
-      const GetComputationShapeRequest* arg,
-      GetComputationShapeResponse* result) = 0;
+  virtual Status GetComputationShape(const GetComputationShapeRequest* arg,
+                                     GetComputationShapeResponse* result) = 0;
 
-  virtual tensorflow::Status GetShape(const GetShapeRequest* arg,
-                                      GetShapeResponse* result) = 0;
+  virtual Status GetShape(const GetShapeRequest* arg,
+                          GetShapeResponse* result) = 0;
 
-  virtual tensorflow::Status CreateChannelHandle(
-      const CreateChannelHandleRequest* arg,
-      CreateChannelHandleResponse* result) = 0;
+  virtual Status CreateChannelHandle(const CreateChannelHandleRequest* arg,
+                                     CreateChannelHandleResponse* result) = 0;
 
-  virtual tensorflow::Status GetDeviceHandles(
-      const GetDeviceHandlesRequest* arg, GetDeviceHandlesResponse* result) = 0;
+  virtual Status GetDeviceHandles(const GetDeviceHandlesRequest* arg,
+                                  GetDeviceHandlesResponse* result) = 0;
 
   // Methods used by ComputationBuilder.
-  virtual tensorflow::Status Computation(const ComputationRequest* arg,
-                                         ComputationResponse* result) = 0;
+  virtual Status Computation(const ComputationRequest* arg,
+                             ComputationResponse* result) = 0;
 
-  virtual tensorflow::Status Op(const OpRequest* arg, OpResponse* result) = 0;
+  virtual Status Op(const OpRequest* arg, OpResponse* result) = 0;
 
-  virtual tensorflow::Status GetLocalShape(const GetLocalShapeRequest* arg,
-                                           GetLocalShapeResponse* result) = 0;
+  virtual Status GetLocalShape(const GetLocalShapeRequest* arg,
+                               GetLocalShapeResponse* result) = 0;
 
-  virtual tensorflow::Status SetReturnValue(
-      const SetReturnValueRequest* arg, SetReturnValueResponse* results) = 0;
+  virtual Status SetReturnValue(const SetReturnValueRequest* arg,
+                                SetReturnValueResponse* results) = 0;
 
-  virtual tensorflow::Status IsConstant(const IsConstantRequest* arg,
-                                        IsConstantResponse* result) = 0;
+  virtual Status IsConstant(const IsConstantRequest* arg,
+                            IsConstantResponse* result) = 0;
 
-  virtual tensorflow::Status ComputeConstant(
-      const ComputeConstantRequest* arg, ComputeConstantResponse* result) = 0;
+  virtual Status ComputeConstant(const ComputeConstantRequest* arg,
+                                 ComputeConstantResponse* result) = 0;
 
-  virtual tensorflow::Status ComputeConstantGraph(
-      const ComputeConstantGraphRequest* arg,
-      ComputeConstantResponse* result) = 0;
+  virtual Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
+                                      ComputeConstantResponse* result) = 0;
 
   // Methods used by Computation.
-  virtual tensorflow::Status SnapshotComputation(
-      const SnapshotComputationRequest* ag,
-      SnapshotComputationResponse* result) = 0;
+  virtual Status SnapshotComputation(const SnapshotComputationRequest* ag,
+                                     SnapshotComputationResponse* result) = 0;
 
   // Methods used by GlobalData.
-  virtual tensorflow::Status Unregister(const UnregisterRequest* arg,
-                                        UnregisterResponse* result) = 0;
+  virtual Status Unregister(const UnregisterRequest* arg,
+                            UnregisterResponse* result) = 0;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_layout.cc b/tensorflow/compiler/xla/shape_layout.cc
index 789eba5780d37e1fd4d80ec881855951c8bba0eb..7ee366b27a82bdbcb7a63a57ea80194db8ca7df4 100644
--- a/tensorflow/compiler/xla/shape_layout.cc
+++ b/tensorflow/compiler/xla/shape_layout.cc
@@ -22,24 +22,24 @@ limitations under the License.
 
 namespace xla {
 
-tensorflow::Status ShapeLayout::CopyLayoutFromShape(const Shape& other_shape) {
+Status ShapeLayout::CopyLayoutFromShape(const Shape& other_shape) {
   if (!ShapeUtil::Compatible(other_shape, shape_)) {
     return InvalidArgument("Shape %s is not compatible with shape %s",
                            ShapeUtil::HumanString(other_shape).c_str(),
                            ShapeUtil::HumanString(shape()).c_str());
   }
   shape_ = other_shape;
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ShapeLayout::AssignLayoutToShape(Shape* to_shape) const {
+Status ShapeLayout::AssignLayoutToShape(Shape* to_shape) const {
   if (!ShapeUtil::Compatible(*to_shape, shape_)) {
     return InvalidArgument("Shape %s is not compatible with shape %s",
                            ShapeUtil::HumanString(*to_shape).c_str(),
                            ShapeUtil::HumanString(shape()).c_str());
   }
   *to_shape = shape_;
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 void ShapeLayout::SetToDefaultLayout() {
diff --git a/tensorflow/compiler/xla/shape_layout.h b/tensorflow/compiler/xla/shape_layout.h
index a1dce758cd3ab3f204ce330fca2a7d2bdf57a2be..36806da599cc9b27286e67c128bb7f496f29c105 100644
--- a/tensorflow/compiler/xla/shape_layout.h
+++ b/tensorflow/compiler/xla/shape_layout.h
@@ -40,7 +40,7 @@ class ShapeLayout {
   // Assigns the layouts in this ShapeLayout to the Layout fields of the given
   // shape. 'to_shape' and the shape of the ShapeLayout object must be
   // compatible.
-  tensorflow::Status AssignLayoutToShape(Shape* to_shape) const;
+  Status AssignLayoutToShape(Shape* to_shape) const;
 
   // Returns true if the Layouts in this ShapeLayout match the layouts in the
   // given shape. Returns false otherwise. If the given shape is not compatible
@@ -49,7 +49,7 @@ class ShapeLayout {
 
   // Copies the layout from the given shape into this ShapeLayout. 'other_shape'
   // must be compatible with the ShapeLayout's shape.
-  tensorflow::Status CopyLayoutFromShape(const Shape& other_shape);
+  Status CopyLayoutFromShape(const Shape& other_shape);
 
   // Clears (Layout::Clear) all the Layouts stored in this object.
   void Clear();
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index ffaa40c2d673a2365342371ed8dab59565d1d08f..37c94ac543b166c14affd8165d244440ae6b67d6 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -42,36 +42,20 @@ namespace internal {
 template <typename T>
 struct ShapeTreeNode {
   // Data corresponding to this node.
-  T data;
+  std::pair<ShapeIndex, T> data;
 
-  // Children of this node.
-  std::vector<std::unique_ptr<ShapeTreeNode>> children;
+  // Children of this node, as indices into the container's nodes_ array.
+  std::vector<size_t> children;
 
-  ShapeTreeNode() = default;
-  explicit ShapeTreeNode(const T& data) : data(data) {}
-
-  ShapeTreeNode(const ShapeTreeNode& other)
-      : data(other.data), children(other.children.size()) {
-    for (size_t i = 0; i < children.size(); ++i) {
-      children[i] = ::xla::MakeUnique<ShapeTreeNode>(*other.children[i]);
-    }
-  }
-
-  ShapeTreeNode& operator=(const ShapeTreeNode& other) {
-    if (this != &other) {
-      data = other.data;
-      children.resize(other.children.size());
-      for (size_t i = 0; i < children.size(); ++i) {
-        children[i] = ::xla::MakeUnique<ShapeTreeNode>(*other.children[i]);
-      }
-    }
-    return *this;
-  }
+  explicit ShapeTreeNode(ShapeIndex index)
+      : ShapeTreeNode(std::move(index), T()) {}
+  ShapeTreeNode(ShapeIndex index, T data)
+      : data(std::move(index), std::move(data)) {}
 };
 
 }  // namespace internal
 
-template <typename T, bool is_const>
+template <typename ContainerType, typename IteratorType, typename ValueType>
 class ShapeTreeIterator;
 
 // A ShapeTree<T> is a recursive data structure which mirrors the structure of a
@@ -95,10 +79,9 @@ class ShapeTreeIterator;
 // before its ShapeTree goes away.
 template <typename T>
 class ShapeTree {
-  friend class ShapeTreeIterator<T, /*is_const=*/true>;
-  friend class ShapeTreeIterator<T, /*is_const=*/false>;
-
  public:
+  using Node = internal::ShapeTreeNode<T>;
+
   // Default constructor creates a tree with a nil shape (i.e. an empty tuple).
   ShapeTree() : ShapeTree(ShapeUtil::MakeNil()) {}
 
@@ -110,30 +93,12 @@ class ShapeTree {
   // alive longer than this ShapeTree.
   explicit ShapeTree(Shape shape);
   explicit ShapeTree(const Shape* shape);
+  explicit ShapeTree(const std::shared_ptr<Shape>& shape);
 
   // Create ShapeTree with the given shape, and init_value for all nodes.
   ShapeTree(Shape shape, const T& init_value);
   ShapeTree(const Shape* shape, const T& init_value);
-
-  ShapeTree(const ShapeTree& other) { *this = other; }
-  ShapeTree(ShapeTree&&) = default;
-
-  ShapeTree& operator=(const ShapeTree& other) {
-    root_ = other.root_;
-
-    // Fix up internal pointer if necessary.
-    if (other.shape_storage_) {
-      CHECK_EQ(other.shape_, other.shape_storage_.get());
-      shape_storage_.reset(new Shape(*other.shape_));
-      shape_ = shape_storage_.get();
-    } else {
-      shape_ = other.shape_;
-    }
-
-    return *this;
-  }
-
-  ShapeTree& operator=(ShapeTree&& other) = default;
+  ShapeTree(const std::shared_ptr<Shape>& shape, const T& init_value);
 
   // Returns the data element associated with the array in the shape at the
   // given index (see ShapeUtil::GetSubshape for how indexes are defined).
@@ -161,63 +126,70 @@ class ShapeTree {
     return Lookup(index)->children.empty();
   }
 
-  // iterator implements a forward_iterator with value_type =
-  // std::pair<ShapeIndex, T&>
-  using iterator = ShapeTreeIterator<T, /*is_const=*/false>;
-  using const_iterator = ShapeTreeIterator<T, /*is_const=*/true>;
+  ShapeTree(const ShapeTree&) = default;
+  ShapeTree& operator=(const ShapeTree&) = default;
+  ShapeTree(ShapeTree&&) = default;
+  ShapeTree& operator=(ShapeTree&& other) = default;
+
+  // iterator implements a bidirectional_iterator with
+  //  value_type = std::pair<ShapeIndex, T>.
+  //
+  // The iteration order is guaranteed to be a pre-order walk of the ShapeTree.
+  using iterator =
+      ShapeTreeIterator<std::vector<Node>, typename std::vector<Node>::iterator,
+                        std::pair<ShapeIndex, T>>;
+  using const_iterator =
+      ShapeTreeIterator<const std::vector<Node>,
+                        typename std::vector<Node>::const_iterator,
+                        const std::pair<ShapeIndex, T>>;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
 
   // begin/end for iterating over all nodes.
   iterator begin() {
-    return iterator(&root_, /*iterate_leaves_only=*/false,
-                    /*reverse=*/false);
+    return iterator(&nodes_, nodes_.begin(),
+                    /*iterate_leaves_only=*/false);
   }
   iterator end() {
-    return iterator(nullptr, /*iterate_leaves_only=*/false,
-                    /*reverse=*/false);
+    return iterator(&nodes_, nodes_.end(),
+                    /*iterate_leaves_only=*/false);
   }
   const_iterator begin() const {
-    return const_iterator(&root_, /*iterate_leaves_only=*/false,
-                          /*reverse=*/false);
+    return const_iterator(&nodes_, nodes_.begin(),
+                          /*iterate_leaves_only=*/false);
   }
   const_iterator end() const {
-    return const_iterator(nullptr, /*iterate_leaves_only=*/false,
-                          /*reverse=*/false);
+    return const_iterator(&nodes_, nodes_.end(),
+                          /*iterate_leaves_only=*/false);
   }
 
   // rbegin/rend for iterating over all nodes in reverse.
-  iterator rbegin() {
-    return iterator(&root_, /*iterate_leaves_only=*/false,
-                    /*reverse=*/true);
-  }
-  iterator rend() {
-    return iterator(nullptr, /*iterate_leaves_only=*/false,
-                    /*reverse=*/true);
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
   }
-  const_iterator rbegin() const {
-    return const_iterator(&root_, /*iterate_leaves_only=*/false,
-                          /*reverse=*/true);
-  }
-  const_iterator rend() const {
-    return const_iterator(nullptr, /*iterate_leaves_only=*/false,
-                          /*reverse=*/true);
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
   }
 
   // leaf_begin()/leaf_end() iterates over all leaf nodes (nodes with no
   // children).
   iterator leaf_begin() {
-    return iterator(&root_, /*iterate_leaves_only=*/true, /*reverse=*/false);
+    return iterator(&nodes_, nodes_.begin(),
+                    /*iterate_leaves_only=*/true);
   }
   iterator leaf_end() {
-    return iterator(nullptr, /*iterate_leaves_only=*/true,
-                    /*reverse=*/false);
+    return iterator(&nodes_, nodes_.end(),
+                    /*iterate_leaves_only=*/true);
   }
   const_iterator leaf_begin() const {
-    return const_iterator(&root_, /*iterate_leaves_only=*/true,
-                          /*reverse=*/false);
+    return const_iterator(&nodes_, nodes_.begin(),
+                          /*iterate_leaves_only=*/true);
   }
   const_iterator leaf_end() const {
-    return const_iterator(nullptr, /*iterate_leaves_only=*/true,
-                          /*reverse=*/false);
+    return const_iterator(&nodes_, nodes_.end(),
+                          /*iterate_leaves_only=*/true);
   }
   // range-based iterator for leaf_begin()/leaf_end().
   tensorflow::gtl::iterator_range<iterator> leaves() {
@@ -227,20 +199,27 @@ class ShapeTree {
     return tensorflow::gtl::make_range(leaf_begin(), leaf_end());
   }
 
-  iterator leaf_rbegin() {
-    return iterator(&root_, /*iterate_leaves_only=*/true, /*reverse=*/true);
+  reverse_iterator leaf_rbegin() { return reverse_iterator(leaf_end()); }
+  reverse_iterator leaf_rend() { return reverse_iterator(leaf_begin()); }
+  const_reverse_iterator leaf_rbegin() const {
+    return const_reverse_iterator(leaf_end());
   }
-  iterator leaf_rend() {
-    return iterator(nullptr, /*iterate_leaves_only=*/true,
-                    /*reverse=*/true);
+  const_reverse_iterator leaf_rend() const {
+    return const_reverse_iterator(leaf_begin());
   }
-  const_iterator leaf_rbegin() const {
-    return const_iterator(&root_, /*iterate_leaves_only=*/true,
-                          /*reverse=*/true);
+
+  // Returns an iterator pointing to the given ShapeIndex.
+  // REQUIRES: index must exist in the ShapeTree.
+  iterator find(const ShapeIndex& index) {
+    Node* element = Lookup(index);
+    return iterator(&nodes_, typename std::vector<Node>::iterator(element),
+                    /*iterate_leaves_only=*/false);
   }
-  const_iterator leaf_rend() const {
-    return const_iterator(nullptr, /*iterate_leaves_only=*/true,
-                          /*reverse=*/true);
+  const_iterator find(const ShapeIndex& index) const {
+    Node* element = Lookup(index);
+    return iterator(&nodes_,
+                    typename std::vector<Node>::const_iterator(element),
+                    /*iterate_leaves_only=*/false);
   }
 
   // Recursively traverses the shape and calls the given function at each
@@ -282,8 +261,6 @@ class ShapeTree {
   bool operator!=(const ShapeTree<T>& other) const { return !(*this == other); }
 
  private:
-  using Node = internal::ShapeTreeNode<T>;
-
   // Initialize node->children based on 'shape'. All children are assigned the
   // the given 'init_value'.
   void InitChildren(const Shape& shape, const T& init_value, Node* node);
@@ -292,136 +269,57 @@ class ShapeTree {
   // default-constructed data values.
   void InitChildren(const Shape& shape, Node* node);
 
+  // Returns the number of subshapes, including interior nodes, in shape.
+  int64 CountSubshapes(const Shape& shape);
+
   // Helpers for traversing the shape via ForEachElement. The helpers
   // recursively traverse the subtree rooted at "index" (defined as in
   // ShapeUtil::GetSubshape).
   template <typename Fn>
-  static Status ForEachHelper(const Fn& func, const Node& node,
-                              ShapeIndex* index);
+  static Status ForEachHelper(const Fn& func, const std::vector<Node>& nodes);
   template <typename Fn>
-  static Status ForEachMutableHelper(const Fn& func, Node* node,
-                                     ShapeIndex* index);
+  static Status ForEachMutableHelper(const Fn& func, std::vector<Node>* nodes);
 
   // Return the tree node at the given index.
   Node* Lookup(const ShapeIndex& index);
   const Node* Lookup(const ShapeIndex& index) const;
 
-  // The root node, which contains all other nodes.
-  Node root_;
+  // The nodes in this shape tree.
+  std::vector<Node> nodes_;
 
   // If we own our Shape, this field contains it, and shape_ is a pointer into
   // here.  Otherwise if we don't own our shape, this is nullptr.
-  std::unique_ptr<Shape> shape_storage_;
+  std::shared_ptr<Shape> shape_storage_;
 
   // The XLA shape mirrored in this ShapeTree.  This is either
   // shape_storage_.get() or the Shape pointer passed to our constructor.
   const Shape* shape_;
 };
 
-// Internal iterator that performs a pre-order walk. This is copyable, but
-// contains a vector so isn't cheap to copy. This also means post-increment is
-// expensive. The iterator value_type is equivalent to a std::pair<ShapeIndex,
-// T&>, similar to std::map. The non-const iterator's T& type can be mutated
-// in-place.
-template <typename T, bool is_const>
-class ShapeTreeIterator : public std::iterator<std::forward_iterator_tag,
-                                               std::pair<ShapeIndex, T&>> {
+// Internal iterator that performs a pre-order walk. This is cheap to copy.
+// The iterator value_type is equivalent to a
+// std::pair<ShapeIndex,T>&, similar to std::map.
+template <typename ContainerType, typename IteratorType, typename ValueType>
+class ShapeTreeIterator
+    : public std::iterator<std::bidirectional_iterator_tag, ValueType> {
  public:
-  using value_type =
-      typename std::conditional<is_const, std::pair<ShapeIndex, const T&>,
-                                std::pair<ShapeIndex, T&>>::type;
-  using NodeType =
-      typename std::conditional<is_const, const typename ShapeTree<T>::Node,
-                                typename ShapeTree<T>::Node>::type;
-
-  // Construct an iterator pointing at node. Node must either be the tree root
-  // or nullptr (which is equivalent to end() and should not be dereferenced or
-  // incremented). If iterate_leaves_only is true, the iterator will not include
-  // interior tree nodes, only leaves. If reverse is true, the iterator will
-  // visit nodes in the reverse of pre-order traversal.
-  ShapeTreeIterator(NodeType* node, bool iterate_leaves_only, bool reverse)
-      : node_(node),
-        iterate_leaves_only_(iterate_leaves_only),
-        reverse_(reverse) {
-    if (node_) {
-      if (reverse_) {
-        while (!node_->children.empty()) {
-          const int child_index = node_->children.size() - 1;
-          stack_.push_back({node_, child_index});
-          node_ = node_->children[child_index].get();
-        }
-      } else {
-        if (!node_->children.empty() && iterate_leaves_only) {
-          ++*this;
-        }
-      }
+  ShapeTreeIterator(ContainerType* nodes, IteratorType node,
+                    bool iterate_leaves_only)
+      : nodes_(nodes),
+        node_(std::move(node)),
+        iterate_leaves_only_(iterate_leaves_only) {
+    while (iterate_leaves_only && node_ != nodes_->end() &&
+           !node_->children.empty()) {
+      ++node_;
     }
   }
-  ShapeTreeIterator(const ShapeTreeIterator& other)
-      : node_(other.node_),
-        stack_(other.stack_),
-        iterate_leaves_only_(other.iterate_leaves_only_),
-        reverse_(other.reverse_) {}
 
   ShapeTreeIterator& operator++() {
-    CHECK_NE(nullptr, node_) << "walking off the end() of an iterator!";
-    if (reverse_) {
-      while (!stack_.empty()) {
-        node_ = stack_.back().first;
-        int64 next_child_index = stack_.back().second - 1;
-        stack_.pop_back();
-        if (next_child_index < 0) {
-          if (!iterate_leaves_only_) {
-            // All children are visited, yield <node_>.
-            return *this;
-          }
-        } else {
-          stack_.push_back({node_, next_child_index});
-          node_ = node_->children[next_child_index].get();
-          while (!node_->children.empty()) {
-            const int child_index = node_->children.size() - 1;
-            stack_.push_back({node_, child_index});
-            node_ = node_->children[child_index].get();
-          }
-          return *this;
-        }
-      }
-    } else {
-      // We're doing a pre-order walk, so if our current node has children take
-      // the first child.
-      if (!node_->children.empty()) {
-        stack_.push_back({node_, /*child-index=*/0});
-        node_ = node_->children[0].get();
-        if (node_->children.empty() || !iterate_leaves_only_) {
-          return *this;
-        } else {
-          // This is a non-leaf; tail-recurse.
-          return ++(*this);
-        }
-      }
-      // Otherwise we are currently at a leaf. Walk back up until a node
-      // contains a child we haven't visited yet.
-      while (!stack_.empty()) {
-        node_ = stack_.back().first;
-        int64 next_child_index = stack_.back().second + 1;
-        stack_.pop_back();
-        if (node_->children.size() > next_child_index) {
-          stack_.push_back({node_, next_child_index});
-          node_ = node_->children[next_child_index].get();
-
-          if (node_->children.empty() || !iterate_leaves_only_) {
-            return *this;
-          } else {
-            // This is a non-leaf; tail-recurse.
-            return ++(*this);
-          }
-        }
-      }
+    ++node_;
+    while (iterate_leaves_only_ && node_ != nodes_->end() &&
+           !node_->children.empty()) {
+      ++node_;
     }
-    // We've walked off the end of the tree. Set node_ to nullptr to signify
-    // end().
-    node_ = nullptr;
-    current_.reset();
     return *this;
   }
   ShapeTreeIterator operator++(int) {
@@ -429,52 +327,62 @@ class ShapeTreeIterator : public std::iterator<std::forward_iterator_tag,
     ++(*this);
     return i;
   }
+
+  ShapeTreeIterator& operator--() {
+    --node_;
+    while (iterate_leaves_only_ && node_ > nodes_->begin() &&
+           !node_->children.empty()) {
+      --node_;
+    }
+    return *this;
+  }
+  ShapeTreeIterator operator--(int) {
+    auto i = *this;
+    --(*this);
+    return i;
+  }
+
   bool operator==(const ShapeTreeIterator& other) const {
     return node_ == other.node_;
   }
   bool operator!=(const ShapeTreeIterator& other) const {
     return node_ != other.node_;
   }
-  value_type& operator*() { return UpdateCurrent(); }
-  value_type* operator->() { return &UpdateCurrent(); }
+  ValueType& operator*() { return node_->data; }
+  ValueType* operator->() { return &node_->data; }
 
  private:
-  // Updates the current_ member to reflect the current state.
-  value_type& UpdateCurrent() {
-    ShapeIndex index;
-    for (auto& node_and_index : stack_) {
-      index.push_back(node_and_index.second);
-    }
-    current_ = ::xla::MakeUnique<value_type>(index, node_->data);
-    return *current_;
-  }
-
-  // The node to which this iterator is pointing. This is the source of truth in
-  // the iterator - the stack only exists to facilitate walking back from
-  // children to parents.
-  NodeType* node_;
-  // Stack of {node, child-index} pairs of the path taken from the root to get
-  // to node_. This allows us to backtrack and know where to go next.
-  std::vector<std::pair<NodeType*, int64>> stack_;
+  ContainerType* nodes_;
+  IteratorType node_;
   // True if we should not include interior nodes in our walk.
   bool iterate_leaves_only_;
-  // True if we should yield the reverse of the pre-order traversal.
-  bool reverse_;
-  // Placeholder for the current value. Ideally this wouldn't exist and would
-  // just be an rvalue, but operator -> needs to return a pointer to something.
-  // We cannot just use a plain old value_type as it contains a reference so
-  // cannot be default-constructed.
-  std::unique_ptr<value_type> current_;
 };
 
+template <typename T>
+int64 ShapeTree<T>::CountSubshapes(const Shape& shape) {
+  int64 current_count = 1;
+  if (ShapeUtil::IsTuple(shape)) {
+    int64 count = ShapeUtil::TupleElementCount(shape);
+    for (int i = 0; i < count; ++i) {
+      current_count += CountSubshapes(shape.tuple_shapes(i));
+    }
+  }
+  return current_count;
+}
+
 template <typename T>
 void ShapeTree<T>::InitChildren(const Shape& shape, const T& init_value,
                                 Node* node) {
   if (ShapeUtil::IsTuple(shape)) {
-    for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
-      node->children.emplace_back(new Node(init_value));
-      InitChildren(shape.tuple_shapes(i), init_value,
-                   node->children.back().get());
+    const int64 size = ShapeUtil::TupleElementCount(shape);
+    node->children.reserve(size);
+    ShapeIndex shape_index = node->data.first;
+    shape_index.push_back(0);
+    for (int i = 0; i < size; ++i) {
+      shape_index[shape_index.size() - 1] = i;
+      node->children.push_back(nodes_.size());
+      nodes_.emplace_back(shape_index, init_value);
+      InitChildren(shape.tuple_shapes(i), init_value, &nodes_.back());
     }
   }
 }
@@ -482,63 +390,92 @@ void ShapeTree<T>::InitChildren(const Shape& shape, const T& init_value,
 template <typename T>
 void ShapeTree<T>::InitChildren(const Shape& shape, Node* node) {
   if (ShapeUtil::IsTuple(shape)) {
-    for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
-      node->children.emplace_back(new Node());
-      InitChildren(shape.tuple_shapes(i), node->children.back().get());
+    const int64 size = ShapeUtil::TupleElementCount(shape);
+    node->children.reserve(size);
+    ShapeIndex shape_index = node->data.first;
+    shape_index.push_back(0);
+    for (int i = 0; i < size; ++i) {
+      shape_index[shape_index.size() - 1] = i;
+      node->children.push_back(nodes_.size());
+      nodes_.emplace_back(shape_index);
+      InitChildren(shape.tuple_shapes(i), &nodes_.back());
     }
   }
 }
 
 template <typename T>
 ShapeTree<T>::ShapeTree(Shape shape)
-    : root_(),
-      shape_storage_(::xla::MakeUnique<Shape>(std::move(shape))),
+    : shape_storage_(std::make_shared<Shape>(std::move(shape))),
       shape_(shape_storage_.get()) {
   // The shape_ field is just used to hold the structure of the shape.
   // It should not be relied upon to store layout information.
   LayoutUtil::ClearLayout(shape_storage_.get());
-  InitChildren(*shape_, &root_);
+  nodes_.reserve(CountSubshapes(*shape_));
+  nodes_.emplace_back(ShapeIndex{});
+  InitChildren(*shape_, &nodes_[0]);
+}
+
+template <typename T>
+ShapeTree<T>::ShapeTree(const Shape* shape) : shape_(shape) {
+  nodes_.reserve(CountSubshapes(*shape_));
+  nodes_.emplace_back(ShapeIndex{});
+  InitChildren(*shape_, &nodes_[0]);
 }
 
 template <typename T>
-ShapeTree<T>::ShapeTree(const Shape* shape) : root_(), shape_(shape) {
-  InitChildren(*shape_, &root_);
+ShapeTree<T>::ShapeTree(const std::shared_ptr<Shape>& shape)
+    : shape_storage_(shape), shape_(shape_storage_.get()) {
+  nodes_.reserve(CountSubshapes(*shape_));
+  nodes_.emplace_back(ShapeIndex{});
+  InitChildren(*shape_, &nodes_[0]);
 }
 
 template <typename T>
 ShapeTree<T>::ShapeTree(Shape shape, const T& init_value)
-    : root_(init_value),
-      shape_storage_(::xla::MakeUnique<Shape>(std::move(shape))),
+    : shape_storage_(std::make_shared<Shape>(std::move(shape))),
       shape_(shape_storage_.get()) {
   // The shape_ field is just used to hold the structure of the shape.
   // It should not be relied upon to store layout information.
   LayoutUtil::ClearLayout(shape_storage_.get());
-  InitChildren(*shape_, init_value, &root_);
+  nodes_.reserve(CountSubshapes(*shape_));
+  nodes_.emplace_back(ShapeIndex{}, init_value);
+  InitChildren(*shape_, init_value, &nodes_[0]);
 }
 
 template <typename T>
 ShapeTree<T>::ShapeTree(const Shape* shape, const T& init_value)
-    : root_(init_value), shape_(shape) {
-  InitChildren(*shape_, init_value, &root_);
+    : shape_(shape) {
+  nodes_.reserve(CountSubshapes(*shape_));
+  nodes_.emplace_back(ShapeIndex{}, init_value);
+  InitChildren(*shape_, init_value, &nodes_[0]);
+}
+
+template <typename T>
+ShapeTree<T>::ShapeTree(const std::shared_ptr<Shape>& shape,
+                        const T& init_value)
+    : shape_storage_(shape), shape_(shape_storage_.get()) {
+  nodes_.reserve(CountSubshapes(*shape_));
+  nodes_.emplace_back(ShapeIndex{}, init_value);
+  InitChildren(*shape_, init_value, &nodes_[0]);
 }
 
 template <typename T>
 const T& ShapeTree<T>::element(const ShapeIndex& index) const {
-  return Lookup(index)->data;
+  return Lookup(index)->data.second;
 }
 
 template <typename T>
 T* ShapeTree<T>::mutable_element(const ShapeIndex& index) {
-  return &Lookup(index)->data;
+  return &Lookup(index)->data.second;
 }
 
 template <typename T>
 internal::ShapeTreeNode<T>* ShapeTree<T>::Lookup(const ShapeIndex& index) {
-  Node* node = &root_;
+  Node* node = &nodes_[0];
   for (const int64 i : index) {
     CHECK_GE(i, 0);
     CHECK_LT(i, node->children.size());
-    node = node->children[i].get();
+    node = &nodes_[node->children[i]];
   }
   return node;
 }
@@ -552,13 +489,10 @@ const internal::ShapeTreeNode<T>* ShapeTree<T>::Lookup(
 /* static */
 template <typename T>
 template <typename Fn>
-Status ShapeTree<T>::ForEachHelper(const Fn& func, const Node& node,
-                                   ShapeIndex* index) {
-  TF_RETURN_IF_ERROR(func(*index, node.data));
-  for (int64 i = 0; i < node.children.size(); ++i) {
-    index->push_back(i);
-    TF_RETURN_IF_ERROR(ForEachHelper(func, *node.children[i], index));
-    index->pop_back();
+Status ShapeTree<T>::ForEachHelper(const Fn& func,
+                                   const std::vector<Node>& nodes) {
+  for (const auto& node : nodes) {
+    TF_RETURN_IF_ERROR(func(node.data.first, node.data.second));
   }
   return Status::OK();
 }
@@ -566,14 +500,10 @@ Status ShapeTree<T>::ForEachHelper(const Fn& func, const Node& node,
 /* static */
 template <typename T>
 template <typename Fn>
-Status ShapeTree<T>::ForEachMutableHelper(const Fn& func, Node* node,
-                                          ShapeIndex* index) {
-  TF_RETURN_IF_ERROR(func(*index, &node->data));
-  for (int64 i = 0; i < node->children.size(); ++i) {
-    index->push_back(i);
-    TF_RETURN_IF_ERROR(
-        ForEachMutableHelper(func, node->children[i].get(), index));
-    index->pop_back();
+Status ShapeTree<T>::ForEachMutableHelper(const Fn& func,
+                                          std::vector<Node>* nodes) {
+  for (auto& node : *nodes) {
+    TF_RETURN_IF_ERROR(func(node.data.first, &node.data.second));
   }
   return Status::OK();
 }
@@ -581,40 +511,36 @@ Status ShapeTree<T>::ForEachMutableHelper(const Fn& func, Node* node,
 template <typename T>
 template <typename Fn>
 Status ShapeTree<T>::ForEachElementWithStatus(const Fn& func) const {
-  ShapeIndex index;
-  return ForEachHelper(func, root_, &index);
+  return ForEachHelper(func, nodes_);
 }
 
 template <typename T>
 template <typename Fn>
 Status ShapeTree<T>::ForEachMutableElementWithStatus(const Fn& func) {
-  ShapeIndex index;
-  return ForEachMutableHelper(func, &root_, &index);
+  return ForEachMutableHelper(func, &nodes_);
 }
 
 template <typename T>
 template <typename Fn>
 void ShapeTree<T>::ForEachElement(const Fn& func) const {
-  ShapeIndex index;
   return ForEachHelper(
              [&func](const ShapeIndex& index, const T& data) {
                func(index, data);
                return Status::OK();
              },
-             root_, &index)
+             nodes_)
       .IgnoreError();
 }
 
 template <typename T>
 template <typename Fn>
 void ShapeTree<T>::ForEachMutableElement(const Fn& func) {
-  ShapeIndex index;
   return ForEachMutableHelper(
              [&func](const ShapeIndex& index, T* data) {
                func(index, data);
                return Status::OK();
              },
-             &root_, &index)
+             &nodes_)
       .IgnoreError();
 }
 
diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc
index 4b6ab772811f4a6c6ffc1d10befc7122f883b8f9..dc5facf1581c07fbb74dfcee95025692938632bd 100644
--- a/tensorflow/compiler/xla/shape_tree_test.cc
+++ b/tensorflow/compiler/xla/shape_tree_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace xla {
 namespace {
@@ -421,8 +422,8 @@ TEST_F(ShapeTreeTest, IterateAndMutate) {
     }
     ++i;
   }
-  t.begin()->second = 78;
-  EXPECT_EQ(78, t.begin()->second);
+  (*t.begin()).second = 78;
+  EXPECT_EQ(78, (*t.begin()).second);
   i = 0;
   for (auto& index_to_data : t) {
     if (i == 0) {
@@ -434,14 +435,14 @@ TEST_F(ShapeTreeTest, IterateAndMutate) {
     }
     ++i;
   }
-  EXPECT_EQ(78, t.begin()->second);
-  EXPECT_EQ(98, std::next(t.begin())->second);
+  EXPECT_EQ(78, (*t.begin()).second);
+  EXPECT_EQ(98, (*std::next(t.begin())).second);
 }
 
 TEST_F(ShapeTreeTest, IterateOrder) {
   ShapeTree<int> t(nested_tuple_shape_, 42);
   std::vector<ShapeIndex> v;
-  for (auto& index_to_data : t) {
+  for (auto index_to_data : t) {
     v.push_back(index_to_data.first);
   }
   EXPECT_EQ(v, (std::vector<ShapeIndex>{{},
@@ -479,7 +480,7 @@ TEST_F(ShapeTreeTest, ReverseIterateOrder) {
 TEST_F(ShapeTreeTest, IterateOrderLeaves) {
   ShapeTree<int> t(nested_tuple_shape_, 42);
   std::vector<ShapeIndex> v;
-  for (auto& index_to_data : t.leaves()) {
+  for (auto index_to_data : t.leaves()) {
     v.push_back(index_to_data.first);
   }
   EXPECT_EQ(v, (std::vector<ShapeIndex>{
@@ -502,5 +503,106 @@ TEST_F(ShapeTreeTest, ReverseIterateOrderLeaves) {
                }));
 }
 
+void BM_Construct(int iters, int depth, int fan_out) {
+  tensorflow::testing::StopTiming();
+  Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
+  for (int i = 0; i < depth; ++i) {
+    std::vector<xla::Shape> shapes(fan_out, shape);
+    shape = ShapeUtil::MakeTupleShape(shapes);
+  }
+  tensorflow::testing::StartTiming();
+
+  for (int i = 0; i < iters; ++i) {
+    ShapeTree<int> shape_tree(shape);
+  }
+}
+
+void BM_ConstructUnowned(int iters, int depth, int fan_out) {
+  tensorflow::testing::StopTiming();
+  Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
+  for (int i = 0; i < depth; ++i) {
+    std::vector<xla::Shape> shapes(fan_out, shape);
+    shape = ShapeUtil::MakeTupleShape(shapes);
+  }
+  tensorflow::testing::StartTiming();
+
+  for (int i = 0; i < iters; ++i) {
+    ShapeTree<int> shape_tree(&shape);
+  }
+}
+
+void BM_Copy(int iters, int depth, int fan_out) {
+  tensorflow::testing::StopTiming();
+  Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
+  for (int i = 0; i < depth; ++i) {
+    std::vector<xla::Shape> shapes(fan_out, shape);
+    shape = ShapeUtil::MakeTupleShape(shapes);
+  }
+  tensorflow::testing::StartTiming();
+
+  ShapeTree<int> shape_tree(shape);
+  for (int i = 0; i < iters; ++i) {
+    ShapeTree<int> copy = shape_tree;
+    tensorflow::testing::DoNotOptimize(copy);
+  }
+}
+
+void BM_Move(int iters, int depth, int fan_out) {
+  tensorflow::testing::StopTiming();
+  Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
+  for (int i = 0; i < depth; ++i) {
+    std::vector<xla::Shape> shapes(fan_out, shape);
+    shape = ShapeUtil::MakeTupleShape(shapes);
+  }
+  tensorflow::testing::StartTiming();
+
+  ShapeTree<int> shape_tree(shape);
+  for (int i = 0; i < iters; ++i) {
+    ShapeTree<int> copy = std::move(shape_tree);
+    shape_tree = std::move(copy);
+  }
+}
+
+void BM_ForEach(int iters, int depth, int fan_out) {
+  tensorflow::testing::StopTiming();
+  Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
+  for (int i = 0; i < depth; ++i) {
+    std::vector<xla::Shape> shapes(fan_out, shape);
+    shape = ShapeUtil::MakeTupleShape(shapes);
+  }
+  tensorflow::testing::StartTiming();
+
+  ShapeTree<int> shape_tree(shape);
+  for (int i = 0; i < iters; ++i) {
+    shape_tree.ForEachMutableElement([](const ShapeIndex& index, int* data) {
+      tensorflow::testing::DoNotOptimize(index);
+    });
+  }
+}
+
+void BM_Iterate(int iters, int depth, int fan_out) {
+  tensorflow::testing::StopTiming();
+  Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
+  for (int i = 0; i < depth; ++i) {
+    std::vector<xla::Shape> shapes(fan_out, shape);
+    shape = ShapeUtil::MakeTupleShape(shapes);
+  }
+  tensorflow::testing::StartTiming();
+
+  ShapeTree<int> shape_tree(shape);
+  for (int i = 0; i < iters; ++i) {
+    for (auto& iter : shape_tree) {
+      tensorflow::testing::DoNotOptimize(iter.second);
+    }
+  }
+}
+
+BENCHMARK(BM_Construct)->ArgPair(2, 8);
+BENCHMARK(BM_ConstructUnowned)->ArgPair(2, 8);
+BENCHMARK(BM_Copy)->ArgPair(2, 8);
+BENCHMARK(BM_Move)->ArgPair(2, 8);
+BENCHMARK(BM_ForEach)->ArgPair(2, 8);
+BENCHMARK(BM_Iterate)->ArgPair(2, 8);
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index cb8bf5a2b9e5d06f73e2116ed08630249ae8f970..82c75f85d838f94cb040e56d59d0e012af5e0db0 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -231,7 +231,7 @@ class ShapeUtil {
   }
 
   // Returns the higher-precision element type if a and b are both floating
-  // point types; otherwise, checks that that they have the same element type
+  // point types; otherwise, checks that they have the same element type
   // and returns it.
   static PrimitiveType HigherPrecisionElementType(const Shape& a,
                                                   const Shape& b) {
diff --git a/tensorflow/compiler/xla/status.h b/tensorflow/compiler/xla/status.h
index 4eb3bf3766412d5d9a8e78a4652807c5eaeef6ee..69abb51852ac09e8d357a9ba7924efc348ef2001 100644
--- a/tensorflow/compiler/xla/status.h
+++ b/tensorflow/compiler/xla/status.h
@@ -21,7 +21,7 @@ limitations under the License.
 
 namespace xla {
 
-using tensorflow::Status;
+using tensorflow::Status;  // TENSORFLOW_STATUS_OK
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h
index cccbce5fc83af87396f4d51eb9e785cea93aba0b..0e1387c93938fa520562fcd63ac107a82b089a51 100644
--- a/tensorflow/compiler/xla/statusor.h
+++ b/tensorflow/compiler/xla/statusor.h
@@ -13,13 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// StatusOr<T> is the union of a Status object and a T
-// object. StatusOr models the concept of an object that is either a
-// usable value, or an error Status explaining why such a value is
-// not present. To this end, StatusOr<T> does not allow its Status
-// value to be Status::OK. Furthermore, the value of a StatusOr<T*>
-// must not be null. This is enforced by a debug check in most cases,
-// but even when it is not, clients must not set the value to null.
+// StatusOr<T> is the union of a Status object and a T object. StatusOr models
+// the concept of an object that is either a value, or an error Status
+// explaining why such a value is not present. To this end, StatusOr<T> does not
+// allow its Status value to be Status::OK.
 //
 // The primary use-case for StatusOr<T> is as the return value of a
 // function which may fail.
diff --git a/tensorflow/compiler/xla/statusor_test.cc b/tensorflow/compiler/xla/statusor_test.cc
index f9d25945bc617507735fb6c4d011c39723497f69..377a618ffbd99316d409130df8a39f352664dee0 100644
--- a/tensorflow/compiler/xla/statusor_test.cc
+++ b/tensorflow/compiler/xla/statusor_test.cc
@@ -75,6 +75,14 @@ TEST(StatusOr, ElementType) {
   static_assert(std::is_same<StatusOr<char>::element_type, char>(), "");
 }
 
+TEST(StatusOr, NullPointerStatusOr) {
+  // As a very special case, null-plain-pointer StatusOr used to be an
+  // error. Test that it no longer is.
+  StatusOr<int*> null_status(nullptr);
+  EXPECT_TRUE(null_status.ok());
+  EXPECT_EQ(null_status.ValueOrDie(), nullptr);
+}
+
 TEST(StatusOr, TestNoDefaultConstructorInitialization) {
   // Explicitly initialize it with an error code.
   StatusOr<NoDefaultConstructor> statusor(tensorflow::errors::Cancelled(""));
@@ -405,7 +413,7 @@ TEST(StatusOr, TestPointerValueConst) {
   EXPECT_EQ(&kI, thing.ValueOrDie());
 }
 
-// NOTE(tucker): tensorflow::StatusOr does not support this kind
+// NOTE(tucker): StatusOr does not support this kind
 // of resize op.
 // TEST(StatusOr, StatusOrVectorOfUniquePointerCanResize) {
 //   using EvilType = std::vector<std::unique_ptr<int>>;
diff --git a/tensorflow/compiler/xla/test_helpers.h b/tensorflow/compiler/xla/test_helpers.h
index 17bae2e4f611268df824ce793c75ba1c95573455..8918350135fbb86973b228b35f5873fea8695b2f 100644
--- a/tensorflow/compiler/xla/test_helpers.h
+++ b/tensorflow/compiler/xla/test_helpers.h
@@ -40,13 +40,10 @@ class Literal;
 namespace testing {
 
 namespace internal_status {
-inline const ::tensorflow::Status& GetStatus(
-    const ::tensorflow::Status& status) {
-  return status;
-}
+inline const Status& GetStatus(const Status& status) { return status; }
 
 template <typename T>
-inline const ::tensorflow::Status& GetStatus(const StatusOr<T>& status) {
+inline const Status& GetStatus(const StatusOr<T>& status) {
   return status.status();
 }
 }  // namespace internal_status
@@ -57,21 +54,17 @@ inline const ::tensorflow::Status& GetStatus(const StatusOr<T>& status) {
 // The following macros are similar to macros in gmock, but deliberately named
 // differently in order to avoid conflicts in files which include both.
 
-// Macros for testing the results of functions that return tensorflow::Status or
+// Macros for testing the results of functions that return Status or
 // StatusOr<T> (for any type T).
-#define EXPECT_IS_OK(expression)      \
-  EXPECT_EQ(tensorflow::Status::OK(), \
-            xla::testing::internal_status::GetStatus(expression))
-#define EXPECT_IS_NOT_OK(expression)  \
-  EXPECT_NE(tensorflow::Status::OK(), \
-            xla::testing::internal_status::GetStatus(expression))
+#define EXPECT_IS_OK(expression) \
+  EXPECT_EQ(Status::OK(), xla::testing::internal_status::GetStatus(expression))
+#define EXPECT_IS_NOT_OK(expression) \
+  EXPECT_NE(Status::OK(), xla::testing::internal_status::GetStatus(expression))
 #undef ASSERT_IS_OK
-#define ASSERT_IS_OK(expression)      \
-  ASSERT_EQ(tensorflow::Status::OK(), \
-            xla::testing::internal_status::GetStatus(expression))
+#define ASSERT_IS_OK(expression) \
+  ASSERT_EQ(Status::OK(), xla::testing::internal_status::GetStatus(expression))
 #undef ASSERT_IS_NOT_OK
-#define ASSERT_IS_NOT_OK(expression)  \
-  ASSERT_NE(tensorflow::Status::OK(), \
-            xla::testing::internal_status::GetStatus(expression))
+#define ASSERT_IS_NOT_OK(expression) \
+  ASSERT_NE(Status::OK(), xla::testing::internal_status::GetStatus(expression))
 
 #endif  // TENSORFLOW_COMPILER_XLA_TEST_HELPERS_H_
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 0571ff50554c5d2291198ceddc085e3c21a9f145..4883380be1f8a291bda829dff713de549ba58c65 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -87,12 +87,12 @@ cc_library(
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla:array4d",
+        "//tensorflow/compiler/xla:error_spec",
+        "//tensorflow/compiler/xla:literal_comparison",
         "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -152,7 +152,6 @@ tf_cc_binary(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
@@ -188,8 +187,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -288,8 +285,6 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -313,7 +308,6 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -335,7 +329,6 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -378,7 +371,6 @@ xla_test(
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -398,7 +390,6 @@ xla_test(
         "enable_for_xla_interpreter",
     ],
     deps = [
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -422,8 +413,6 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
@@ -450,8 +439,6 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -472,7 +459,6 @@ xla_test(
     ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -491,7 +477,6 @@ xla_test(
     ],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -528,7 +513,6 @@ xla_test(
     tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
@@ -552,7 +536,6 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -572,8 +555,6 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -598,8 +579,6 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -626,7 +605,6 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -697,7 +675,6 @@ xla_test(
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -741,7 +718,6 @@ xla_test(
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -766,7 +742,6 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -790,7 +765,6 @@ xla_test(
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -843,7 +817,6 @@ xla_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -868,7 +841,6 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -930,8 +902,6 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
@@ -960,8 +930,6 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
@@ -1002,7 +970,6 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1055,8 +1022,6 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1078,7 +1043,6 @@ xla_test(
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -1108,8 +1072,6 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
@@ -1240,8 +1202,6 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
@@ -1281,7 +1241,6 @@ xla_test(
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:reference_util",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1304,7 +1263,6 @@ xla_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1344,7 +1302,6 @@ xla_test(
         "enable_for_xla_interpreter",
     ],
     deps = [
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1362,7 +1319,6 @@ xla_test(
         "enable_for_xla_interpreter",
     ],
     deps = [
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1388,8 +1344,6 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1411,7 +1365,6 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1483,8 +1436,6 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
@@ -1532,7 +1483,6 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1545,6 +1495,30 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "cross_replica_sum_test",
+    srcs = ["cross_replica_sum_test.cc"],
+    deps = [
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "bitcast_convert_test",
     srcs = ["bitcast_convert_test.cc"],
@@ -1574,8 +1548,6 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -1596,7 +1568,6 @@ xla_test(
         "enable_for_xla_interpreter",
     ],
     deps = [
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1620,8 +1591,6 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1642,7 +1611,6 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -1661,7 +1629,6 @@ xla_test(
     srcs = ["execution_profile_test.cc"],
     deps = [
         ":client_library_test_base",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1676,7 +1643,6 @@ xla_test(
     args = ["--xla_hlo_profile"],
     deps = [
         ":client_library_test_base",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1782,8 +1748,6 @@ xla_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1811,8 +1775,6 @@ xla_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1850,8 +1812,6 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1867,7 +1827,10 @@ xla_test(
 
 xla_test(
     name = "local_client_execute_test",
+    # TODO(b/79375911): Test times out in LLVM at normal size.
+    size = "large",
     srcs = ["local_client_execute_test.cc"],
+    shard_count = 30,
     tags = ["optonly"],
     deps = [
         "//tensorflow/compiler/xla:literal_util",
@@ -1877,8 +1840,6 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
@@ -1946,8 +1907,6 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
@@ -2048,7 +2007,6 @@ xla_test(
         ":local_client_test_base",
         ":test_utils",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index e8a5efe796a9209307ecfa343b89f66ff2a34e0f..36a706496918ac8c15780473019e2a8d098ffa22 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -2225,6 +2225,15 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClzU32s) {
   ComputeAndCompareR1<uint32>(&builder, {32, 31, 27, 15, 9, 3, 0}, {});
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClzS64s) {
+  XlaBuilder builder(TestName());
+  auto a =
+      builder.ConstantR1<int64>({0, 1, 0x80000000, 0x7FFFFFFFF2345678ul, -1});
+  builder.Clz(a);
+
+  ComputeAndCompareR1<int64>(&builder, {64, 63, 32, 1, 0}, {});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) {
   // a ------ (add) --------- (add)
   //         /               /
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
index 4e65cf11f3f1a027e1adc5a89930caba28958fea..ca337e78840e77377719636cd4cf33af2578210d 100644
--- a/tensorflow/compiler/xla/tests/bfloat16_test.cc
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -37,7 +37,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/xla/tests/broadcast_test.cc b/tensorflow/compiler/xla/tests/broadcast_test.cc
index 6ebbf7191833ef85ee4a48cc96c0a3be38c71228..51b9f0d3e330e73f5d110f0a62f824179d5c7cf7 100644
--- a/tensorflow/compiler/xla/tests/broadcast_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_test.cc
@@ -46,8 +46,8 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarToScalar) {
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
-  LiteralTestUtil::ExpectNear(*Literal::CreateR0<float>(42.0), *result,
-                              error_spec_);
+  EXPECT_TRUE(LiteralTestUtil::Near(*Literal::CreateR0<float>(42.0), *result,
+                                    error_spec_));
 }
 
 XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) {
@@ -62,9 +62,9 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) {
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
-  LiteralTestUtil::ExpectNear(
+  EXPECT_TRUE(LiteralTestUtil::Near(
       *Literal::CreateR2<float>({{42.0, 42.0}, {42.0, 42.0}}), *result,
-      error_spec_);
+      error_spec_));
 }
 
 XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) {
@@ -85,13 +85,13 @@ XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) {
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
-  LiteralTestUtil::ExpectNear(
+  EXPECT_TRUE(LiteralTestUtil::Near(
       *Literal::CreateR2<float>({{1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}}),
-      LiteralView::Create(*result, {0}), error_spec_);
+      LiteralSlice(*result, {0}), error_spec_));
 
-  LiteralTestUtil::ExpectNear(
+  EXPECT_TRUE(LiteralTestUtil::Near(
       *Literal::CreateR2<float>({{1.0, 2.0, 3.0}, {1.0, 2.0, 3.0}}),
-      LiteralView::Create(*result, {1}), error_spec_);
+      LiteralSlice(*result, {1}), error_spec_));
 }
 
 XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) {
@@ -106,9 +106,9 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) {
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
-  LiteralTestUtil::ExpectNear(
-      *Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}}), *result,
-      error_spec_);
+  EXPECT_TRUE(
+      LiteralTestUtil::Near(*Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}}),
+                            *result, error_spec_));
 }
 
 XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) {
@@ -125,9 +125,9 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) {
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
-  LiteralTestUtil::ExpectNear(
-      *Literal::CreateR2<float>({{1.0, 3.0}, {2.0, 4.0}}), *result,
-      error_spec_);
+  EXPECT_TRUE(
+      LiteralTestUtil::Near(*Literal::CreateR2<float>({{1.0, 3.0}, {2.0, 4.0}}),
+                            *result, error_spec_));
 }
 
 XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) {
@@ -142,10 +142,10 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) {
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
-  LiteralTestUtil::ExpectNear(
+  EXPECT_TRUE(LiteralTestUtil::Near(
       *Literal::CreateR3<float>({{{1.0, 2.0}, {1.0, 2.0}, {1.0, 2.0}},
                                  {{3.0, 4.0}, {3.0, 4.0}, {3.0, 4.0}}}),
-      *result, error_spec_);
+      *result, error_spec_));
 }
 
 TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) {
@@ -166,8 +166,8 @@ TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) {
   Array2D<float> pz({{1, 2}, {1, 2}});
   expected.FillWithPZ(pz);
 
-  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D<float>(expected),
-                              *result, error_spec_);
+  EXPECT_TRUE(LiteralTestUtil::Near(
+      *Literal::CreateR4FromArray4D<float>(expected), *result, error_spec_));
 }
 
 TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) {
@@ -196,8 +196,8 @@ TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) {
   }
   expected.FillWithYX(yx);
 
-  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D<float>(expected),
-                              *result, error_spec_);
+  EXPECT_TRUE(LiteralTestUtil::Near(
+      *Literal::CreateR4FromArray4D<float>(expected), *result, error_spec_));
 }
 
 XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) {
@@ -218,8 +218,8 @@ XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) {
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
-  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D(r4_array), *result,
-                              error_spec_);
+  EXPECT_TRUE(LiteralTestUtil::Near(*Literal::CreateR4FromArray4D(r4_array),
+                                    *result, error_spec_));
 }
 
 TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) {
@@ -238,8 +238,8 @@ TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) {
   Array4D<float> expected(64, 64, 3, 3);
   expected.Fill(1.0f);
 
-  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D<float>(expected),
-                              *result, error_spec_);
+  EXPECT_TRUE(LiteralTestUtil::Near(
+      *Literal::CreateR4FromArray4D<float>(expected), *result, error_spec_));
 }
 
 TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) {
@@ -260,8 +260,8 @@ TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) {
   Array4D<float> expected(3, 3, 2, 2);
   expected.FillWithYX(to_broadcast);
 
-  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D<float>(expected),
-                              *result, error_spec_);
+  EXPECT_TRUE(LiteralTestUtil::Near(
+      *Literal::CreateR4FromArray4D<float>(expected), *result, error_spec_));
 }
 
 TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) {
@@ -291,8 +291,8 @@ TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) {
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
-  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D<float>(expected),
-                              *result, error_spec_);
+  EXPECT_TRUE(LiteralTestUtil::Near(
+      *Literal::CreateR4FromArray4D<float>(expected), *result, error_spec_));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/call_test.cc b/tensorflow/compiler/xla/tests/call_test.cc
index a43ca3d5ca2ba39ba9c16213e985e50bf39c0b7d..5fd33b50c94356839bbed58acd43b7d0286f4a7e 100644
--- a/tensorflow/compiler/xla/tests/call_test.cc
+++ b/tensorflow/compiler/xla/tests/call_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index c09e7eaf2bb94d84d68604bff4fc97a8e8dfbc07..bf8ed4d9fb0bc61b86ef0b5872711a122a3d416b 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -178,8 +177,7 @@ void ClientLibraryTestBase::ComputeAndCompareLiteral(
                                                   error, shape_with_layout));
 }
 
-tensorflow::Status
-ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts(
+Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts(
     const xla::XlaComputation& computation, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
     const std::function<void(const Literal& actual,
@@ -201,11 +199,10 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts(
                                "Test with output layout: ",
                                ShapeUtil::HumanStringWithLayout(layout)));
   } while (std::next_permutation(minor_to_major.begin(), minor_to_major.end()));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status
-ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
+Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
     const xla::XlaComputation& computation, const Literal& /*expected*/,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
     const std::function<void(const Literal& actual,
@@ -216,8 +213,8 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
   // This is a recursive function. It's an std::function instead of a lambda
   // because it needs to capture itself. The index is the index of the argument
   // to try all layouts for.
-  std::function<tensorflow::Status(int64)> choose;
-  choose = [&, this](int64 index) -> tensorflow::Status {
+  std::function<Status(int64)> choose;
+  choose = [&, this](int64 index) -> Status {
     if (index < arguments.size()) {
       // Try out all layouts for the operand.
       TF_ASSIGN_OR_RETURN(auto literal,
@@ -230,7 +227,7 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
         TF_RETURN_IF_ERROR(choose(index + 1));
         arguments_with_layout.pop_back();
         layout_strings.pop_back();
-        return tensorflow::Status::OK();
+        return Status::OK();
       }
 
       std::vector<int64> minor_to_major(ShapeUtil::Rank(literal->shape()));
@@ -248,7 +245,7 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
         layout_strings.pop_back();
       } while (
           std::next_permutation(minor_to_major.begin(), minor_to_major.end()));
-      return tensorflow::Status::OK();
+      return Status::OK();
     }
 
     // Every argument has an assigned layout.
@@ -263,13 +260,13 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
       tensorflow::strings::StrAppend(&error_message, str, " ");
     }
     verify_output(*actual, error_message);
-    return tensorflow::Status::OK();
+    return Status::OK();
   };
 
   return choose(0);
 }
 
-tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
+Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
     XlaBuilder* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments_passed_in,
     const Shape* shape_with_layout) {
@@ -297,7 +294,7 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   std::unique_ptr<Literal> converted_expected;
   Shape layout_shape;
   if (use_bfloat16_) {
-    converted_expected = LiteralTestUtil::ConvertF32ToBF16(expected);
+    converted_expected = Literal::ConvertF32ToBF16(expected);
     expected_ptr = converted_expected.get();
     if (shape_with_layout != nullptr) {
       layout_shape = *shape_with_layout;
@@ -311,7 +308,7 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
     }
   }
   auto expect_equal = [&](const Literal& actual, const string& error_message) {
-    LiteralTestUtil::ExpectEqual(*expected_ptr, actual, error_message);
+    EXPECT_TRUE(LiteralTestUtil::Equal(*expected_ptr, actual)) << error_message;
   };
   if (execution_options_.debug_options().xla_test_all_output_layouts()) {
     return ComputeAndCompareLiteralWithAllOutputLayouts(
@@ -323,11 +320,11 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   }
   TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments,
                                                       shape_with_layout));
-  LiteralTestUtil::ExpectEqual(*expected_ptr, *actual);
-  return tensorflow::Status::OK();
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected_ptr, *actual));
+  return Status::OK();
 }
 
-tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
+Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
     XlaBuilder* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments_passed_in,
     ErrorSpec error, const Shape* shape_with_layout) {
@@ -349,7 +346,7 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   std::unique_ptr<Literal> converted_expected;
   Shape layout_shape;
   if (use_bfloat16_) {
-    converted_expected = LiteralTestUtil::ConvertF32ToBF16(expected);
+    converted_expected = Literal::ConvertF32ToBF16(expected);
     expected_ptr = converted_expected.get();
     if (shape_with_layout != nullptr) {
       layout_shape = *shape_with_layout;
@@ -363,7 +360,8 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
     }
   }
   auto expect_near = [&](const Literal& actual, const string& error_message) {
-    LiteralTestUtil::ExpectNear(*expected_ptr, actual, error, error_message);
+    EXPECT_TRUE(LiteralTestUtil::Near(*expected_ptr, actual, error))
+        << error_message;
   };
   if (execution_options_.debug_options().xla_test_all_output_layouts()) {
     return ComputeAndCompareLiteralWithAllOutputLayouts(
@@ -375,8 +373,8 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   }
   TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments,
                                                       shape_with_layout));
-  LiteralTestUtil::ExpectNear(*expected_ptr, *actual, error);
-  return tensorflow::Status::OK();
+  EXPECT_TRUE(LiteralTestUtil::Near(*expected_ptr, *actual, error));
+  return Status::OK();
 }
 
 void ClientLibraryTestBase::ComputeAndCompareR1U8(
@@ -407,7 +405,7 @@ void ClientLibraryTestBase::ComputeAndCompareTuple(
     return;
   }
   auto actual = actual_status.ConsumeValueOrDie();
-  LiteralTestUtil::ExpectEqual(expected, *actual);
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, *actual));
 }
 
 void ClientLibraryTestBase::ComputeAndCompareTuple(
@@ -419,7 +417,7 @@ void ClientLibraryTestBase::ComputeAndCompareTuple(
     return;
   }
   auto actual = actual_status.ConsumeValueOrDie();
-  LiteralTestUtil::ExpectNear(expected, *actual, error);
+  EXPECT_TRUE(LiteralTestUtil::Near(expected, *actual, error));
 }
 
 void ClientLibraryTestBase::ComputeAndCompare(
@@ -431,7 +429,7 @@ void ClientLibraryTestBase::ComputeAndCompare(
   }
   std::unique_ptr<Literal> reference, result;
   std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
-  LiteralTestUtil::ExpectEqual(*reference, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*reference, *result));
 }
 
 void ClientLibraryTestBase::ComputeAndCompare(
@@ -444,7 +442,7 @@ void ClientLibraryTestBase::ComputeAndCompare(
   }
   std::unique_ptr<Literal> reference, result;
   std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
-  LiteralTestUtil::ExpectNear(*reference, *result, error);
+  EXPECT_TRUE(LiteralTestUtil::Near(*reference, *result, error));
 }
 
 StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
@@ -562,7 +560,36 @@ XlaOp ClientLibraryTestBase::AddParam(const Literal& argument,
 XlaOp ClientLibraryTestBase::CreateConstantFromLiteral(const Literal& literal,
                                                        XlaBuilder* builder) {
   return builder->ConstantLiteral(
-      use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
+      use_bfloat16_ ? *Literal::ConvertF32ToBF16(literal) : literal);
+}
+
+std::unique_ptr<GlobalData>
+ClientLibraryTestBase::CreateParameterAndTransferLiteral(int64 parameter_number,
+                                                         const Literal& literal,
+                                                         const string& name,
+                                                         XlaBuilder* builder,
+                                                         XlaOp* data_handle) {
+  return CreateParameterAndTransferLiteral(parameter_number, literal, name,
+                                           nullptr, builder, data_handle);
+}
+
+std::unique_ptr<GlobalData>
+ClientLibraryTestBase::CreateParameterAndTransferLiteral(
+    int64 parameter_number, const Literal& literal, const string& name,
+    const DeviceHandle* device_handle, XlaBuilder* builder,
+    XlaOp* data_handle) {
+  const Literal* param_literal = &literal;
+  std::unique_ptr<Literal> converted_literal;
+  if (use_bfloat16_) {
+    converted_literal = Literal::ConvertF32ToBF16(literal);
+    param_literal = converted_literal.get();
+  }
+  std::unique_ptr<GlobalData> data =
+      client_->TransferToServer(*param_literal, device_handle)
+          .ConsumeValueOrDie();
+  *data_handle =
+      builder->Parameter(parameter_number, param_literal->shape(), name);
+  return data;
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index e58979a3035dd5823be7180aeb510fa3e15f51e2..0499fec5898a42affa0e0a712dee10187355c13e 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -188,11 +188,11 @@ class ClientLibraryTestBase : public ::testing::Test {
       const Shape* shape_with_layout = nullptr);
 
   // ComputeAndCompare variant which returns an error status.
-  tensorflow::Status ComputeAndCompareLiteralWithStatus(
+  Status ComputeAndCompareLiteralWithStatus(
       XlaBuilder* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_layout = nullptr);
-  tensorflow::Status ComputeAndCompareLiteralWithStatus(
+  Status ComputeAndCompareLiteralWithStatus(
       XlaBuilder* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
       const Shape* shape_with_layout = nullptr);
@@ -378,12 +378,12 @@ class ClientLibraryTestBase : public ::testing::Test {
   ExecutionOptions execution_options_;
 
  private:
-  tensorflow::Status ComputeAndCompareLiteralWithAllOutputLayouts(
+  Status ComputeAndCompareLiteralWithAllOutputLayouts(
       const xla::XlaComputation& computation, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const std::function<void(const Literal& actual,
                                const string& error_message)>& verify_output);
-  tensorflow::Status ComputeAndCompareLiteralWithAllInputLayouts(
+  Status ComputeAndCompareLiteralWithAllInputLayouts(
       const xla::XlaComputation& computation, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const std::function<void(const Literal& actual,
@@ -541,7 +541,7 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR0Parameter(
     XlaBuilder* builder, XlaOp* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR0(value);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
-    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+    literal = Literal::ConvertF32ToBF16(*literal);
   }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
@@ -555,7 +555,7 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR1Parameter(
     const string& name, XlaBuilder* builder, XlaOp* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR1(values);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
-    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+    literal = Literal::ConvertF32ToBF16(*literal);
   }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
@@ -569,7 +569,7 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR2Parameter(
     const string& name, XlaBuilder* builder, XlaOp* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR2FromArray2D(array_2d);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
-    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+    literal = Literal::ConvertF32ToBF16(*literal);
   }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
@@ -583,7 +583,7 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR3Parameter(
     const string& name, XlaBuilder* builder, XlaOp* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR3FromArray3D(array_3d);
   if (use_bfloat16_ && literal->shape().element_type() == F32) {
-    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+    literal = Literal::ConvertF32ToBF16(*literal);
   }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
@@ -616,35 +616,6 @@ std::unique_ptr<Array2D<NativeT>> ClientLibraryTestBase::CreatePseudorandomR2(
   return result;
 }
 
-std::unique_ptr<GlobalData>
-ClientLibraryTestBase::CreateParameterAndTransferLiteral(int64 parameter_number,
-                                                         const Literal& literal,
-                                                         const string& name,
-                                                         XlaBuilder* builder,
-                                                         XlaOp* data_handle) {
-  return CreateParameterAndTransferLiteral(parameter_number, literal, name,
-                                           nullptr, builder, data_handle);
-}
-
-std::unique_ptr<GlobalData>
-ClientLibraryTestBase::CreateParameterAndTransferLiteral(
-    int64 parameter_number, const Literal& literal, const string& name,
-    const DeviceHandle* device_handle, XlaBuilder* builder,
-    XlaOp* data_handle) {
-  const Literal* param_literal = &literal;
-  std::unique_ptr<Literal> converted_literal;
-  if (use_bfloat16_) {
-    converted_literal = LiteralTestUtil::ConvertF32ToBF16(literal);
-    param_literal = converted_literal.get();
-  }
-  std::unique_ptr<GlobalData> data =
-      client_->TransferToServer(*param_literal, device_handle)
-          .ConsumeValueOrDie();
-  *data_handle =
-      builder->Parameter(parameter_number, param_literal->shape(), name);
-  return data;
-}
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TESTS_CLIENT_LIBRARY_TEST_BASE_H_
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 0b425b93bb144e395baef2bcf074fd6e7991630b..08671cf62445826649b5c97003f998ae98a59d97 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -62,9 +62,9 @@ XLA_TEST_F(ClientTest, ExecuteWithLayout) {
       TF_ASSERT_OK_AND_ASSIGN(
           auto computed, client_->Transfer(*data, &expected_literal->shape()));
 
-      LiteralTestUtil::AssertEqualShapesAndLayouts(expected_literal->shape(),
-                                                   computed->shape());
-      LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
+      ASSERT_TRUE(LiteralTestUtil::EqualShapesAndLayouts(
+          expected_literal->shape(), computed->shape()));
+      EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *computed));
     }
   }
 }
@@ -91,9 +91,9 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
       auto result,
       client_->ExecuteAndTransfer(computation, {}, &execution_options));
   LiteralTestUtil::ExpectR2Equal<int32>({{1, 2}, {3, 4}},
-                                        LiteralView::Create(*result, {0}));
+                                        LiteralSlice(*result, {0}));
   LiteralTestUtil::ExpectR2Equal<int32>({{10, 20}, {30, 40}},
-                                        LiteralView::Create(*result, {1}));
+                                        LiteralSlice(*result, {1}));
 
   EXPECT_TRUE(ShapeUtil::IsTuple(result->shape()));
   EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->shape()));
@@ -142,7 +142,7 @@ XLA_TEST_F(ClientTest, DISABLED_ON_GPU(ExecuteParallel)) {
       auto result_literal,
       client_->Transfer(*results[0], &expected_result->shape()));
 
-  LiteralTestUtil::ExpectEqual(*expected_result, *result_literal);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected_result, *result_literal));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/compilation_cache_test.cc b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
index ecce599a8a3bd588c11d6bb9ba461b5a917197db..50a006964869b3e5dce431d441f7cd81af9df910 100644
--- a/tensorflow/compiler/xla/tests/compilation_cache_test.cc
+++ b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 #include <memory>
 #include <string>
 
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
@@ -50,8 +49,8 @@ class CompilationCacheTest : public ClientLibraryTestBase {
                                  /*execution_options=*/&execution_options_,
                                  &execution_profile)
             .ConsumeValueOrDie();
-    LiteralTestUtil::ExpectNear(*Literal::CreateR0<float>(expected_result),
-                                *result, error_spec_);
+    EXPECT_TRUE(LiteralTestUtil::Near(
+        *Literal::CreateR0<float>(expected_result), *result, error_spec_));
     EXPECT_EQ(expect_cache_hit, execution_profile.compilation_cache_hit());
   }
 
@@ -67,8 +66,8 @@ class CompilationCacheTest : public ClientLibraryTestBase {
                            .ConsumeValueOrDie();
     std::unique_ptr<Literal> result =
         client_->Transfer(*data_handle).ConsumeValueOrDie();
-    LiteralTestUtil::ExpectNear(*Literal::CreateR2<float>(expected_result),
-                                *result, error_spec_);
+    EXPECT_TRUE(LiteralTestUtil::Near(
+        *Literal::CreateR2<float>(expected_result), *result, error_spec_));
     EXPECT_EQ(expect_cache_hit, execution_profile.compilation_cache_hit());
   }
 
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index bf4b8fb0bcf229b4e8649b3920dcba1ae0579831..ba22530f1cfee56337f862c25122d399dbf0f1e4 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -208,7 +208,7 @@ TEST_F(ComputeConstantTest, NonScalarAdd) {
                             ComputeConstantLiteral(client, computation, &b));
     std::unique_ptr<Literal> expected_literal =
         Literal::CreateR1<int32>({4, 6});
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
+    EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *computed));
   }
 }
 
@@ -222,7 +222,7 @@ TEST_F(ComputeConstantTest, IntegerDivide) {
     TF_ASSERT_OK_AND_ASSIGN(auto computed,
                             ComputeConstantLiteral(client, computation, &b));
     std::unique_ptr<Literal> expected_literal = Literal::CreateR0<int32>(5);
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
+    EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *computed));
   }
 }
 
@@ -244,9 +244,9 @@ XLA_TEST_F(ComputeConstantTest, Layout) {
       std::unique_ptr<Literal> expected_literal =
           Literal::CreateR2WithLayout<int32>({{11, 22}, {33, 44}},
                                              LayoutUtil::MakeLayout(layout));
-      LiteralTestUtil::AssertEqualShapesAndLayouts(expected_literal->shape(),
-                                                   computed->shape());
-      LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
+      ASSERT_TRUE(LiteralTestUtil::EqualShapesAndLayouts(
+          expected_literal->shape(), computed->shape()));
+      EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *computed));
     }
   }
 }
diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc
index 4743673561a665ca8670a56bf15d85a74073e472..916ffadbc798ec0dd016f45b0bc4c36233455ee7 100644
--- a/tensorflow/compiler/xla/tests/constants_test.cc
+++ b/tensorflow/compiler/xla/tests/constants_test.cc
@@ -21,13 +21,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -169,9 +167,9 @@ TEST_F(ConstantsTest, DISABLED_TupleConstant) {
       ExecuteAndTransfer(&builder, {}).ConsumeValueOrDie();
 
   LiteralTestUtil::ExpectR2Near<float>(
-      {{1.0}, {2.0}}, LiteralView::Create(*result, {0}), error_spec_);
+      {{1.0}, {2.0}}, LiteralSlice(*result, {0}), error_spec_);
   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0, 42.0}, LiteralView::Create(*result, {1}), error_spec_);
+      {2.0, 42.0}, LiteralSlice(*result, {1}), error_spec_);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
index 50d6e25d868c4964ff35023b43a3734ed115bbb8..fea850dc135e33fe098aa755c6fdd93319cd2837 100644
--- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 155fbacf58d81cff27939c142c8f30158cef4e00..2b3390ca98cb2922410d451c06811aa9d4ff8c0b 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -49,7 +49,7 @@ class CopyOpTest : public HloTestBase {
     module->AddEntryComputation(std::move(computation));
 
     std::unique_ptr<Literal> result = ExecuteAndTransfer(std::move(module), {});
-    LiteralTestUtil::ExpectEqual(literal, *result);
+    EXPECT_TRUE(LiteralTestUtil::Equal(literal, *result));
   }
 
   void TestCopyConstantLayout021(size_t n1, size_t n2, size_t n3);
@@ -253,7 +253,7 @@ XLA_TEST_F(CopyOpClientTest, Copy0x0) {
 
   auto actual = ExecuteAndTransfer(&builder, {input_data.get()}, &out_shape)
                     .ConsumeValueOrDie();
-  LiteralTestUtil::ExpectEqual(*empty, *actual);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*empty, *actual));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc b/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b15988776513a60c9e5c85d4780912106db98e75
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+
+namespace xla {
+namespace {
+
+class TrivialCrossReplicaSumTest : public HloTestBase {};
+
+// Currently the CPU and GPU backends only support CrossReplicaSum with one
+// replica.  But we can at least check this.
+
+XLA_TEST_F(TrivialCrossReplicaSumTest, OneOperand) {
+  const char* module_str = R"(
+  HloModule test
+  ENTRY test_computation {
+    p = f32[3] parameter(0)
+    ROOT crs = f32[3] cross-replica-sum(p)
+  })";
+  auto module = tools::Parse(module_str, GetModuleConfigForTest()).ValueOrDie();
+  auto literal = Literal::CreateR1<float>({1, 2, 3});
+  EXPECT_EQ(*literal, *ExecuteAndTransfer(std::move(module), {literal.get()}));
+}
+
+XLA_TEST_F(TrivialCrossReplicaSumTest, MultipleOperands) {
+  const char* module_str = R"(
+  HloModule test
+  ENTRY test_computation {
+    p0 = f32[3] parameter(0)
+    p1 = f32[2] parameter(1)
+    ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1)
+  })";
+  auto module = tools::Parse(module_str, GetModuleConfigForTest()).ValueOrDie();
+  auto literal0 = Literal::CreateR1<float>({1, 2, 3});
+  auto literal1 = Literal::CreateR1<float>({10, 20});
+  EXPECT_EQ(
+      *Literal::MakeTuple({literal0.get(), literal1.get()}),
+      *ExecuteAndTransfer(std::move(module), {literal0.get(), literal1.get()}));
+}
+
+// On the GPU backend, constants get special handling.  Someone might pass a
+// constant to CRS to e.g. count the number of replicas -- we need to make sure
+// it works.
+XLA_TEST_F(TrivialCrossReplicaSumTest, ConstantOperand) {
+  const char* module_str = R"(
+  HloModule test
+  ENTRY test_computation {
+    p0 = f32[3] parameter(0)
+    p1 = f32[2] constant({10, 20})
+    ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1)
+  })";
+  auto module = tools::Parse(module_str, GetModuleConfigForTest()).ValueOrDie();
+  auto literal0 = Literal::CreateR1<float>({1, 2, 3});
+  auto literal1 = Literal::CreateR1<float>({10, 20});
+  EXPECT_EQ(*Literal::MakeTuple({literal0.get(), literal1.get()}),
+            *ExecuteAndTransfer(std::move(module), {literal0.get()}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/deallocation_test.cc b/tensorflow/compiler/xla/tests/deallocation_test.cc
index c76e5aabf4b8a3463b2971654d0a6cf0dd594626..bfe688e20d182d581c3e3b545ac2289413deef7c 100644
--- a/tensorflow/compiler/xla/tests/deallocation_test.cc
+++ b/tensorflow/compiler/xla/tests/deallocation_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
diff --git a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
index d0ada2474830390e50a90c4c41aa42166d6e8ea5..12789fe66530fe03eb33316eda652336f29971ab 100644
--- a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 6b3efba4f80e45d230d3df9274d0fd40c6fb8c42..0fd846cef8095a857dd7b2c12d8afdf409e2bd66 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -61,7 +61,7 @@ using TypesF16F32F64CF64 = ::testing::Types<Eigen::half, float>;
 #endif
 
 // Check that we can safely pass an input tuple's elements to a dot operation.
-TEST_F(DotOperationTest, DotOfInputTupleElem) {
+XLA_TEST_F(DotOperationTest, DotOfInputTupleElem) {
   XlaBuilder builder(TestName());
 
   XlaOp param;
@@ -798,5 +798,250 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64,
       this->error_spec_);
 }
 
+XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSClassicMM) {
+  std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
+      {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
+  std::unique_ptr<Array2D<float>> constant_rhs_array(
+      new Array2D<float>({{1.0, 2.0, 3.0},
+                          {4.0, 5.0, 6.0},
+                          {7.0, 8.0, 9.0},
+                          {9.0, 8.0, 7.0},
+                          {6.0, 5.0, 4.0},
+                          {3.0, 2.0, 1.0}}));
+  // Dot result to slice from: {{114, 105, 96}, {96, 105, 114}}
+
+  XlaBuilder builder(TestName());
+  auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array);
+  auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array);
+  auto start_constant = builder.ConstantR1<int32>({1, 0});
+  auto dynamic_slice =
+      builder.DynamicSlice(lhs_constant, start_constant, {1, 6});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums);
+
+  Array2D<float> expected({{96.0, 105.0, 114.0}});
+  ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
+}
+
+XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSClassicMM) {
+  std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
+      {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
+  std::unique_ptr<Array2D<float>> constant_rhs_array(
+      new Array2D<float>({{1.0, 2.0, 3.0},
+                          {4.0, 5.0, 6.0},
+                          {7.0, 8.0, 9.0},
+                          {9.0, 8.0, 7.0},
+                          {6.0, 5.0, 4.0},
+                          {3.0, 2.0, 1.0}}));
+  // Dot result to slice from: {{114, 105, 96}, {96, 105, 114}}
+
+  XlaBuilder builder(TestName());
+  auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array);
+  auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array);
+  auto start_constant = builder.ConstantR1<int32>({0, 1});
+  auto dynamic_slice =
+      builder.DynamicSlice(rhs_constant, start_constant, {6, 1});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums);
+
+  Array2D<float> expected({{105.0}, {105.0}});
+  ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
+}
+
+// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
+XLA_TEST_F(DotOperationTest,
+       DISABLED_ON_CPU(DISABLED_ON_GPU(DISABLED_ON_INTERPRETER(
+           DotOfGatherOptimizationWithConstRHSReverseMM)))) {
+  std::unique_ptr<Array2D<float>> constant_lhs_array(
+      new Array2D<float>({{1.0, 2.0, 3.0},
+                          {4.0, 5.0, 6.0},
+                          {7.0, 8.0, 9.0},
+                          {9.0, 8.0, 7.0},
+                          {6.0, 5.0, 4.0},
+                          {3.0, 2.0, 1.0}}));
+  std::unique_ptr<Array2D<float>> constant_rhs_array(new Array2D<float>(
+      {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
+  // Dot result to slice from: {{114, 96}, {105, 105}, {96, 114}}
+
+  XlaBuilder builder(TestName());
+  auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array);
+  auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array);
+  auto start_constant = builder.ConstantR1<int32>({0, 1});
+  auto dynamic_slice =
+      builder.DynamicSlice(lhs_constant, start_constant, {6, 1});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums);
+
+  Array2D<float> expected({{105.0, 105.0}});
+  ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
+}
+
+// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
+XLA_TEST_F(DotOperationTest,
+       DISABLED_ON_CPU(DISABLED_ON_GPU(DISABLED_ON_INTERPRETER(
+           DotOfGatherOptimizationWithConstLHSReverseMM)))) {
+  std::unique_ptr<Array2D<float>> constant_lhs_array(
+      new Array2D<float>({{1.0, 2.0, 3.0},
+                          {4.0, 5.0, 6.0},
+                          {7.0, 8.0, 9.0},
+                          {9.0, 8.0, 7.0},
+                          {6.0, 5.0, 4.0},
+                          {3.0, 2.0, 1.0}}));
+  std::unique_ptr<Array2D<float>> constant_rhs_array(new Array2D<float>(
+      {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
+  // Dot result to slice from: {{114, 96}, {105, 105}, {96, 114}}
+
+  XlaBuilder builder(TestName());
+  auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array);
+  auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array);
+  auto start_constant = builder.ConstantR1<int32>({1, 0});
+  auto dynamic_slice =
+      builder.DynamicSlice(rhs_constant, start_constant, {1, 6});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums);
+
+  Array2D<float> expected({{96.0}, {105.0}, {114.0}});
+  ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
+}
+
+// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
+XLA_TEST_F(DotOperationTest,
+       DISABLED_ON_CPU(DISABLED_ON_GPU(
+           DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstRHSRows)))) {
+  std::unique_ptr<Array2D<float>> constant_lhs_array(
+      new Array2D<float>({{1.0, 2.0},
+                          {3.0, 4.0},
+                          {5.0, 6.0},
+                          {6.0, 5.0},
+                          {4.0, 3.0},
+                          {2.0, 1.0}}));
+  std::unique_ptr<Array2D<float>> constant_rhs_array(
+      new Array2D<float>({{1.0, 2.0, 3.0},
+                          {4.0, 5.0, 6.0},
+                          {7.0, 8.0, 9.0},
+                          {9.0, 8.0, 7.0},
+                          {6.0, 5.0, 4.0},
+                          {3.0, 2.0, 1.0}}));
+  // Dot result to slice from: {{132, 129, 126}, {126, 129, 132}}
+
+  XlaBuilder builder(TestName());
+  auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array);
+  auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array);
+  auto start_constant = builder.ConstantR1<int32>({0, 1});
+  auto dynamic_slice =
+      builder.DynamicSlice(lhs_constant, start_constant, {6, 1});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums);
+
+  Array2D<float> expected({{126.0, 129.0, 132.0}});
+  ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
+}
+
+// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
+XLA_TEST_F(DotOperationTest,
+       DISABLED_ON_CPU(DISABLED_ON_GPU(
+           DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstLHSRows)))) {
+  std::unique_ptr<Array2D<float>> constant_lhs_array(
+      new Array2D<float>({{1.0, 2.0},
+                          {3.0, 4.0},
+                          {5.0, 6.0},
+                          {6.0, 5.0},
+                          {4.0, 3.0},
+                          {2.0, 1.0}}));
+  std::unique_ptr<Array2D<float>> constant_rhs_array(
+      new Array2D<float>({{1.0, 2.0, 3.0},
+                          {4.0, 5.0, 6.0},
+                          {7.0, 8.0, 9.0},
+                          {9.0, 8.0, 7.0},
+                          {6.0, 5.0, 4.0},
+                          {3.0, 2.0, 1.0}}));
+  // Dot result to slice from: {{132, 129, 126}, {126, 129, 132}}
+
+  XlaBuilder builder(TestName());
+  auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array);
+  auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array);
+  auto start_constant = builder.ConstantR1<int32>({0, 1});
+  auto dynamic_slice =
+      builder.DynamicSlice(rhs_constant, start_constant, {6, 1});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums);
+
+  Array2D<float> expected({{129.0}, {129.0}});
+  ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
+}
+
+// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
+XLA_TEST_F(DotOperationTest,
+       DISABLED_ON_CPU(DISABLED_ON_GPU(
+           DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstRHSCols)))) {
+  std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
+      {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
+  std::unique_ptr<Array2D<float>> constant_rhs_array(
+      new Array2D<float>({{1.0, 2.0, 3.0, 4.0, 5.0, 6.0},
+                          {7.0, 8.0, 9.0, 9.0, 8.0, 7.0},
+                          {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
+  // Dot result to slice from: {{91, 168, 56}, {56, 168, 91}}
+
+  XlaBuilder builder(TestName());
+  auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array);
+  auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array);
+  auto start_constant = builder.ConstantR1<int32>({1, 0});
+  auto dynamic_slice =
+      builder.DynamicSlice(lhs_constant, start_constant, {1, 6});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums);
+
+  Array2D<float> expected({{56.0, 168.0, 91.0}});
+  ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
+}
+
+// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
+XLA_TEST_F(DotOperationTest,
+       DISABLED_ON_CPU(DISABLED_ON_GPU(
+           DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstLHSCols)))) {
+  std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
+      {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
+  std::unique_ptr<Array2D<float>> constant_rhs_array(
+      new Array2D<float>({{1.0, 2.0, 3.0, 4.0, 5.0, 6.0},
+                          {7.0, 8.0, 9.0, 9.0, 8.0, 7.0},
+                          {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
+  // Dot result to slice from: {{91, 168, 56}, {56, 168, 91}}
+
+  XlaBuilder builder(TestName());
+  auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array);
+  auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array);
+  auto start_constant = builder.ConstantR1<int32>({1, 0});
+  auto dynamic_slice =
+      builder.DynamicSlice(rhs_constant, start_constant, {1, 6});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums);
+
+  Array2D<float> expected({{168.0}, {168.0}});
+  ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index bfb83faf5222b8ca5ceceebf7f2f976ec803245e..49f3a10d227f2f9edfe76405ba13498fe822f8d8 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -53,9 +53,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   }
 
   template <typename IndexT, typename DataT>
-  void TestR1Wrap() {
-    // Slice at dimension boundaries, but with sizes that cause indices to wrap.
-    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {6}, {4}, {6, 7, 0, 1});
+  void TestR1OOB() {
+    // Slice at dimension boundaries, but with out of bounds indices.
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {6}, {4}, {4, 5, 6, 7});
   }
 
   template <typename IndexT, typename DataT>
@@ -78,10 +78,10 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   }
 
   template <typename IndexT, typename DataT>
-  void TestR2Wrap() {
-    // Slice at dimension boundaries, but with sizes that cause indices to wrap.
+  void TestR2OOB() {
+    // Slice at dimension boundaries, but with out of bounds indices.
     RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {3, 3},
-                         {{5, 6, 4}, {8, 9, 7}, {2, 3, 1}});
+                         {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   }
 
   template <typename IndexT, typename DataT>
@@ -106,11 +106,11 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   }
 
   template <typename IndexT, typename DataT>
-  void TestR3Wrap() {
-    // Slice at dimension boundaries, but with sizes that cause indices to wrap.
+  void TestR3OOB() {
+    // Slice at dimension boundaries, but with out of bounds indices.
     RunR3<IndexT, DataT>(
         {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {0, 2, 1},
-        {2, 1, 2}, {{{6, 5}}, {{12, 11}}});
+        {2, 1, 2}, {{{5, 6}}, {{11, 12}}});
   }
 
   template <typename IndexT, typename DataT>
@@ -199,19 +199,19 @@ class DynamicSliceTest : public ClientLibraryTestBase {
 
 XLA_TEST_F(DynamicSliceTest, Int32R1BF16) { TestR1<int32, bfloat16>(); }
 XLA_TEST_F(DynamicSliceTest, Int32R1) { TestR1<int32, int32>(); }
-XLA_TEST_F(DynamicSliceTest, Int32R1Wrap) { TestR1Wrap<int32, int32>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R1OOB) { TestR1OOB<int32, int32>(); }
 XLA_TEST_F(DynamicSliceTest, Int64R1) { TestR1<int64, float>(); }
 XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1<uint64, float>(); }
 
 XLA_TEST_F(DynamicSliceTest, Int32R2BF16) { TestR2<int32, bfloat16>(); }
 XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2<int32, int32>(); }
-XLA_TEST_F(DynamicSliceTest, Int32R2Wrap) { TestR2Wrap<int32, int32>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R2OOB) { TestR2OOB<int32, int32>(); }
 XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2<int64, float>(); }
 XLA_TEST_F(DynamicSliceTest, UInt64R2) { TestR2<uint64, int32>(); }
 
 XLA_TEST_F(DynamicSliceTest, Int32R3BF16) { TestR3<int32, bfloat16>(); }
 XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3<int32, float>(); }
-XLA_TEST_F(DynamicSliceTest, Int32R3Wrap) { TestR3Wrap<int32, float>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R3OOB) { TestR3OOB<int32, float>(); }
 XLA_TEST_F(DynamicSliceTest, Int64R3) { TestR3<int64, float>(); }
 XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3<uint64, float>(); }
 
@@ -332,17 +332,17 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
   }
 
   template <typename IndexT, typename DataT>
-  void TestWrap() {
-    // Slice at dimension boundaries, but with sizes that cause indices to wrap.
+  void TestOOB() {
+    // // Slice at dimension boundaries, but with out of bounds indices.
     RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {6},
-                         {10, 1, 2, 3, 4, 5, 8, 9});
+                         {0, 1, 2, 3, 4, 8, 9, 10});
     // R2 Shape: [3, 3]
     RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {2, 2},
-                         {{1, 2, 3}, {4, 5, 6}, {11, 8, 10}});
+                         {{1, 2, 3}, {4, 5, 6}, {7, 10, 11}});
     // R3 Shape: [2, 3, 2]
     RunR3<IndexT, DataT>(
         {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {{{13}, {15}}},
-        {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 15}, {9, 10}, {11, 13}}});
+        {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 13}, {11, 15}}});
   }
 
   template <typename IndexT, typename DataT>
@@ -476,20 +476,19 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     Array3D<T> input_values(kSeq, kBatch, kDim);
     Array3D<T> update_values(size, kBatch, kDim);
     Array3D<T> expected_values(kSeq, kBatch, kDim);
+    index = std::min(std::max(0, index), kSeq - size);
 
     input_values.FillIota(static_cast<T>(0));
     T value = static_cast<T>(10);
     update_values.FillIota(static_cast<T>(value));
 
     // TODO(b/34128753) Expected values may vary depending on backend when
-    // the update wraps. According to documentation, the results are technically
-    // implementation specific where the update is out of bounds, and hence
-    // we don't really know what to pass into ComputeAndCompareR3.
+    // the indices are out of bounds.
     expected_values.FillIota(static_cast<T>(0));
     for (int i = 0; i < size; i++) {
       for (int j = 0; j < kBatch; j++) {
         for (int k = 0; k < kDim; k++) {
-          expected_values((index + i) % kSeq, j, k) = value++;
+          expected_values(index + i, j, k) = value++;
         }
       }
     }
@@ -547,12 +546,10 @@ XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3<int32, float>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3<int64, int64>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3<uint64, uint64>(); }
 
-XLA_TEST_F(DynamicUpdateSliceTest, Int32WrapBF16) {
-  TestWrap<int32, bfloat16>();
-}
-XLA_TEST_F(DynamicUpdateSliceTest, Int32Wrap) { TestWrap<int32, float>(); }
-XLA_TEST_F(DynamicUpdateSliceTest, Int64Wrap) { TestWrap<int64, int64>(); }
-XLA_TEST_F(DynamicUpdateSliceTest, UInt64Wrap) { TestWrap<uint64, uint64>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int32OOBBF16) { TestOOB<int32, bfloat16>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int32OOB) { TestOOB<int32, float>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int64OOB) { TestOOB<int64, int64>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, UInt64OOB) { TestOOB<uint64, uint64>(); }
 
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R1Pred) {
   // Slice at dimension start.
@@ -615,37 +612,37 @@ XLA_TEST_F(DynamicUpdateSliceTest, Int32R3Pred) {
 // Tests for simple R3 case where the update is contiguous (i.e. the minor
 // two dimensions are not sliced).
 XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousSingleElement) {
-  // Single element, no wrap.
+  // Single element, index in-bounds
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<float>(operand_shape, /*index=*/1, /*size=*/1);
 }
 
 XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousSingleElementBF16) {
-  // Single element, no wrap.
+  // Single element, index in-bounds
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<bfloat16>(operand_shape, /*index=*/1, /*size=*/1);
 }
 
 XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleElements) {
-  // Multiple element, no wrap.
+  // Multiples element, index in-bounds.
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<float>(operand_shape, /*index=*/1, /*size=*/2);
 }
 
 XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleElementsBF16) {
-  // Multiple element, no wrap.
+  // Multiples element, index in-bounds.
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<bfloat16>(operand_shape, /*index=*/1, /*size=*/2);
 }
 
-XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleWrapping) {
-  // Multiple element, wrapping.
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleOOB) {
+  // Multiple element, index out of bounds.
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<float>(operand_shape, /*index=*/3, /*size=*/2);
 }
 
-XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleWrappingBF16) {
-  // Multiple element, wrapping.
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleOOBBF16) {
+  // Multiple element, index out of bounds.
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<bfloat16>(operand_shape, /*index=*/3, /*size=*/2);
 }
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index b947f8208a5fa3f5a396ebc7a234afbf7ac3d900..e6f79b5ac55dddfbb213a36cadbee53bc9443d9d 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -118,9 +118,9 @@ class FusionTest : public HloTestBase {
     auto expected = Literal::CreateR2FromArray2D(answer_data);
     auto actual = ExecuteAndTransfer(std::move(hlo_module), {});
     if (primitive_util::IsFloatingPointType(prim_type)) {
-      LiteralTestUtil::ExpectNear(*expected, *actual, ErrorSpec(1e-4));
+      EXPECT_TRUE(LiteralTestUtil::Near(*expected, *actual, ErrorSpec(1e-4)));
     } else {
-      LiteralTestUtil::ExpectEqual(*expected, *actual);
+      EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *actual));
     }
   }
 
@@ -221,9 +221,9 @@ XLA_TEST_F(FusionTest, Test) {
            const4, reshape3, add2, const1, const0},
           HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectNear(*Literal::CreateR2<float>({{0.5}, {2.72}}),
-                              *ExecuteAndTransfer(std::move(hlo_module), {}),
-                              ErrorSpec(1e-4));
+  EXPECT_TRUE(LiteralTestUtil::Near(
+      *Literal::CreateR2<float>({{0.5}, {2.72}}),
+      *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4)));
 }
 
 // Test whether we emit appropriate code for parameters of fusion instructions.
@@ -247,9 +247,9 @@ XLA_TEST_F(FusionTest, Parameter) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{add3, const2},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectNear(*Literal::CreateR2<float>({{-1.0, 0.0, 1.0}}),
-                              *ExecuteAndTransfer(std::move(hlo_module), {}),
-                              ErrorSpec(1e-4));
+  EXPECT_TRUE(LiteralTestUtil::Near(
+      *Literal::CreateR2<float>({{-1.0, 0.0, 1.0}}),
+      *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4)));
 }
 
 XLA_TEST_F(FusionTest, RandomizedParallelPartition) {
@@ -307,9 +307,9 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{add2, broadcast},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectNear(
+  EXPECT_TRUE(LiteralTestUtil::Near(
       *Literal::CreateR2<float>({{0.0, 0.0, -1.0}, {11.0, 22.0, 33.0}}),
-      *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4));
+      *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4)));
 }
 
 XLA_TEST_F(FusionTest, ReshapeToScalar) {
@@ -322,8 +322,9 @@ XLA_TEST_F(FusionTest, ReshapeToScalar) {
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR0<int32>(5),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR0<int32>(5),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) {
@@ -336,9 +337,9 @@ XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) {
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(
+  EXPECT_TRUE(LiteralTestUtil::Equal(
       *Literal::CreateR3<int32>({{{1, 2, 3}, {4, 5, 6}}}),
-      *ExecuteAndTransfer(std::move(hlo_module), {}));
+      *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) {
@@ -351,9 +352,9 @@ XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) {
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(
+  EXPECT_TRUE(LiteralTestUtil::Equal(
       *Literal::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}}),
-      *ExecuteAndTransfer(std::move(hlo_module), {}));
+      *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, Reshape_1by1by1_) {
@@ -366,8 +367,9 @@ XLA_TEST_F(FusionTest, Reshape_1by1by1_) {
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR0<int32>(7),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR0<int32>(7),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, Reshape__1by1by1) {
@@ -380,8 +382,9 @@ XLA_TEST_F(FusionTest, Reshape__1by1by1) {
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR3<int32>({{{7}}}),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR3<int32>({{{7}}}),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, Reshape__) {
@@ -394,8 +397,9 @@ XLA_TEST_F(FusionTest, Reshape__) {
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR0<int32>(7),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR0<int32>(7),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, Reshape_3by3_3by3) {
@@ -408,9 +412,9 @@ XLA_TEST_F(FusionTest, Reshape_3by3_3by3) {
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(
+  EXPECT_TRUE(LiteralTestUtil::Equal(
       *Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}),
-      *ExecuteAndTransfer(std::move(hlo_module), {}));
+      *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, Transpose_2by3) {
@@ -423,9 +427,9 @@ XLA_TEST_F(FusionTest, Transpose_2by3) {
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(
+  EXPECT_TRUE(LiteralTestUtil::Equal(
       *Literal::CreateR2<int32>({{1, 4}, {2, 5}, {3, 6}}),
-      *ExecuteAndTransfer(std::move(hlo_module), {}));
+      *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, Transpose_3by3) {
@@ -438,9 +442,9 @@ XLA_TEST_F(FusionTest, Transpose_3by3) {
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(
+  EXPECT_TRUE(LiteralTestUtil::Equal(
       *Literal::CreateR2<int32>({{1, 4, 7}, {2, 5, 8}, {3, 6, 9}}),
-      *ExecuteAndTransfer(std::move(hlo_module), {}));
+      *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, Reverse) {
@@ -454,8 +458,9 @@ XLA_TEST_F(FusionTest, Reverse) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reverse1},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR1<int32>({3, 2, 1}),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR1<int32>({3, 2, 1}),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, ReverseNegate) {
@@ -471,8 +476,9 @@ XLA_TEST_F(FusionTest, ReverseNegate) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, reverse1},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR1<int32>({-3, -2, -1}),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR1<int32>({-3, -2, -1}),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, BroadcastNegate) {
@@ -488,8 +494,9 @@ XLA_TEST_F(FusionTest, BroadcastNegate) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, broadcast1},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR1<int32>({-1, -1}),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR1<int32>({-1, -1}),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, SliceNegate) {
@@ -505,8 +512,9 @@ XLA_TEST_F(FusionTest, SliceNegate) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, slice1},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR1<int32>({-1, -3}),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR1<int32>({-1, -3}),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, DynamicSliceNegate) {
@@ -526,8 +534,9 @@ XLA_TEST_F(FusionTest, DynamicSliceNegate) {
           /*instructions_to_fuse=*/{negate3, dynamic_slice2},
           HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR1<int32>({-2, -3}),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR1<int32>({-2, -3}),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, ReshapeNegate) {
@@ -543,8 +552,9 @@ XLA_TEST_F(FusionTest, ReshapeNegate) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, reshape1},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR2<int32>({{-1, -2}, {-3, -4}}),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR2<int32>({{-1, -2}, {-3, -4}}),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 // TODO(b/64070202): Investigate failure.
@@ -561,8 +571,9 @@ XLA_TEST_F(FusionTest, DISABLED_ON_GPU(TransposeNegate)) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, transpose1},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR2<int32>({{-1, -3}, {-2, -4}}),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR2<int32>({{-1, -3}, {-2, -4}}),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 std::unique_ptr<HloComputation> MakeReduceTestComputation() {
@@ -591,8 +602,9 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reduce2},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR0<int32>(15),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR0<int32>(15),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
@@ -612,8 +624,9 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate3, reduce2},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR0<int32>(-15),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR0<int32>(-15),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) {
@@ -661,9 +674,9 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reduce_window2},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(
+  EXPECT_TRUE(LiteralTestUtil::Equal(
       *Literal::CreateR2<int32>({{462, 2145}, {24871, 62491}}),
-      *ExecuteAndTransfer(std::move(hlo_module), {}));
+      *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 // When a constant (or other op) which has multiple users is imported
@@ -697,8 +710,9 @@ XLA_TEST_F(FusionTest, SharedConstant) {
   // fused instruction contains the constant(2), the parameter, and 4 adds
   EXPECT_EQ(entry_comp->root_instruction()->fused_instruction_count(), 6);
 
-  LiteralTestUtil::ExpectEqual(*Literal::CreateR1<int32>({8}),
-                               *ExecuteAndTransfer(std::move(hlo_module), {}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(*Literal::CreateR1<int32>({8}),
+                             *ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
 XLA_TEST_F(FusionTest, Add2D) { TestElementwise2D<float, 2>(HloOpcode::kAdd); }
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index 130456e61ca8a217e903d2ddecc487f29a098ce1..4854c649c15f2ab89bd3b343abd248be6e227c60 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -629,8 +629,8 @@ XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) {
       client_->ExecuteParallel(computation_instances));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
                           client_->Transfer(*(result_data[0])));
-  LiteralTestUtil::ExpectEqual(
-      *result_literal, *Literal::CreateR2<int32>({{1, 2, 3}, {7, 8, 9}}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *result_literal, *Literal::CreateR2<int32>({{1, 2, 3}, {7, 8, 9}})));
 }
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 12598579c7032e954c4a4875ab8e6475b112f5ae..36e19e6507fa3b6f4a21949583f92716d2f44333 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -94,18 +94,15 @@ HloTestBase::HloTestBase(se::Platform* test_platform,
 
 /* static */
 std::unique_ptr<HloModule> HloTestBase::CreateNewModule(const string& name) {
-  HloModuleConfig config;
-  auto debug_options = HloTestBase::GetDebugOptionsForTest();
-  debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
-  config.set_debug_options(debug_options);
-
-  return MakeUnique<HloModule>(name, VersionedComputationHandle(), config);
+  return MakeUnique<HloModule>(name, VersionedComputationHandle(),
+                               GetModuleConfigForTest());
 }
 
 /*static*/ DebugOptions HloTestBase::GetDebugOptionsForTest() {
   auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
   debug_options.add_xla_disable_hlo_passes("constant_folding");
+  debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
   return debug_options;
 }
 
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 9539ae06801628baedaea69024b7760ebefa6e3a..eb3a2ea76a667a2afa2562f01d28f34384b84a21 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -93,6 +93,13 @@ class HloTestBase : public ::testing::Test {
   // DebugOptions, e.g. when creating a module from a string or a file.
   static DebugOptions GetDebugOptionsForTest();
 
+  // Gets an HloModuleConfig with options appropriate for tests.
+  static HloModuleConfig GetModuleConfigForTest() {
+    HloModuleConfig config;
+    config.set_debug_options(GetDebugOptionsForTest());
+    return config;
+  }
+
   // Executes the given module and return the result as a Literal.
   StatusOr<std::unique_ptr<Literal>> Execute(
       std::unique_ptr<HloModule> module,
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index c28f79ae386670ca80d603a42f6629dfd30e0bc9..cde1dcd9cd10c86107f495a92be42b57bf6a085b 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -15,978 +15,93 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 
-#include <unistd.h>
-#include <cmath>
-#include <vector>
-
-#include "tensorflow/compiler/xla/index_util.h"
-#include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/casts.h"
+#include "tensorflow/compiler/xla/literal_comparison.h"
 #include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
-using ::tensorflow::strings::Appendf;
-using ::tensorflow::strings::Printf;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
-
-/* static */ ::testing::AssertionResult LiteralTestUtil::EqualShapes(
-    const Shape& expected, const Shape& actual) {
-  if (ShapeUtil::IsTuple(expected) != ShapeUtil::IsTuple(actual)) {
-    return ::testing::AssertionFailure()
-           << "tupleness-mismatch! want: " << ShapeUtil::HumanString(expected)
-           << " got: " << ShapeUtil::HumanString(actual);
-  }
-  if (ShapeUtil::IsTuple(expected)) {
-    if (ShapeUtil::TupleElementCount(expected) !=
-        ShapeUtil::TupleElementCount(actual)) {
-      return ::testing::AssertionFailure()
-             << "want tuple element count: "
-             << ShapeUtil::TupleElementCount(expected)
-             << " got tuple element count: "
-             << ShapeUtil::TupleElementCount(actual);
-    }
-    for (int i = 0; i < expected.tuple_shapes_size(); ++i) {
-      ::testing::AssertionResult result =
-          EqualShapes(expected.tuple_shapes(i), actual.tuple_shapes(i))
-          << "mismatch in tuple index " << i;
-      if (!result) {
-        return result;
-      }
-    }
-  } else {
-    if (ShapeUtil::Rank(expected) != ShapeUtil::Rank(actual)) {
-      return ::testing::AssertionFailure()
-             << "want rank of: " << ShapeUtil::HumanString(expected)
-             << " got rank of: " << ShapeUtil::HumanString(actual);
-    }
-    if (expected.element_type() != actual.element_type()) {
-      return ::testing::AssertionFailure()
-             << PrimitiveType_Name(expected.element_type()) << " vs "
-             << PrimitiveType_Name(actual.element_type());
-    }
-    if (expected.dimensions_size() != actual.dimensions_size()) {
-      return ::testing::AssertionFailure()
-             << "want dimensions_size " << expected.dimensions_size()
-             << " got dimensions_size " << actual.dimensions_size();
-    }
-    for (int i = 0; i < expected.dimensions_size(); ++i) {
-      if (expected.dimensions(i) != actual.dimensions(i)) {
-        return ::testing::AssertionFailure()
-               << "mismatch in dimension #" << i
-               << " expected: " << ShapeUtil::HumanString(expected)
-               << " actual: " << ShapeUtil::HumanString(actual);
-      }
-    }
-  }
-  return ::testing::AssertionSuccess();
-}
-
-/* static */ void LiteralTestUtil::AssertEqualShapes(const Shape& expected,
-                                                     const Shape& actual) {
-  ASSERT_TRUE(EqualShapes(expected, actual));
-}
-
-/* static */ void LiteralTestUtil::AssertEqualShapesAndLayouts(
-    const Shape& expected, const Shape& actual) {
-  ASSERT_EQ(expected.ShortDebugString(), actual.ShortDebugString());
-}
-
-namespace {
-
-// Return a literal with all arrays of type FromNativeT converted to type
-// ToNativeT in the given literal.
-template <typename FromNativeT, typename ToNativeT>
-std::unique_ptr<Literal> ConvertType(const Literal& literal) {
-  // First construct shape of the result.
-  Shape result_shape(literal.shape());
-  ShapeUtil::ForEachMutableSubshape(
-      &result_shape, [](Shape* subshape, const ShapeIndex&) {
-        if (subshape->element_type() ==
-            primitive_util::NativeToPrimitiveType<FromNativeT>()) {
-          subshape->set_element_type(
-              primitive_util::NativeToPrimitiveType<ToNativeT>());
-        }
-      });
-  auto result = MakeUnique<Literal>(result_shape);
-
-  // Then copy over the data from 'literal' converting FromNativeT values to
-  // ToNativeT values as necessary.
-  ShapeUtil::ForEachSubshape(
-      literal.shape(),
-      [&](const Shape& subshape, const ShapeIndex& shape_index) {
-        if (ShapeUtil::IsArray(subshape)) {
-          if (subshape.element_type() ==
-              primitive_util::NativeToPrimitiveType<FromNativeT>()) {
-            auto src = literal.data<FromNativeT>(shape_index);
-            auto dest = result->data<ToNativeT>(shape_index);
-            for (int64 i = 0; i < src.size(); ++i) {
-              dest[i] = static_cast<ToNativeT>(src[i]);
-            }
-          } else {
-            TF_CHECK_OK(result->CopyFrom(literal,
-                                         /*dest_shape_index=*/shape_index,
-                                         /*src_shape_index=*/shape_index));
-          }
-        }
-      });
-  return result;
-}
-
-}  // namespace
-
-/* static */ std::unique_ptr<Literal> LiteralTestUtil::ConvertBF16ToF32(
-    const Literal& literal) {
-  return ConvertType<bfloat16, float>(literal);
-}
-
-/* static */ std::unique_ptr<Literal> LiteralTestUtil::ConvertF32ToBF16(
-    const Literal& literal) {
-  return ConvertType<float, bfloat16>(literal);
-}
-
 namespace {
 
-string Hostname() {
-  char hostname[1024];
-  gethostname(hostname, sizeof hostname);
-  hostname[sizeof hostname - 1] = 0;
-  return string(hostname);
-}
-
-// Helper function for comparing a floating point type, FloatT, bitwise equal
-// between the left-hand-side and right-hand-side, by bit-casting to UnsignedT
-// -- on miscompare, a nice error message is given in the AssertionFailure.
-template <typename FloatT, typename UnsignedT>
-::testing::AssertionResult CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
-  auto ulhs = tensorflow::bit_cast<UnsignedT>(lhs);
-  auto urhs = tensorflow::bit_cast<UnsignedT>(rhs);
-  auto lhs_double = static_cast<double>(lhs);
-  auto rhs_double = static_cast<double>(rhs);
-  if (ulhs != urhs) {
-    return ::testing::AssertionFailure() << Printf(
-               "floating values are not bitwise-equal; and equality testing "
-               "was requested: %s=%g=%a vs %s=%g=%a",
-               StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double,
-               lhs_double, StrCat(tensorflow::strings::Hex(urhs)).c_str(),
-               rhs_double, rhs_double);
-  }
-  return ::testing::AssertionSuccess();
-}
-
-// Templated comparator that specializes for float equality comparison with the
-// bitwise helper above (this is the un-specialized fallback, to just use the
-// default gunit implementation).
-template <typename NativeT>
-::testing::AssertionResult CompareEqual(NativeT lhs, NativeT rhs) {
-  if (lhs == rhs) {
+// Writes the given literal to a file in the test temporary directory.
+void WriteLiteralToTempFile(const LiteralSlice& literal, const string& name) {
+  auto get_hostname = [] {
+    char hostname[1024];
+    gethostname(hostname, sizeof hostname);
+    hostname[sizeof hostname - 1] = 0;
+    return string(hostname);
+  };
+  int64 now_usec = tensorflow::Env::Default()->NowMicros();
+  string filename = tensorflow::io::JoinPath(
+      tensorflow::testing::TmpDir(),
+      tensorflow::strings::Printf("tempfile-%s-%llx-%s", get_hostname().c_str(),
+                                  now_usec, name.c_str()));
+  TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(), filename,
+                                           literal.ToProto()));
+  LOG(ERROR) << "wrote to " << name << " file: " << filename;
+}
+
+// Callback helper that dumps literals to temporary files in the event of a
+// miscomparison.
+void OnMiscompare(const LiteralSlice& expected, const LiteralSlice& actual,
+                  const LiteralSlice& mismatches) {
+  LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape()) << " "
+            << literal_comparison::ToStringTruncated(expected);
+  LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual.shape()) << " "
+            << literal_comparison::ToStringTruncated(actual);
+  LOG(INFO) << "Dumping literals to temp files...";
+  WriteLiteralToTempFile(expected, "expected");
+  WriteLiteralToTempFile(actual, "actual");
+  WriteLiteralToTempFile(mismatches, "mismatches");
+}
+
+::testing::AssertionResult StatusToAssertion(const Status& s) {
+  if (s.ok()) {
     return ::testing::AssertionSuccess();
   }
-  ::testing::Message msg;
-  msg << "Expected equality of these values:";
-  msg << "\n  " << lhs;
-  msg << "\n  " << rhs;
-
-  return ::testing::AssertionFailure() << msg;
-}
-
-// Specializations for floating types that do bitwise comparisons when equality
-// comparison is requested.
-template <>
-::testing::AssertionResult CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs) {
-  return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs);
-}
-template <>
-::testing::AssertionResult CompareEqual<Eigen::half>(Eigen::half lhs,
-                                                     Eigen::half rhs) {
-  return CompareFloatsBitwiseEqual<Eigen::half, uint16>(lhs, rhs);
-}
-template <>
-::testing::AssertionResult CompareEqual<float>(float lhs, float rhs) {
-  return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs);
-}
-template <>
-::testing::AssertionResult CompareEqual<double>(double lhs, double rhs) {
-  return CompareFloatsBitwiseEqual<double, uint64>(lhs, rhs);
-}
-template <>
-::testing::AssertionResult CompareEqual<complex64>(complex64 lhs,
-                                                   complex64 rhs) {
-  auto res = CompareEqual<float>(lhs.real(), rhs.real());
-  if (!res) {
-    return res;
-  }
-  return CompareEqual<float>(lhs.imag(), rhs.imag());
-}
-
-// A recursive function which iterates through every index of expected and
-// actual literal and compares their values elementwise. Returns true if all
-// elements are equal.
-template <typename NativeT>
-bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
-                         tensorflow::gtl::MutableArraySlice<int64> multi_index,
-                         int64 dimension) {
-  if (dimension == expected.shape().dimensions_size()) {
-    NativeT expected_value = expected.Get<NativeT>(multi_index);
-    NativeT actual_value = actual.Get<NativeT>(multi_index);
-    ::testing::AssertionResult result =
-        CompareEqual<NativeT>(expected_value, actual_value);
-    return result;  // Defines implicit coersion to bool.
-  }
-
-  bool all_match = true;
-  for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) {
-    multi_index[dimension] = i;
-    all_match = all_match && ExpectLiteralsEqual<NativeT>(
-                                 expected, actual, multi_index, dimension + 1);
-  }
-  return all_match;
+  return ::testing::AssertionFailure() << s.error_message();
 }
 
 }  // namespace
 
-/* static */ void LiteralTestUtil::ExpectEqual(const Literal& expected,
-                                               const Literal& actual,
-                                               const string& message) {
-  EXPECT_TRUE(Equal(expected, actual))
-      << "expected:\n"
-      << expected.ToString() << "\n\tvs actual:\n"
-      << actual.ToString()
-      << (message.empty() ? "" : StrCat("\nmessage: ", message));
-}
-
-/* static */ void LiteralTestUtil::ExpectNotEqual(const Literal& expected,
-                                                  const Literal& actual) {
-  EXPECT_FALSE(Equal(expected, actual));
-}
-
-/* static */ ::testing::AssertionResult LiteralTestUtil::Equal(
-    const Literal& expected, const Literal& actual) {
-  VLOG(1) << "expected:";
-  XLA_VLOG_LINES(1, expected.ToString());
-  VLOG(1) << "actual:";
-  XLA_VLOG_LINES(1, actual.ToString());
-
-  AssertEqualShapes(expected.shape(), actual.shape());
-  std::vector<int64> multi_index(expected.shape().dimensions_size(), 0);
-  bool match = false;
-  switch (expected.shape().element_type()) {
-    case PRED:
-      match = ExpectLiteralsEqual<bool>(expected, actual, &multi_index, 0);
-      break;
-    case U8:
-      match = ExpectLiteralsEqual<uint8>(expected, actual, &multi_index, 0);
-      break;
-    case S32:
-      match = ExpectLiteralsEqual<int32>(expected, actual, &multi_index, 0);
-      break;
-    case S64:
-      match = ExpectLiteralsEqual<int64>(expected, actual, &multi_index, 0);
-      break;
-    case U32:
-      match = ExpectLiteralsEqual<uint32>(expected, actual, &multi_index, 0);
-      break;
-    case U64:
-      match = ExpectLiteralsEqual<uint64>(expected, actual, &multi_index, 0);
-      break;
-    case BF16:
-      match = ExpectLiteralsEqual<bfloat16>(expected, actual, &multi_index, 0);
-      break;
-    case F16:
-      match = ExpectLiteralsEqual<half>(expected, actual, &multi_index, 0);
-      break;
-    case F32:
-      match = ExpectLiteralsEqual<float>(expected, actual, &multi_index, 0);
-      break;
-    case F64:
-      match = ExpectLiteralsEqual<double>(expected, actual, &multi_index, 0);
-      break;
-    case C64:
-      match = ExpectLiteralsEqual<complex64>(expected, actual, &multi_index, 0);
-      break;
-    case TUPLE: {
-      bool tuple_match = true;
-      for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
-        SCOPED_TRACE(StrCat("Tuple index ", i, " in ",
-                            ShapeUtil::HumanString(expected.shape())));
-
-        // Create LiteralViews of the expected and actual elements.
-        auto result = Equal(LiteralView::Create(expected, {i}),
-                            LiteralView::Create(actual, {i}));
-        tuple_match = tuple_match ? !!result : false;
-      }
-      match = tuple_match;
-      break;
-    }
-    default:
-      LOG(FATAL)
-          << "Unsupported primitive type in LiteralTestUtil::ExpectEqual: "
-          << PrimitiveType_Name(expected.shape().element_type());
-  }
-  ::testing::AssertionResult result = ::testing::AssertionSuccess();
-  if (!match) {
-    result = ::testing::AssertionFailure()
-             << "expected: " << expected.ToString()
-             << "\nactual:   " << actual.ToString();
-    VLOG(1) << result.message();
-  }
-  return result;
-}
-
-namespace {
-
-// Gets the total element count.  For tuples, this is not the count of tuple
-// elements, but the sum of elements of each tuple element.
-int64 RecursiveElementCount(const Shape& shape) {
-  if (ShapeUtil::IsTuple(shape)) {
-    const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
-    int64 total = 0;
-    for (int64 i = 0; i < tuple_elements; ++i) {
-      total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
-    }
-    return total;
-  } else {
-    return ShapeUtil::ElementsIn(shape);
-  }
-}
-
-// Calling ToString on a literal with over 100 million elements takes around
-// 3 minutes.  The utility of printing a literal with >1000 elements is
-// questionable, especially when writing the Literal proto to disk is orders
-// of magnitude faster.
-string TruncateHugeLiteral(const Literal& literal) {
-  return RecursiveElementCount(literal.shape()) < 1000
-             ? literal.ToString()
-             : "[TRUNCATED, Literal with more than 1000 values]";
+/* static */ ::testing::AssertionResult LiteralTestUtil::EqualShapes(
+    const Shape& expected, const Shape& actual) {
+  return StatusToAssertion(literal_comparison::EqualShapes(expected, actual));
 }
 
-// Returns whether the actual and expected values are mismatched with respect to
-// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec.
-template <typename NativeT>
-bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
-  if (relaxed_nans) {
-    return !std::isnan(expected) && std::isnan(actual);
-  } else {
-    return std::isnan(expected) != std::isnan(actual);
+/* static */ ::testing::AssertionResult LiteralTestUtil::EqualShapesAndLayouts(
+    const Shape& expected, const Shape& actual) {
+  if (expected.ShortDebugString() != actual.ShortDebugString()) {
+    return ::testing::AssertionFailure()
+           << "want: " << expected.ShortDebugString()
+           << " got: " << actual.ShortDebugString();
   }
+  return ::testing::AssertionSuccess();
 }
 
-template <>
-bool NanMismatch<complex64>(complex64 expected, complex64 actual,
-                            bool relaxed_nans) {
-  return NanMismatch<float>(expected.real(), actual.real(), relaxed_nans) ||
-         NanMismatch<float>(expected.imag(), actual.imag(), relaxed_nans);
-}
-
-template <>
-bool NanMismatch<half>(half expected, half actual, bool relaxed_nans) {
-  return NanMismatch<float>(static_cast<float>(expected),
-                            static_cast<float>(actual), relaxed_nans);
-}
-
-// Converts the given floating-point value to a string.
-template <typename NativeT>
-string FpValueToString(NativeT value) {
-  return Printf("%8.4g", static_cast<double>(value));
-}
-
-template <>
-string FpValueToString<complex64>(complex64 value) {
-  return Printf("%8.4g + %8.4fi", value.real(), value.imag());
-}
-
-// Returns the absolute value of the given floating point value. This function
-// is used instead of std::abs directly in order to allow type-dependent
-// implementations for NearComparator.
-template <typename NativeT>
-float FpAbsoluteValue(NativeT value) {
-  return std::abs(value);
-}
-
-template <>
-float FpAbsoluteValue(bfloat16 value) {
-  return FpAbsoluteValue<float>(static_cast<float>(value));
-}
-
-template <>
-float FpAbsoluteValue(half value) {
-  return FpAbsoluteValue<float>(static_cast<float>(value));
-}
-
-// Helper class for comparing floating-point literals within an error bound.
-template <typename NativeT>
-class NearComparator {
- public:
-  // Compares the two array literals elementwise and returns an assertion
-  // result. The assertion result is successful if all actual and expected
-  // elements are within the given error bound. In case of error, the assertion
-  // result contains a detailed error message in case of failure.
-  static ::testing::AssertionResult Compare(const Literal& expected,
-                                            const Literal& actual,
-                                            ErrorSpec error,
-                                            bool detailed_message) {
-    NearComparator<NativeT> comparator(expected, actual, error,
-                                       detailed_message);
-    return comparator.Run();
-  }
-
- private:
-  // Data structure encapsulating metadata about a single element mismatch.
-  struct Mismatch {
-    NativeT actual;
-    NativeT expected;
-    float rel_error;
-    float abs_error;
-
-    // The linear index of the failure within the shape. This linear index is
-    // from the 'actual' literal.
-    int64 linear_index;
-
-    bool operator<(const Mismatch& other) const {
-      return rel_error < other.rel_error;
-    }
-
-    string ToString(const Shape& shape) const {
-      return Printf(
-          "actual %s, expected %s, index %s, rel error %8.3g, abs error %8.3g",
-          FpValueToString(actual).c_str(), FpValueToString(expected).c_str(),
-          LiteralTestUtil::MultiIndexAsString(
-              IndexUtil::LinearIndexToMultidimensionalIndex(shape,
-                                                            linear_index))
-              .c_str(),
-          rel_error, abs_error);
-    }
-  };
-
-  explicit NearComparator(const Literal& expected, const Literal& actual,
-                          ErrorSpec error, bool detailed_message)
-      : expected_(expected),
-        actual_(actual),
-        error_(error),
-        detailed_message_(detailed_message),
-        abs_value_buckets_(kAbsValueBucketBounds.size() - 1, {0, 0}),
-        abs_error_buckets_(kErrorBucketBounds.size(), 0),
-        rel_error_buckets_(kErrorBucketBounds.size(), 0) {}
-
-  // Runs the comparison between expected and actual literals.
-  ::testing::AssertionResult Run() {
-    VLOG(1) << "expected:";
-    XLA_VLOG_LINES(1, TruncateHugeLiteral(expected_));
-    VLOG(1) << "actual:";
-    XLA_VLOG_LINES(1, TruncateHugeLiteral(actual_));
-
-    // If the shapes mismatch, we simply fail the expectation instead of
-    // printing out data, as it's a type error rather than a value error.
-    ::testing::AssertionResult equal_shapes =
-        LiteralTestUtil::EqualShapes(expected_.shape(), actual_.shape());
-    if (!equal_shapes) {
-      return equal_shapes;
-    }
-    if (!ShapeUtil::IsArray(expected_.shape())) {
-      return ::testing::AssertionFailure() << "Expected array shape";
-    }
-
-    mismatches_ = Literal(ShapeUtil::ChangeElementType(actual_.shape(), PRED));
-    mismatches_.PopulateWithValue(false);
-
-    CompareLiterals();
-
-    if (num_mismatches_ == 0) {
-      return ::testing::AssertionSuccess();
-    } else if (!VLOG_IS_ON(1)) {
-      LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected_.shape())
-                << " " << TruncateHugeLiteral(expected_);
-      LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual_.shape())
-                << " " << TruncateHugeLiteral(actual_);
-      LOG(INFO) << "Dumping literals to temp files...";
-      WriteLiteralToTempFile(expected_, "expected");
-      WriteLiteralToTempFile(actual_, "actual");
-      WriteLiteralToTempFile(mismatches_, "mismatches");
-    }
-    return ::testing::AssertionFailure() << ErrorMessage();
-  }
-
-  // Insert the given absolute value into the absolute value bucket vector. The
-  // bounds of the buckets are given by kAbsValueBucketBounds.
-  void UpdateAbsValueBucket(NativeT value, bool is_mismatch) {
-    // Adjust the bucket containing the absolute values of the 'actual'
-    // elements.
-    const float abs_value = FpAbsoluteValue(value);
-    for (int i = 0; i < abs_value_buckets_.size(); ++i) {
-      if (i == abs_value_buckets_.size() - 1 ||
-          (abs_value >= kAbsValueBucketBounds[i] &&
-           abs_value < kAbsValueBucketBounds[i + 1])) {
-        // The first value of the pair is the count of elements in the bucket,
-        // the second is the count of mismatches in the bucket.
-        abs_value_buckets_[i].first++;
-        if (is_mismatch) {
-          abs_value_buckets_[i].second++;
-        }
-        return;
-      }
-    }
-  }
-
-  // Insert the given error into the given error bucket vector.
-  void UpdateErrorBucket(
-      float error, tensorflow::gtl::MutableArraySlice<int64> error_buckets) {
-    CHECK_EQ(error_buckets.size(), kErrorBucketBounds.size());
-    for (int i = 0; i < error_buckets.size(); ++i) {
-      if (error >= kErrorBucketBounds[i]) {
-        error_buckets[i]++;
-      }
-    }
-  }
-
-  // Compares the two given elements from the expected and actual literals at
-  // the given literal_index and keeps track of various mismatch statistics.
-  void CompareValues(NativeT expected, NativeT actual, int64 linear_index) {
-    const bool is_nan_mismatch =
-        NanMismatch(expected, actual, error_.relaxed_nans);
-    float abs_error;
-    float rel_error;
-    if (actual == expected) {
-      abs_error = 0;
-      rel_error = 0;
-    } else if (is_nan_mismatch) {
-      num_nan_mismatches_++;
-      // A nan mismatch is considered to have infinite error. rel_error is used
-      // for sorting a std::set of the top mismatchs, and a nan value here will
-      // result in undefined behavior because nan's do not satisfy the strict
-      // weak ordering requirement of std containers.
-      abs_error = std::numeric_limits<float>::infinity();
-      rel_error = std::numeric_limits<float>::infinity();
-    } else {
-      abs_error = FpAbsoluteValue(actual - expected);
-      rel_error = abs_error / FpAbsoluteValue(expected);
-    }
-    const bool is_abs_mismatch = abs_error > error_.abs;
-    const bool is_rel_mismatch = rel_error > error_.rel;
-    const bool is_mismatch =
-        is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch);
-
-    // Update the error of the relative bucket only if the *absolute* error
-    // bound is exceeded and vice versa.
-    if (is_abs_mismatch) {
-      num_abs_mismatches_++;
-      UpdateErrorBucket(rel_error, &rel_error_buckets_);
-    }
-    if (is_rel_mismatch) {
-      num_rel_mismatches_++;
-      UpdateErrorBucket(abs_error, &abs_error_buckets_);
-    }
-
-    UpdateAbsValueBucket(actual, is_mismatch);
-
-    if (!is_mismatch) {
-      return;
-    }
-
-    num_mismatches_++;
-
-    // Keep track of the kTopRelativeErrorCount relative error mismatches.
-    if (top_rel_mismatches_.size() < kTopRelativeErrorCount ||
-        rel_error > top_rel_mismatches_.begin()->rel_error) {
-      Mismatch mismatch = {actual, expected, rel_error, abs_error,
-                           linear_index};
-      top_rel_mismatches_.insert(mismatch);
-      if (top_rel_mismatches_.size() > kTopRelativeErrorCount) {
-        top_rel_mismatches_.erase(top_rel_mismatches_.begin());
-      }
-    }
-
-    mismatches_.data<bool>()[linear_index] = true;
-  }
-
-  // Compares the two literals elementwise.
-  void CompareLiterals() {
-    // Fast path optimization for the case were layouts match.
-    if (LayoutUtil::Equal(actual_.shape().layout(),
-                          expected_.shape().layout())) {
-      tensorflow::gtl::ArraySlice<const NativeT> expected_data =
-          expected_.data<NativeT>();
-      tensorflow::gtl::ArraySlice<const NativeT> actual_data =
-          actual_.data<NativeT>();
-      const int64 len = expected_data.size();
-      for (int64 i = 0; i < len; ++i) {
-        CompareValues(expected_data[i], actual_data[i], i);
-      }
-      return;
-    }
-    std::vector<int64> multi_index(ShapeUtil::Rank(actual_.shape()), 0);
-    CompareLiteralsSlow(0, &multi_index);
-  }
-
-  // Slow path for CompareLiterals when 'actual' and 'expected' literals have
-  // different layouts. In this case, multidimensional indices are constructed
-  // and indexed for each element.
-  void CompareLiteralsSlow(int64 dimension, std::vector<int64>* multi_index) {
-    if (dimension == multi_index->size()) {
-      CompareValues(expected_.Get<NativeT>(*multi_index),
-                    actual_.Get<NativeT>(*multi_index),
-                    IndexUtil::MultidimensionalIndexToLinearIndex(
-                        actual_.shape(), *multi_index));
-    } else {
-      for (int64 i = 0; i < expected_.shape().dimensions(dimension); ++i) {
-        (*multi_index)[dimension] = i;
-        CompareLiteralsSlow(dimension + 1, multi_index);
-      }
-    }
-  }
-
-  // Writes the given literal to a file in the test temporary directory.
-  void WriteLiteralToTempFile(const Literal& literal, const string& name) {
-    int64 now_usec = tensorflow::Env::Default()->NowMicros();
-    string filename = tensorflow::io::JoinPath(
-        tensorflow::testing::TmpDir(),
-        Printf("tempfile-%s-%llx-%s", Hostname().c_str(), now_usec,
-               name.c_str()));
-    TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(),
-                                             filename, literal.ToProto()));
-    LOG(ERROR) << "wrote to " << name << " file: " << filename;
-  }
-
-  // Returns an error message string with a detailed breakdown of the
-  // mismatches. Called after calling Run().
-  string ErrorMessage() {
-    string out;
-    int64 element_count = ShapeUtil::ElementsIn(actual_.shape());
-
-    auto percent_string = [](float a, float b) {
-      float pct = b == 0.0 ? 0.0 : 100.0 * a / b;
-      return Printf("%0.4f%%", pct);
-    };
-
-    Appendf(&out,
-            "\nMismatch count %lld (%s) in shape %s (%lld elements), abs bound "
-            "%g, rel bound %g\n",
-            num_mismatches_,
-            percent_string(num_mismatches_, element_count).c_str(),
-            ShapeUtil::HumanString(actual_.shape()).c_str(),
-            ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel);
-    if (num_nan_mismatches_ > 0) {
-      StrAppend(&out, "nan mismatches ", num_nan_mismatches_, "\n");
-    }
-    Appendf(&out, "Top relative error mismatches:\n");
-    for (auto it = top_rel_mismatches_.rbegin();
-         it != top_rel_mismatches_.rend(); ++it) {
-      StrAppend(&out, "  ", it->ToString(actual_.shape()).c_str(), "\n");
-    }
-
-    if (!detailed_message_) {
-      return out;
-    }
-
-    StrAppend(&out, "Absolute magnitude breakdown of actual values:\n");
-    CHECK_EQ(abs_value_buckets_.size() + 1, kAbsValueBucketBounds.size());
-    for (int i = 0; i < abs_value_buckets_.size(); ++i) {
-      const int64 bucket_size = abs_value_buckets_[i].first;
-      const int64 bucket_mismatches = abs_value_buckets_[i].second;
-      string mismatch_str = bucket_mismatches > 0
-                                ? Printf(", mismatches %lld", bucket_mismatches)
-                                : "";
-      Appendf(&out, "  %-6g <= x < %-6g : %7lld (%9s)%s\n",
-              kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1],
-              bucket_size, percent_string(bucket_size, element_count).c_str(),
-              mismatch_str.c_str());
-    }
-
-    auto print_accum_buckets = [&](const string& header, int64 total,
-                                   tensorflow::gtl::ArraySlice<int64> buckets) {
-      StrAppend(&out, header, ":\n");
-      Appendf(&out, "  <  %-6g : %7lld (%s)\n", kErrorBucketBounds[0],
-              total - buckets[0],
-              percent_string(total - buckets[0], total).c_str());
-      CHECK_EQ(buckets.size(), kErrorBucketBounds.size());
-      for (int i = 0; i < kErrorBucketBounds.size(); ++i) {
-        Appendf(&out, "  >= %-6g : %7lld (%s)\n", kErrorBucketBounds[i],
-                buckets[i], percent_string(buckets[i], total).c_str());
-      }
-    };
-    Appendf(&out, "Elements exceeding abs error bound %g: %lld (%s)\n",
-            error_.abs, num_abs_mismatches_,
-            percent_string(num_abs_mismatches_, element_count).c_str());
-    print_accum_buckets(
-        "Relative error breakdown of elements exceeding abs error bound",
-        num_abs_mismatches_, rel_error_buckets_);
-    Appendf(&out, "Elements exceeding rel error bound %g: %lld (%s)\n",
-            error_.rel, num_rel_mismatches_,
-            percent_string(num_rel_mismatches_, element_count).c_str());
-    print_accum_buckets(
-        "Absolute error breakdown of elements exceeding rel error bound",
-        num_rel_mismatches_, abs_error_buckets_);
-    return out;
-  }
-
-  // 'actual' and 'expected' literals being compared.
-  const Literal& expected_;
-  const Literal& actual_;
-
-  // The error bounds of the comparison.
-  ErrorSpec error_;
-
-  // Whether to include detailed breakdown of mismatches in the error message.
-  bool detailed_message_;
-
-  // Number of element element mismatches encountered so far.
-  int64 num_mismatches_ = 0;
-
-  // Number of elements with a nan mismatch.
-  int64 num_nan_mismatches_ = 0;
-
-  // Number of elements which exceed the absolute/relative error bound.
-  int64 num_abs_mismatches_ = 0;
-  int64 num_rel_mismatches_ = 0;
-
-  // A Literal containing which elements did not match in the expected and
-  // actual literals. mismatches_ contains PREDs and is of the same sizes as
-  // the comparison literals.
-  Literal mismatches_;
-
-  // The number of mismatches to report in the output, sorted by relative error
-  // magnitude.
-  static constexpr int64 kTopRelativeErrorCount = 5;
-
-  // The set of mismatches with the largest relative error. The size of this set
-  // is bounded by kTopRelativeErrorCount.
-  std::multiset<Mismatch> top_rel_mismatches_;
-
-  // Actual values are bucketed by absolute value. kAbsValueBucketBounds is the
-  // bounds of these buckets. abs_value_buckets_ contains a pair for each
-  // bucket: the element count and failure count.
-  static constexpr std::array<float, 7> kAbsValueBucketBounds = {
-      0.0, 0.0001, 0.001, 0.01, 0.1, 1, std::numeric_limits<float>::infinity()};
-  std::vector<std::pair<int64, int64>> abs_value_buckets_;
-
-  // Buckets for relative and absolute errors. The relative error buckets only
-  // contains those elements which exceed the *absolute* error bound, and vice
-  // versa. This makes it easy to see the effect of adjusting the relative (or
-  // absolute) error bound on the success of the comparison. kErrorBucketBounds
-  // are the lower bounds of the buckets in both vectors. The error buckets are
-  // a cumulative distribution so an error value may appear in more than one
-  // bucket. For example an error value of 0.003 may appear in the buckets
-  // bounded by 0.01, 0.1, and 1.0.
-  static constexpr std::array<float, 5> kErrorBucketBounds = {0.0001, 0.001,
-                                                              0.01, 0.1, 1};
-  std::vector<int64> abs_error_buckets_;
-  std::vector<int64> rel_error_buckets_;
-};
-
-template <typename NativeT>
-constexpr std::array<float, 7> NearComparator<NativeT>::kAbsValueBucketBounds;
-template <typename NativeT>
-constexpr std::array<float, 5> NearComparator<NativeT>::kErrorBucketBounds;
-
-// Helper function for comparing two literals for nearness. Handles tuple-shapes
-// via recursion. shape_index is the ShapeIndex of expected (or actual)
-// currently being compared.
-::testing::AssertionResult NearHelper(const Literal& expected,
-                                      const Literal& actual,
-                                      const ErrorSpec& error,
-                                      bool detailed_message,
-                                      const ShapeIndex& shape_index) {
-  ::testing::AssertionResult err =
-      LiteralTestUtil::EqualShapes(expected.shape(), actual.shape());
-  if (!err) {
-    return err;
-  }
-
-  if (ShapeUtil::IsTuple(expected.shape())) {
-    for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
-      const auto expected_element = LiteralView::Create(expected, {i});
-      const auto actual_element = LiteralView::Create(actual, {i});
-      ShapeIndex element_index = shape_index;
-      element_index.push_back(i);
-      ::testing::AssertionResult res =
-          NearHelper(expected_element, actual_element, error, detailed_message,
-                     element_index);
-      if (!res) {
-        string err_message =
-            Printf("\nArray at shape index %s%s",
-                   element_index.ToString().c_str(), res.message());
-        if (err) {
-          err = ::testing::AssertionFailure() << err_message;
-        } else {
-          err << err_message;
-        }
-      }
-    }
-    if (!err && shape_index.empty()) {
-      // Emit a top-level error message containing the top-level shape in case
-      // of mismatch.
-      int64 total_elements = RecursiveElementCount(actual.shape());
-      err = ::testing::AssertionFailure()
-            << Printf("\nMismatches in shape %s (%lld elements):\n%s",
-                      ShapeUtil::HumanString(actual.shape()).c_str(),
-                      total_elements, err.message());
-    }
-    return err;
-  }
-
-  if (ShapeUtil::ElementIsFloating(expected.shape()) ||
-      ShapeUtil::ElementIsComplex(expected.shape())) {
-    switch (expected.shape().element_type()) {
-      case BF16:
-        return NearComparator<bfloat16>::Compare(expected, actual, error,
-                                                 detailed_message);
-        break;
-      case F16:
-        return NearComparator<half>::Compare(expected, actual, error,
-                                             detailed_message);
-        break;
-      case F32:
-        return NearComparator<float>::Compare(expected, actual, error,
-                                              detailed_message);
-        break;
-      case F64:
-        return NearComparator<double>::Compare(expected, actual, error,
-                                               detailed_message);
-        break;
-      case C64:
-        return NearComparator<complex64>::Compare(expected, actual, error,
-                                                  detailed_message);
-        break;
-      default:
-        LOG(FATAL) << "Unsupported primitive type in near comparator: "
-                   << PrimitiveType_Name(expected.shape().element_type())
-                   << ". Must be floating-point type.";
-    }
-  }
-
-  // Non-floating point literal.
-  return LiteralTestUtil::Equal(expected, actual);
+/* static */ ::testing::AssertionResult LiteralTestUtil::Equal(
+    const LiteralSlice& expected, const LiteralSlice& actual) {
+  return StatusToAssertion(literal_comparison::Equal(expected, actual));
 }
 
-}  // namespace
-
 /* static */ ::testing::AssertionResult LiteralTestUtil::Near(
-    const Literal& expected, const Literal& actual, const ErrorSpec& error,
-    bool detailed_message) {
-  return NearHelper(expected, actual, error, detailed_message,
-                    /*shape_index=*/{});
-}
-
-/* static */ void LiteralTestUtil::ExpectNear(const Literal& expected,
-                                              const Literal& actual,
-                                              const ErrorSpec& error,
-                                              const string& message) {
-  ::testing::AssertionResult res =
-      Near(expected, actual, error, /*detailed_message=*/false);
-  if (!res) {
-    res << "Expected: " << TruncateHugeLiteral(expected) << "\n";
-    res << "Actual: " << TruncateHugeLiteral(actual) << "\n";
-    if (!message.empty()) {
-      res << StrCat("\nmessage: ", message);
-    }
-  }
-  EXPECT_TRUE(res);
+    const LiteralSlice& expected, const LiteralSlice& actual,
+    const ErrorSpec& error_spec, bool detailed_message) {
+  return StatusToAssertion(literal_comparison::Near(
+      expected, actual, error_spec, detailed_message, &OnMiscompare));
 }
 
-/*static*/ ::testing::AssertionResult LiteralTestUtil::NearOrEqual(
-    const Literal& expected, const Literal& actual,
+/* static */ ::testing::AssertionResult LiteralTestUtil::NearOrEqual(
+    const LiteralSlice& expected, const LiteralSlice& actual,
     const tensorflow::gtl::optional<ErrorSpec>& error) {
   if (error.has_value()) {
     VLOG(1) << "Expects near";
-    return Near(expected, actual, *error);
+    return StatusToAssertion(literal_comparison::Near(
+        expected, actual, *error, /*detailed_message=*/false, &OnMiscompare));
   }
   VLOG(1) << "Expects equal";
-  return Equal(expected, actual);
-}
-
-/*static*/ void LiteralTestUtil::ExpectNearOrEqual(
-    const Literal& expected, const Literal& actual,
-    const tensorflow::gtl::optional<ErrorSpec>& error) {
-  EXPECT_TRUE(NearOrEqual(expected, actual, error));
-}
-
-/* static */ string LiteralTestUtil::MultiIndexAsString(
-    tensorflow::gtl::ArraySlice<int64> multi_index) {
-  return StrCat("{", tensorflow::str_util::Join(multi_index, ","), "}");
-}
-
-/* static */ std::unique_ptr<Literal> LiteralTestUtil::Reshape(
-    tensorflow::gtl::ArraySlice<int64> new_dimensions,
-    tensorflow::gtl::ArraySlice<int64> minor_to_major, const Literal& literal) {
-  int64 new_num_elements = 1;
-  for (int64 i = 0; i < new_dimensions.size(); ++i) {
-    new_num_elements *= new_dimensions[i];
-  }
-  CHECK_EQ(ShapeUtil::ElementsIn(literal.shape()), new_num_elements);
-  CHECK_EQ(new_dimensions.size(), minor_to_major.size());
-
-  auto new_literal = MakeUnique<Literal>(
-      ShapeUtil::MakeShape(literal.shape().element_type(), new_dimensions));
-
-  // Create a new shape with the given minor-to-major layout. This shape is used
-  // solely for converting linear address to multi-dimensional addresses when
-  // writing elements to the new literal.
-  Shape shape_with_layout = new_literal->shape();
-  *shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout(minor_to_major);
-
-  // Copy data into new literal, element-by-element.
-  for (int64 i = 0; i < ShapeUtil::ElementsIn(literal.shape()); ++i) {
-    std::vector<int64> from_multi_index =
-        IndexUtil::LinearIndexToMultidimensionalIndex(literal.shape(), i);
-    std::vector<int64> to_multi_index =
-        IndexUtil::LinearIndexToMultidimensionalIndex(shape_with_layout, i);
-    switch (literal.shape().element_type()) {
-      case PRED:
-        new_literal->Set<bool>(to_multi_index,
-                               literal.Get<bool>(from_multi_index));
-        break;
-      case U8:
-        new_literal->Set<uint8>(to_multi_index,
-                                literal.Get<uint8>(from_multi_index));
-        break;
-      case U32:
-        new_literal->Set<uint32>(to_multi_index,
-                                 literal.Get<uint32>(from_multi_index));
-        break;
-      case S32:
-        new_literal->Set<int32>(to_multi_index,
-                                literal.Get<int32>(from_multi_index));
-        break;
-      case U64:
-        new_literal->Set<uint64>(to_multi_index,
-                                 literal.Get<uint64>(from_multi_index));
-        break;
-      case S64:
-        new_literal->Set<int64>(to_multi_index,
-                                literal.Get<int64>(from_multi_index));
-        break;
-      case F32:
-        new_literal->Set<float>(to_multi_index,
-                                literal.Get<float>(from_multi_index));
-        break;
-      case F64:
-        new_literal->Set<double>(to_multi_index,
-                                 literal.Get<double>(from_multi_index));
-        break;
-      case C64:
-        new_literal->Set<complex64>(to_multi_index,
-                                    literal.Get<complex64>(from_multi_index));
-        break;
-      default:
-        LOG(FATAL) << "Unhandled primitive element type: "
-                   << PrimitiveType_Name(literal.shape().element_type());
-    }
-  }
-
-  return new_literal;
+  return StatusToAssertion(literal_comparison::Equal(expected, actual));
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index a755568c0f098e15512bd1d3720269c867bc9c49..d1b8a6cf0b2552f1b7d95a2560d502da14ddc39a 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/compiler/xla/error_spec.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -38,282 +39,190 @@ limitations under the License.
 
 namespace xla {
 
-// Structure describing permissible absolute and relative error bounds.
-struct ErrorSpec {
-  explicit ErrorSpec(float aabs, float arel = 0, bool relaxed_nans = false)
-      : abs(aabs), rel(arel), relaxed_nans(relaxed_nans) {}
-
-  float abs;  // Absolute error bound.
-  float rel;  // Relative error bound.
-
-  // If relaxed_nans is true then any result is valid if we are expecting NaNs.
-  // In effect, this allows the tested operation to produce incorrect results
-  // for inputs outside its mathematical domain.
-  bool relaxed_nans;
-};
-
 // Utility class for making expectations/assertions related to XLA literals.
 class LiteralTestUtil {
  public:
   // Asserts that the given shapes have the same rank, dimension sizes, and
   // primitive types.
-  static ::testing::AssertionResult EqualShapes(const Shape& expected,
-                                                const Shape& actual);
-  static void AssertEqualShapes(const Shape& expected, const Shape& actual);
+  static ::testing::AssertionResult EqualShapes(
+      const Shape& expected, const Shape& actual) TF_MUST_USE_RESULT;
 
   // Asserts that the provided shapes are equal as defined in AssertEqualShapes
   // and that they have the same layout.
-  static void AssertEqualShapesAndLayouts(const Shape& expected,
-                                          const Shape& actual);
-
-  // If the given literal's data type is bfloat16, converts it to a float
-  // literal; otherwise, returns a copy of it. If the literal is a tuple,
-  // recursively converts its elements.
-  static std::unique_ptr<Literal> ConvertBF16ToF32(const Literal& bf16_literal);
-
-  // If the given literal's data type is float, converts it to a bfloat16
-  // literal; otherwise, returns a copy of it. If the literal is a tuple,
-  // recursively converts its elements.
-  static std::unique_ptr<Literal> ConvertF32ToBF16(const Literal& f32_literal);
-
-  // Asserts that the expected and actual literals are (bitwise) equal for all
-  // elements in the literal. Also, asserts that the rank, dimensions sizes, and
-  // primitive type are equal.
-  static ::testing::AssertionResult Equal(
-      const Literal& expected, const Literal& actual) TF_MUST_USE_RESULT;
+  static ::testing::AssertionResult EqualShapesAndLayouts(
+      const Shape& expected, const Shape& actual) TF_MUST_USE_RESULT;
 
-  // Expects that expected and actual are Equal.
-  static void ExpectEqual(const Literal& expected, const Literal& actual,
-                          const string& message = "");
-
-  // Expects that expected and actual are Not Equal.
-  static void ExpectNotEqual(const Literal& expected, const Literal& actual);
+  static ::testing::AssertionResult Equal(const LiteralSlice& expected,
+                                          const LiteralSlice& actual)
+      TF_MUST_USE_RESULT;
 
   // Asserts the given literal are (bitwise) equal to given expected values.
   template <typename NativeT>
-  static void ExpectR0Equal(NativeT expected, const Literal& actual);
+  static void ExpectR0Equal(NativeT expected, const LiteralSlice& actual);
+
   template <typename NativeT>
   static void ExpectR1Equal(tensorflow::gtl::ArraySlice<NativeT> expected,
-                            const Literal& actual);
+                            const LiteralSlice& actual);
   template <typename NativeT>
   static void ExpectR2Equal(
       std::initializer_list<std::initializer_list<NativeT>> expected,
-      const Literal& actual);
+      const LiteralSlice& actual);
+
   template <typename NativeT>
   static void ExpectR3Equal(
       std::initializer_list<
           std::initializer_list<std::initializer_list<NativeT>>>
           expected,
-      const Literal& actual);
+      const LiteralSlice& actual);
 
   // Asserts the given literal are (bitwise) equal to given array.
   template <typename NativeT>
   static void ExpectR2EqualArray2D(const Array2D<NativeT>& expected,
-                                   const Literal& actual);
+                                   const LiteralSlice& actual);
   template <typename NativeT>
   static void ExpectR3EqualArray3D(const Array3D<NativeT>& expected,
-                                   const Literal& actual);
+                                   const LiteralSlice& actual);
   template <typename NativeT>
   static void ExpectR4EqualArray4D(const Array4D<NativeT>& expected,
-                                   const Literal& actual);
+                                   const LiteralSlice& actual);
 
-  // Asserts that the expected and actual literals are within the given error
-  // bound for all elements. Also, asserts that the rank, dimensions sizes, and
-  // bounds are equivalent.
+  // Decorates literal_comparison::Near() with an AssertionResult return type.
   //
-  // Tuples are matched recursively.  When comparing tensors of
-  // non-floating-point type, checks for exact equality, ignoring the ErrorSpec.
-  //
-  // If the shape of the literals is neither a complex/floating-point tensor nor
-  // a tuple which contains a complex/floating-point tensor, Near() is
-  // equivalent to Equal().  We don't raise an error in this case, because we
-  // want to allow callers to call Near() even if they have no preconceptions
-  // about the shapes being compared.
-  //
-  // If detailed_message is true, then the error message in the assertion result
-  // will contain a more detailed breakdown of mismatches.
+  // See comment on literal_comparison::Near().
   static ::testing::AssertionResult Near(
-      const Literal& expected, const Literal& actual, const ErrorSpec& error,
+      const LiteralSlice& expected, const LiteralSlice& actual,
+      const ErrorSpec& error_spec,
       bool detailed_message = false) TF_MUST_USE_RESULT;
 
-  // Expects expected and actual to be Near with the given error.
-  static void ExpectNear(const Literal& expected, const Literal& actual,
-                         const ErrorSpec& error, const string& message = "");
-
   // Asserts the given literal are within the given error bound of the given
   // expected values. Only supported for floating point values.
   template <typename NativeT>
-  static void ExpectR0Near(NativeT expected, const Literal& actual,
+  static void ExpectR0Near(NativeT expected, const LiteralSlice& actual,
                            const ErrorSpec& error);
+
   template <typename NativeT>
   static void ExpectR1Near(tensorflow::gtl::ArraySlice<NativeT> expected,
-                           const Literal& actual, const ErrorSpec& error);
+                           const LiteralSlice& actual, const ErrorSpec& error);
+
   template <typename NativeT>
   static void ExpectR2Near(
       std::initializer_list<std::initializer_list<NativeT>> expected,
-      const Literal& actual, const ErrorSpec& error);
+      const LiteralSlice& actual, const ErrorSpec& error);
+
   template <typename NativeT>
   static void ExpectR3Near(
       std::initializer_list<
           std::initializer_list<std::initializer_list<NativeT>>>
           expected,
-      const Literal& actual, const ErrorSpec& error);
+      const LiteralSlice& actual, const ErrorSpec& error);
+
   template <typename NativeT>
   static void ExpectR4Near(
       std::initializer_list<std::initializer_list<
           std::initializer_list<std::initializer_list<NativeT>>>>
           expected,
-      const Literal& actual, const ErrorSpec& error);
+      const LiteralSlice& actual, const ErrorSpec& error);
 
   // Asserts the given literal are within the given error bound to the given
   // array. Only supported for floating point values.
   template <typename NativeT>
   static void ExpectR2NearArray2D(const Array2D<NativeT>& expected,
-                                  const Literal& actual,
+                                  const LiteralSlice& actual,
                                   const ErrorSpec& error);
+
   template <typename NativeT>
   static void ExpectR3NearArray3D(const Array3D<NativeT>& expected,
-                                  const Literal& actual,
+                                  const LiteralSlice& actual,
                                   const ErrorSpec& error);
+
   template <typename NativeT>
   static void ExpectR4NearArray4D(const Array4D<NativeT>& expected,
-                                  const Literal& actual,
+                                  const LiteralSlice& actual,
                                   const ErrorSpec& error);
 
   // If the error spec is given, returns whether the expected and the actual are
   // within the error bound; otherwise, returns whether they are equal. Tuples
   // will be compared recursively.
   static ::testing::AssertionResult NearOrEqual(
-      const Literal& expected, const Literal& actual,
+      const LiteralSlice& expected, const LiteralSlice& actual,
       const tensorflow::gtl::optional<ErrorSpec>& error) TF_MUST_USE_RESULT;
 
-  // If the error spec is given, expects the expected and the actual to be near;
-  // otherwise, expects them to be equal. Tuples will be compared recursively.
-  static void ExpectNearOrEqual(
-      const Literal& expected, const Literal& actual,
-      const tensorflow::gtl::optional<ErrorSpec>& error);
-
-  // Returns a multi-dimensional index as a string. For example: '{7, 8}' will
-  // be returned for a 2-dimensional index with dimension 0 index equal to 7,
-  // dimension 1 equal to 8.
-  static string MultiIndexAsString(
-      tensorflow::gtl::ArraySlice<int64> multi_index);
-
-  // Creates a literal with a new shape with the given new dimensions using the
-  // data in the given input literal. For reshaping purposes the (flat) data
-  // buffer of the input literal is assumed to have the given minor_to_major
-  // layout order.
-  static std::unique_ptr<Literal> Reshape(
-      tensorflow::gtl::ArraySlice<int64> new_dimensions,
-      tensorflow::gtl::ArraySlice<int64> minor_to_major,
-      const Literal& literal);
-
-  // Creates a literal with the supplied shape, and uses the provided value
-  // generator to populate the literal's values.
-  // Returns the new literal object, or an error Status if failed.
-  template <
-      PrimitiveType type,
-      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
-  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
-      const Shape& shape,
-      const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator);
-
-  // Creates a literal with the supplied shape, and initializes the literal
-  // values using a normal distribution with given mean and stddev standard
-  // deviation, and using the engine as entropy generator.
-  // Returns the new literal object, or an error Status if failed.
-  template <
-      PrimitiveType type, typename E,
-      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
-  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
-      const Shape& shape, E* engine, T mean, T stddev);
-
-  // Creates a literal with the supplied shape, and initializes the literal
-  // values using a normal distribution with given mean and stddev standard
-  // deviation.
-  // Returns the new literal object, or an error Status if failed.
-  template <
-      PrimitiveType type,
-      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
-  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
-      const Shape& shape, T mean, T stddev);
-
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(LiteralTestUtil);
 };
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR0Equal(NativeT expected,
-                                                 const Literal& actual) {
-  ExpectEqual(*Literal::CreateR0<NativeT>(expected), actual);
+                                                 const LiteralSlice& actual) {
+  EXPECT_TRUE(Equal(*Literal::CreateR0<NativeT>(expected), actual));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR1Equal(
-    tensorflow::gtl::ArraySlice<NativeT> expected, const Literal& actual) {
-  ExpectEqual(*Literal::CreateR1<NativeT>(expected), actual);
+    tensorflow::gtl::ArraySlice<NativeT> expected, const LiteralSlice& actual) {
+  EXPECT_TRUE(Equal(*Literal::CreateR1<NativeT>(expected), actual));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR2Equal(
     std::initializer_list<std::initializer_list<NativeT>> expected,
-    const Literal& actual) {
-  ExpectEqual(*Literal::CreateR2<NativeT>(expected), actual);
+    const LiteralSlice& actual) {
+  EXPECT_TRUE(Equal(*Literal::CreateR2<NativeT>(expected), actual));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR3Equal(
     std::initializer_list<std::initializer_list<std::initializer_list<NativeT>>>
         expected,
-    const Literal& actual) {
-  ExpectEqual(*Literal::CreateR3<NativeT>(expected), actual);
+    const LiteralSlice& actual) {
+  EXPECT_TRUE(Equal(*Literal::CreateR3<NativeT>(expected), actual));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR2EqualArray2D(
-    const Array2D<NativeT>& expected, const Literal& actual) {
-  ExpectEqual(*Literal::CreateR2FromArray2D(expected), actual);
+    const Array2D<NativeT>& expected, const LiteralSlice& actual) {
+  EXPECT_TRUE(Equal(*Literal::CreateR2FromArray2D(expected), actual));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR3EqualArray3D(
-    const Array3D<NativeT>& expected, const Literal& actual) {
-  ExpectEqual(*Literal::CreateR3FromArray3D(expected), actual);
+    const Array3D<NativeT>& expected, const LiteralSlice& actual) {
+  EXPECT_TRUE(Equal(*Literal::CreateR3FromArray3D(expected), actual));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR4EqualArray4D(
-    const Array4D<NativeT>& expected, const Literal& actual) {
-  ExpectEqual(*Literal::CreateR4FromArray4D(expected), actual);
+    const Array4D<NativeT>& expected, const LiteralSlice& actual) {
+  EXPECT_TRUE(Equal(*Literal::CreateR4FromArray4D(expected), actual));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR0Near(NativeT expected,
-                                                const Literal& actual,
+                                                const LiteralSlice& actual,
                                                 const ErrorSpec& error) {
-  ExpectNear(*Literal::CreateR0<NativeT>(expected), actual, error);
+  EXPECT_TRUE(Near(*Literal::CreateR0<NativeT>(expected), actual, error));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR1Near(
-    tensorflow::gtl::ArraySlice<NativeT> expected, const Literal& actual,
+    tensorflow::gtl::ArraySlice<NativeT> expected, const LiteralSlice& actual,
     const ErrorSpec& error) {
-  ExpectNear(*Literal::CreateR1<NativeT>(expected), actual, error);
+  EXPECT_TRUE(Near(*Literal::CreateR1<NativeT>(expected), actual, error));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR2Near(
     std::initializer_list<std::initializer_list<NativeT>> expected,
-    const Literal& actual, const ErrorSpec& error) {
-  ExpectNear(*Literal::CreateR2<NativeT>(expected), actual, error);
+    const LiteralSlice& actual, const ErrorSpec& error) {
+  EXPECT_TRUE(Near(*Literal::CreateR2<NativeT>(expected), actual, error));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR3Near(
     std::initializer_list<std::initializer_list<std::initializer_list<NativeT>>>
         expected,
-    const Literal& actual, const ErrorSpec& error) {
-  ExpectNear(*Literal::CreateR3<NativeT>(expected), actual, error);
+    const LiteralSlice& actual, const ErrorSpec& error) {
+  EXPECT_TRUE(Near(*Literal::CreateR3<NativeT>(expected), actual, error));
 }
 
 template <typename NativeT>
@@ -321,63 +230,29 @@ template <typename NativeT>
     std::initializer_list<std::initializer_list<
         std::initializer_list<std::initializer_list<NativeT>>>>
         expected,
-    const Literal& actual, const ErrorSpec& error) {
-  ExpectNear(*Literal::CreateR4<NativeT>(expected), actual, error);
+    const LiteralSlice& actual, const ErrorSpec& error) {
+  EXPECT_TRUE(Near(*Literal::CreateR4<NativeT>(expected), actual, error));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR2NearArray2D(
-    const Array2D<NativeT>& expected, const Literal& actual,
+    const Array2D<NativeT>& expected, const LiteralSlice& actual,
     const ErrorSpec& error) {
-  ExpectNear(*Literal::CreateR2FromArray2D(expected), actual, error);
+  EXPECT_TRUE(Near(*Literal::CreateR2FromArray2D(expected), actual, error));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR3NearArray3D(
-    const Array3D<NativeT>& expected, const Literal& actual,
+    const Array3D<NativeT>& expected, const LiteralSlice& actual,
     const ErrorSpec& error) {
-  ExpectNear(*Literal::CreateR3FromArray3D(expected), actual, error);
+  EXPECT_TRUE(Near(*Literal::CreateR3FromArray3D(expected), actual, error));
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR4NearArray4D(
-    const Array4D<NativeT>& expected, const Literal& actual,
+    const Array4D<NativeT>& expected, const LiteralSlice& actual,
     const ErrorSpec& error) {
-  ExpectNear(*Literal::CreateR4FromArray4D(expected), actual, error);
-}
-
-template <PrimitiveType type, typename T>
-/* static */ StatusOr<std::unique_ptr<Literal>>
-LiteralTestUtil::CreateRandomLiteral(
-    const Shape& shape,
-    const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator) {
-  using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
-  TF_RET_CHECK(shape.element_type() == type);
-  std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
-  TF_RETURN_IF_ERROR(literal.get()->Populate<NativeT>(
-      [&](tensorflow::gtl::ArraySlice<int64> indexes) {
-        return generator(indexes);
-      }));
-  return std::move(literal);
-}
-
-template <PrimitiveType type, typename E, typename T>
-/* static */ StatusOr<std::unique_ptr<Literal>>
-LiteralTestUtil::CreateRandomLiteral(const Shape& shape, E* engine, T mean,
-                                     T stddev) {
-  using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
-  std::normal_distribution<NativeT> generator(mean, stddev);
-  return CreateRandomLiteral<type, NativeT>(
-      shape, [&](tensorflow::gtl::ArraySlice<int64> /*indexes*/) {
-        return generator(*engine);
-      });
-}
-
-template <PrimitiveType type, typename T>
-/* static */ StatusOr<std::unique_ptr<Literal>>
-LiteralTestUtil::CreateRandomLiteral(const Shape& shape, T mean, T stddev) {
-  std::minstd_rand0 engine;
-  return CreateRandomLiteral<type>(shape, &engine, mean, stddev);
+  EXPECT_TRUE(Near(*Literal::CreateR4FromArray4D(expected), actual, error));
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index 9d619a77c7e8d6398b559e8f562cd7f8194e0811..bbac7285aefbb1f028fad152e4b7fe6af01e9f6d 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -34,7 +34,7 @@ TEST(LiteralTestUtilTest, ComparesEqualTuplesEqual) {
   std::unique_ptr<Literal> literal = Literal::MakeTuple({
       Literal::CreateR0<int32>(42).get(), Literal::CreateR0<int32>(64).get(),
   });
-  LiteralTestUtil::ExpectEqual(*literal, *literal);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *literal));
 }
 
 TEST(LiteralTestUtilTest, ComparesUnequalTuplesUnequal) {
@@ -97,6 +97,15 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
   }
 }
 
+TEST(LiteralTestUtilTest, NotEqualHasValuesInMessage) {
+  auto expected = Literal::CreateR1<int32>({1, 2, 3});
+  auto actual = Literal::CreateR1<int32>({4, 5, 6});
+  ::testing::AssertionResult result =
+      LiteralTestUtil::Equal(*expected, *actual);
+  EXPECT_THAT(result.message(), ::testing::HasSubstr("expected: {1, 2, 3}"));
+  EXPECT_THAT(result.message(), ::testing::HasSubstr("actual:   {4, 5, 6}"));
+}
+
 TEST(LiteralTestUtilTest, NearComparatorR1) {
   auto a =
       Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 44c6811df84f49b6c1b24c11449939e2d375a9d1..96858c00d6bbe59b673a34e7d5ca261756709596 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -210,12 +210,12 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
-      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
       {{10.0f, 20.0f}, {30.0f, 40.0f}},
-      LiteralView::Create(*result_literal, {1}));
+      LiteralSlice(*result_literal, {1}));
   LiteralTestUtil::ExpectR2Equal<float>(
-      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {2}));
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {2}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
@@ -239,16 +239,16 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
-      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {1}));
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {1}));
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}},
-      LiteralView::Create(*result_literal, {0, 0}));
+      LiteralSlice(*result_literal, {0, 0}));
   LiteralTestUtil::ExpectR2Equal<float>(
       {{10.0f, 20.0f}, {30.0f, 40.0f}},
-      LiteralView::Create(*result_literal, {0, 1}));
+      LiteralSlice(*result_literal, {0, 1}));
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}},
-      LiteralView::Create(*result_literal, {0, 2}));
+      LiteralSlice(*result_literal, {0, 2}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) {
@@ -274,9 +274,9 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) {
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
-      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
-      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {1}));
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {1}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
@@ -321,9 +321,9 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{56.0f, 46.0f}, {36.0f, 26.0f}},
-      LiteralView::Create(*result_literal, {0}));
+      LiteralSlice(*result_literal, {0}));
   LiteralTestUtil::ExpectR1Equal<float>(
-      {40.0f, 71.0f, 117.0f}, LiteralView::Create(*result_literal, {1}));
+      {40.0f, 71.0f, 117.0f}, LiteralSlice(*result_literal, {1}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) {
@@ -361,9 +361,9 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) {
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
-      {{-1.0, -2.0}, {-3.0, -4}}, LiteralView::Create(*result_literal, {0}));
+      {{-1.0, -2.0}, {-3.0, -4}}, LiteralSlice(*result_literal, {0}));
   LiteralTestUtil::ExpectR1Equal<float>(
-      {264.0, 73.0, 133.0}, LiteralView::Create(*result_literal, {1}));
+      {264.0, 73.0, 133.0}, LiteralSlice(*result_literal, {1}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) {
@@ -391,16 +391,16 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) {
   std::unique_ptr<Literal> result_0_literal = ShapedBufferToLiteral(result_0);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{-1.0, -2.0}, {-3.0, -4.0}},
-      LiteralView::Create(*result_0_literal, {0}));
+      LiteralSlice(*result_0_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
-      {{22.0, 6.0}, {8.0, 10}}, LiteralView::Create(*result_0_literal, {1}));
+      {{22.0, 6.0}, {8.0, 10}}, LiteralSlice(*result_0_literal, {1}));
 
   ScopedShapedBuffer result_1 = ExecuteLocallyOrDie(computation, {&result_0});
   std::unique_ptr<Literal> result_1_literal = ShapedBufferToLiteral(result_1);
   LiteralTestUtil::ExpectR2Equal<float>(
-      {{1.0, 2.0}, {3.0, 4.0}}, LiteralView::Create(*result_1_literal, {0}));
+      {{1.0, 2.0}, {3.0, 4.0}}, LiteralSlice(*result_1_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
-      {{44.0, 12.0}, {16.0, 20}}, LiteralView::Create(*result_1_literal, {1}));
+      {{44.0, 12.0}, {16.0, 20}}, LiteralSlice(*result_1_literal, {1}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
@@ -447,7 +447,7 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
 
   for (int i = 0; i < kElementCount; ++i) {
     LiteralTestUtil::ExpectR1Near<float>(
-        {2.0f * i, 0.0f}, LiteralView::Create(*result_literal, {i}),
+        {2.0f * i, 0.0f}, LiteralSlice(*result_literal, {i}),
         error_spec_);
   }
 }
@@ -502,7 +502,7 @@ XLA_TEST_F(LocalClientExecuteTest, LargeNestedTuple) {
   for (int i = 0; i < kFanout; ++i) {
     for (int j = 0; j < kFanout; ++j) {
       LiteralTestUtil::ExpectR0Near<float>(
-          i + j + i * kFanout + j, LiteralView::Create(*result_literal, {i, j}),
+          i + j + i * kFanout + j, LiteralSlice(*result_literal, {i, j}),
           error_spec_);
     }
   }
@@ -548,7 +548,7 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) {
     index.push_back(0);
   }
   LiteralTestUtil::ExpectR0Equal<float>(
-      165.0, LiteralView::Create(*result_literal, index));
+      165.0, LiteralSlice(*result_literal, index));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) {
@@ -754,9 +754,9 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) {
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {});
   std::unique_ptr<Literal> tuple_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR1Equal<float>(
-      {2.0f, 4.0f, 6.0f}, LiteralView::Create(*tuple_literal, {0}));
+      {2.0f, 4.0f, 6.0f}, LiteralSlice(*tuple_literal, {0}));
   LiteralTestUtil::ExpectR1Equal<float>(
-      {1.0f, 2.0f, 3.0f}, LiteralView::Create(*tuple_literal, {1}));
+      {1.0f, 2.0f, 3.0f}, LiteralSlice(*tuple_literal, {1}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) {
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index e859b3059eea86b362443c3269f99ccae941dfe2..88797a7d0a7d0567b3a380c5fb1ad0c0ee875587 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -35,9 +35,9 @@ namespace xla {
 
 /* static */ TestAllocator* LocalClientTestBase::allocator_;
 
-StatusOr<se::DeviceMemoryBase> TestAllocator::Allocate(int device_ordinal,
-                                                       uint64 size,
-                                                       bool retry_on_failure) {
+StatusOr<OwningDeviceMemory> TestAllocator::Allocate(int device_ordinal,
+                                                     uint64 size,
+                                                     bool retry_on_failure) {
   VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")";
   {
     tensorflow::mutex_lock lock(count_mutex_);
@@ -48,8 +48,7 @@ StatusOr<se::DeviceMemoryBase> TestAllocator::Allocate(int device_ordinal,
                                                  retry_on_failure);
 }
 
-tensorflow::Status TestAllocator::Deallocate(int device_ordinal,
-                                             se::DeviceMemoryBase* mem) {
+Status TestAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
   VLOG(2) << "Deallocate(" << device_ordinal << ")";
   {
     tensorflow::mutex_lock lock(count_mutex_);
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 3bbb760c806412a671bc2502846e123e2582fd16..258226523d830b40ecaa761df95988dc90f5ca47 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -46,10 +46,9 @@ class TestAllocator : public StreamExecutorMemoryAllocator {
             platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) {
   }
 
-  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
-                                          bool retry_on_failure) override;
-  tensorflow::Status Deallocate(int device_ordinal,
-                                se::DeviceMemoryBase* mem) override;
+  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
+                                        bool retry_on_failure) override;
+  Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
 
   // Return the number of allocations that have been performed.
   int64 allocation_count() const;
diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
index 464cc012140d4838de88c5bf5b3b2f1372c2c19b..27fd36e06acdc589f3a84ad561164e4a33b93506 100644
--- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index 0a603f4954badd12adf3144320789a5edd0d9c6c..ec7ca20bdf266cf8ed220809c0c24bee473359be 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <new>
 #include <utility>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -108,7 +107,7 @@ class MultiOutputFusionTest : public HloTestBase {
     expect.PopulateWithValue<float>(size * 1.5f * 3.5f);
     auto actual = ExecuteAndTransfer(
         std::move(hlo_module), {Literal::CreateR0<float>(-9.0f).get(), &arg1});
-    LiteralTestUtil::ExpectNear(expect, *actual, error_spec_);
+    EXPECT_TRUE(LiteralTestUtil::Near(expect, *actual, error_spec_));
   }
 
   void RunTest1D(bool manual_fusion, int size) {
@@ -168,7 +167,7 @@ class MultiOutputFusionTest : public HloTestBase {
 
     Literal expect = std::move(*Literal::CreateR1<float>({size * 1.5f * 3.5f}));
     auto actual = ExecuteAndTransfer(std::move(hlo_module), {&input0, &input1});
-    LiteralTestUtil::ExpectNear(expect, *actual, error_spec_);
+    EXPECT_TRUE(LiteralTestUtil::Near(expect, *actual, error_spec_));
   }
 };
 
@@ -211,5 +210,68 @@ XLA_TEST_F(MultiOutputFusionTest, FusionNodeIsRoot) {
       *result, *Literal::MakeTupleOwned(Literal::CreateR0<int32>(42))));
 }
 
+XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFusion) {
+  const char* testcase = R"(
+    HloModule m
+
+    fused_computation {
+      p = f32[4] parameter(0)
+      multiply = f32[4] multiply(p, p)
+      less-than = pred[4] less-than(p, multiply)
+      ROOT tuple = (pred[4], f32[4]) tuple(less-than, multiply)
+    }
+
+    ENTRY PredFloatMOF {
+      p0 = f32[4] parameter(0)
+      fusion = (pred[4], f32[4]) fusion(p0), kind=kLoop, calls=fused_computation
+      gte0 = pred[4] get-tuple-element(fusion), index=0
+      gte1 = f32[4] get-tuple-element(fusion), index=1
+      const = f32[4] constant({0, 0, 0, 0})
+      ROOT select = f32[4] select(gte0, gte1, const)
+    })";
+  auto module =
+      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
+          .ValueOrDie();
+  auto param = Literal::CreateR1<float>({1.0, 2.0, 3.0, -1.0});
+  TF_ASSERT_OK_AND_ASSIGN(auto result,
+                          Execute(std::move(module), {param.get()}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *result, *Literal::CreateR1<float>({0.0, 4.0, 9.0, 1.0})));
+}
+
+XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFeedingMap) {
+  const char* testcase = R"(
+    HloModule m
+
+    fused_computation {
+      p = f32[] parameter(0)
+      multiply = f32[] multiply(p, p)
+      less-than = pred[] less-than(p, multiply)
+      ROOT tuple = (pred[], f32[]) tuple(less-than, multiply)
+    }
+
+    map_computation {
+      p0 = f32[] parameter(0)
+      fusion = (pred[], f32[]) fusion(p0), kind=kLoop, calls=fused_computation
+      gte0 = pred[] get-tuple-element(fusion), index=0
+      gte1 = f32[] get-tuple-element(fusion), index=1
+      const = f32[] constant(0)
+      ROOT select = f32[] select(gte0, gte1, const)
+    }
+
+    ENTRY MapMOF {
+      p1 = f32[3] parameter(0)
+      ROOT map = f32[3] map(p1), to_apply=map_computation
+    })";
+  auto module =
+      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
+          .ValueOrDie();
+  auto param = Literal::CreateR1<float>({1.0, 2.0, 3.0});
+  TF_ASSERT_OK_AND_ASSIGN(auto result,
+                          Execute(std::move(module), {param.get()}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *result, *Literal::CreateR1<float>({0.0, 4.0, 9.0})));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc
index 97dab860c06bddb2a0ffd45e48c4912c5f55d574..838f1b4e2f0f0e0871ec717bdeefcbbc653397e3 100644
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
@@ -161,7 +160,7 @@ XLA_TEST_F(ParamsTest, MissingParameter) {
   auto p = builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "param2");
   auto computation_status = builder.Build();
 
-  ASSERT_NE(computation_status.status(), tensorflow::Status::OK());
+  ASSERT_NE(computation_status.status(), Status::OK());
 }
 
 XLA_TEST_F(ParamsTest, UnusedParameter) {
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 29a4f75001c688f2215745ab913df68bf2f62b76..1a2de6937c3e134852a730f62f7b56417cf49b28 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -273,11 +273,11 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) {
                                              &execution_options_));
   }
 
-  LiteralTestUtil::ExpectEqual(*result1, *result2);
-  LiteralTestUtil::ExpectEqual(*result1, *result3);
-  LiteralTestUtil::ExpectNotEqual(*result1, *result4);
-  LiteralTestUtil::ExpectNotEqual(*result4, *result5);
-  LiteralTestUtil::ExpectNotEqual(*result5, *result6);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*result1, *result2));
+  EXPECT_TRUE(LiteralTestUtil::Equal(*result1, *result3));
+  EXPECT_FALSE(LiteralTestUtil::Equal(*result1, *result4));
+  EXPECT_FALSE(LiteralTestUtil::Equal(*result4, *result5));
+  EXPECT_FALSE(LiteralTestUtil::Equal(*result5, *result6));
 }
 
 XLA_TEST_F(PrngTest, TenValuesN01) {
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index bcc05c2d41d8439b021cdf6533b5ca87c19aec1f..d671d40456a276a44b462f390c95aa4af301263a 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -34,7 +34,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 10a3da3a387641ec45baf02d15790e32371601fa..266760e8202fddc48792ac66dda334255e428808 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -356,12 +356,8 @@ XLA_TEST_P(ReduceWindowTest, R6AddMultipleStrides) {
   std::vector<int64> input_dims(6, 8);
   auto shape = ShapeUtil::MakeShape(F32, input_dims);
 
-  std::unique_ptr<Literal> arg_literal = Literal::CreateFromShape(shape);
-  auto generator = [&](tensorflow::gtl::ArraySlice<int64> indexes) -> float {
-    return 1.0f;
-  };
-  TF_EXPECT_OK(arg_literal->Populate<float>(generator));
-
+  auto arg_literal = MakeUnique<Literal>(shape);
+  arg_literal->PopulateWithValue(1.0f);
   const auto input = CreateConstantFromLiteral(*arg_literal, &builder_);
 
   Padding padding = Padding::kValid;
@@ -371,13 +367,8 @@ XLA_TEST_P(ReduceWindowTest, R6AddMultipleStrides) {
   std::vector<int64> output_dims = {6, 8, 6, 6, 8, 8};
   Shape result_shape =
       ShapeUtil::MakeShapeWithLayout(F32, output_dims, output_layout);
-  std::unique_ptr<Literal> expected = Literal::CreateFromShape(result_shape);
-  auto out_generator =
-      [&](tensorflow::gtl::ArraySlice<int64> indexes) -> float {
-    return 27.0f;
-  };
-  TF_EXPECT_OK(expected->Populate<float>(out_generator));
-
+  auto expected = MakeUnique<Literal>(result_shape);
+  expected->PopulateWithValue(27.0f);
   ComputeAndCompareLiteral(&builder_, *expected, {}, DefaultErrorSpec());
 }
 
@@ -1348,7 +1339,7 @@ INSTANTIATE_TEST_CASE_P(
 class ReduceWindowTextTest : public HloTestBase {};
 
 TEST_F(ReduceWindowTextTest, R2General256x384) {
-  const string& hlo_string = R"(
+  const string hlo_string = R"(
 HloModule R2Window
 mul {
   lhs = f32[] parameter(0)
@@ -1365,7 +1356,7 @@ ENTRY R2Window {
 }
 
 TEST_F(ReduceWindowTextTest, R2General256x384Layout01) {
-  const string& hlo_string = R"(
+  const string hlo_string = R"(
 HloModule R2Window
 mul {
 lhs = f32[] parameter(0)
@@ -1382,7 +1373,7 @@ ROOT reduce-window = f32[256,384]{0,1} reduce-window(operand, constant), window=
 }
 
 TEST_F(ReduceWindowTextTest, R2General2x5) {
-  const string& hlo_string = R"(
+  const string hlo_string = R"(
 HloModule R2Window
 mul {
   lhs = f32[] parameter(0)
@@ -1399,7 +1390,7 @@ ENTRY R2Window {
 }
 
 TEST_F(ReduceWindowTextTest, R2EffectiveScalar) {
-  const string& hlo_string = R"(
+  const string hlo_string = R"(
 HloModule R2Window
 mul {
   lhs = f32[] parameter(0)
@@ -1417,7 +1408,7 @@ ENTRY R2Window {
 }
 
 TEST_F(ReduceWindowTextTest, R3EffectiveScalar) {
-  const string& hlo_string = R"(
+  const string hlo_string = R"(
 HloModule R3Window
 mul {
   lhs = f32[] parameter(0)
@@ -1435,7 +1426,7 @@ ENTRY R3Window {
 }
 
 TEST_F(HloTestBase, ReduceWindowIdentity) {
-  const string& hlo_string = R"(
+  const string hlo_string = R"(
 HloModule ReduceWindowIdentity
 identity.pad_to_reduce_window {
   param0 = f32[] parameter(0)
@@ -1444,7 +1435,26 @@ identity.pad_to_reduce_window {
 ENTRY reduce-window-identity {
   operand = f32[1,32,64]{2,1,0} parameter(0)
   constant.4466 = f32[] constant(0)
-  ROOT reduce-window = f32[1,33,64]{2,1,0} reduce-window(operand, constant.4466),     window={size=1x1x1 pad=0_0x1_0x0_0}, to_apply=identity.pad_to_reduce_window
+  ROOT reduce-window = f32[1,33,64]{2,1,0} reduce-window(operand, constant.4466), window={size=1x1x1 pad=0_0x1_0x0_0}, to_apply=identity.pad_to_reduce_window
+}
+
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt));
+}
+
+TEST_F(HloTestBase, ReduceWindowS32) {
+  const string hlo_string = R"(
+HloModule reduce-window
+
+%identity.pad_to_reduce_window (param0: s32[], param1: s32[]) -> s32[] {
+  %param0 = s32[] parameter(0)
+  ROOT %param1 = s32[] parameter(1)
+}
+
+ENTRY %reduce-window (parameter.0: s32[81,8], parameter.1: s32[]) -> s32[82,8] {
+  %parameter.0 = s32[81,8]{1,0} parameter(0)
+  %parameter.1 = s32[] parameter(1)
+  ROOT %reduce-window = s32[82,8]{1,0} reduce-window(s32[81,8]{1,0} %parameter.0, s32[] %parameter.1), window={size=1x1 pad=0_1x0_0}, to_apply=%identity.pad_to_reduce_window
 }
 
 )";
diff --git a/tensorflow/compiler/xla/tests/reshape_motion_test.cc b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
index 5ebd5268992846e80dcce2675f8e92038e190ecf..da1b588ec41cef711412367e89b2a9b1029bca71 100644
--- a/tensorflow/compiler/xla/tests/reshape_motion_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index d7462d581b8596dc43b81b0162b3f5020cebb546..a4580cd71d46ad0a0186eddd51291f9c322b6f49 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -656,9 +656,9 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   std::unique_ptr<Literal> expected =
       Literal::CreateR2FromArray2D<float>(expected_array);
   if (use_bfloat16()) {
-    expected = LiteralTestUtil::ConvertF32ToBF16(*expected);
+    expected = Literal::ConvertF32ToBF16(*expected);
   }
-  LiteralTestUtil::ExpectEqual(*expected, *actual);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *actual));
 }
 
 XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
@@ -731,7 +731,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1});
 
   std::unique_ptr<Literal> expected =
-      LiteralTestUtil::Reshape({2, 1}, {1, 0}, *input_literal);
+      Literal::ReshapeSlice({2, 1}, {1, 0}, *input_literal);
   ComputeAndCompareLiteral(&builder, *expected, {input_data.get()},
                            zero_error_spec_);
 }
@@ -753,7 +753,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2});
 
   std::unique_ptr<Literal> expected =
-      LiteralTestUtil::Reshape({4, 2}, {1, 0}, *input_literal);
+      Literal::ReshapeSlice({4, 2}, {1, 0}, *input_literal);
   ComputeAndCompareLiteral(&builder, *expected, {input_data.get()},
                            zero_error_spec_);
 }
@@ -817,7 +817,7 @@ XLA_TEST_P(ReshapeTest, NoopReshape) {
   // Since the reshape is a no-op, verify that it does not change the underlying
   // data.
   if (use_bfloat16()) {
-    auto expected = LiteralTestUtil::ConvertF32ToBF16(*input_literal);
+    auto expected = Literal::ConvertF32ToBF16(*input_literal);
     EXPECT_EQ(expected->data<bfloat16>(), output_literal->data<bfloat16>());
   } else {
     EXPECT_EQ(input_literal->data<float>(), output_literal->data<float>());
@@ -886,7 +886,7 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) {
                   /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
-      LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
+      Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal)
           ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0}));
 
   // Specify the requested output shape explicitly to ensure that this reshape
@@ -915,7 +915,7 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
                   /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
-      LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
+      Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal)
           ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0}));
 
   // Specify the requested output shape explicitly to ensure that this reshape
@@ -944,7 +944,7 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
                   /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
-      LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
+      Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal)
           ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0}));
 
   // Specify the requested output shape explicitly to ensure that this reshape
@@ -974,7 +974,7 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
                   /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
-      LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
+      Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal)
           ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0}));
 
   // Specify the requested output shape explicitly to ensure that this reshape
@@ -1003,7 +1003,7 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
                   /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
-      LiteralTestUtil::Reshape(new_bounds, {1, 0, 2, 3}, *input_literal)
+      Literal::ReshapeSlice(new_bounds, {1, 0, 2, 3}, *input_literal)
           ->Relayout(input_literal->shape().layout());
 
   // Specify the requested output shape explicitly to ensure that this reshape
diff --git a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc
index 8cbfcc6f5c4272706a0f9fd809041516bf32432b..7cfca781acda15879075f4386c2096e537877aac 100644
--- a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc
+++ b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc
@@ -100,7 +100,7 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim0Minor) {
   EXPECT_EQ(46.0f, actual->Get<float>({1, 1}));
 
   std::unique_ptr<Literal> round_tripped = RoundTripToServer(*actual);
-  LiteralTestUtil::ExpectEqual(*round_tripped, *actual);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*round_tripped, *actual));
 }
 
 TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim1Minor) {
@@ -135,7 +135,7 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim1Minor) {
   EXPECT_EQ(46.0f, actual->Get<float>({1, 1}));
 
   std::unique_ptr<Literal> round_tripped = RoundTripToServer(*actual);
-  LiteralTestUtil::ExpectEqual(*round_tripped, *actual);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*round_tripped, *actual));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc b/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc
index 32db45f8a66266712ba4091c2aa6368f0b822bd2..f334a8c1318a59bbfdd27dd1a63ed162600089ce 100644
--- a/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc
+++ b/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc
@@ -41,7 +41,7 @@ class RoundTripTransferTest : public ClientLibraryTestBase {
         client_->TransferToServer(original).ConsumeValueOrDie();
     std::unique_ptr<Literal> result =
         client_->Transfer(*data).ConsumeValueOrDie();
-    LiteralTestUtil::ExpectEqual(original, *result);
+    EXPECT_TRUE(LiteralTestUtil::Equal(original, *result));
   }
 };
 
diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index f35bc43a4952137b4b6c94c771819e0514d4228f..308d3fc78a51e63c0e3db8c0cda18caf11f665bd 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -390,7 +390,7 @@ XLA_TEST_F(ScalarComputationsTest, DivU32s) {
                                      &execution_options_)
                 .ConsumeValueOrDie();
         auto expected_literal = Literal::CreateR0<uint32>(dividend / divisor);
-        LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
+        EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *actual_literal));
       }
     }
   }
@@ -431,7 +431,7 @@ XLA_TEST_F(ScalarComputationsTest, RemU32s) {
                                      &execution_options_)
                 .ConsumeValueOrDie();
         auto expected_literal = Literal::CreateR0<uint32>(dividend % divisor);
-        LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
+        EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *actual_literal));
       }
     }
   }
diff --git a/tensorflow/compiler/xla/tests/select_test.cc b/tensorflow/compiler/xla/tests/select_test.cc
index 3d694a9c3fe894107c3b0a8fc2e5d07310cb476c..72707f224446c7585d1d90ac6681a7b38c41d5f1 100644
--- a/tensorflow/compiler/xla/tests/select_test.cc
+++ b/tensorflow/compiler/xla/tests/select_test.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 810cc25f1b5b1199984a3229909a70f9548c7dd2..de1865138802bc72e9a4b2db7a21343b0d327108 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -107,7 +107,7 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
     }
     return Literal::MakeTupleOwned(std::move(elements));
   }
-  std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
+  auto literal = MakeUnique<Literal>(shape);
   switch (shape.element_type()) {
     case BF16:
       PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine);
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
index e2067bc1b835a946fc56801cbf227e05ef0686b4..0063e7ad415e9b6718c164f415ced6fb76cbf44a 100644
--- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -175,7 +175,7 @@ XLA_TEST_F(TransferManagerTest, TransferTuple) {
                           transfer_manager_->TransferLiteralFromDevice(
                               stream_executor_, device_buffer));
 
-  LiteralTestUtil::ExpectEqual(*literal, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result));
 }
 
 XLA_TEST_F(TransferManagerTest, TransferEmptyTuple) {
@@ -189,7 +189,7 @@ XLA_TEST_F(TransferManagerTest, TransferEmptyTuple) {
                           transfer_manager_->TransferLiteralFromDevice(
                               stream_executor_, device_buffer));
 
-  LiteralTestUtil::ExpectEqual(*literal, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result));
 }
 
 XLA_TEST_F(TransferManagerTest, TransferNestedTuple) {
@@ -209,7 +209,7 @@ XLA_TEST_F(TransferManagerTest, TransferNestedTuple) {
                           transfer_manager_->TransferLiteralFromDevice(
                               stream_executor_, device_buffer));
 
-  LiteralTestUtil::ExpectEqual(*literal, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result));
 }
 
 XLA_TEST_F(TransferManagerTest, TransferComplexValue) {
@@ -224,7 +224,7 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValue) {
                           transfer_manager_->TransferLiteralFromDevice(
                               stream_executor_, device_buffer));
 
-  LiteralTestUtil::ExpectEqual(*literal, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result));
 }
 
 XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) {
@@ -243,7 +243,7 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) {
                           transfer_manager_->TransferLiteralFromDevice(
                               stream_executor_, device_buffer));
 
-  LiteralTestUtil::ExpectEqual(*literal, *result);
+  EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/transpose_test.cc b/tensorflow/compiler/xla/tests/transpose_test.cc
index 59ce23d0247b58c6aebc2b5a65453157c1ca15ff..fe1e3da7eca00e128377e6e56af877868aafa836 100644
--- a/tensorflow/compiler/xla/tests/transpose_test.cc
+++ b/tensorflow/compiler/xla/tests/transpose_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 5c287bac6a7cab5a3c2642971a5a67070ee56c72..41189231b90e842292830a932cf381af60456d4c 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
@@ -496,7 +495,7 @@ XLA_TEST_F(TupleTest, ComplexTuples) {
   auto sum = Literal::CreateR2<complex64>({{{111, 222}, {331, 442}},
                                            {{1011, 2022}, {3031, 4042}},
                                            {{10011, 20022}, {30031, 40042}}});
-  auto prod = Literal::CreateFromShape(sum->shape());
+  auto prod = MakeUnique<Literal>(sum->shape());
   ASSERT_TRUE(prod->Populate<complex64>(
                       [&sum](tensorflow::gtl::ArraySlice<int64> indexes) {
                         return sum->Get<complex64>(indexes) *
@@ -515,7 +514,7 @@ XLA_TEST_F(TupleTest, ComplexTuples) {
 class TupleHloTest : public HloTestBase {};
 
 // Disabled on the interpreter because bitcast doesn't exist on the interpreter.
-TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
+XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
   const char* testcase = R"(
     HloModule m
 
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index 50c8766f2e3976c7077046283ab3b3e762622fc5..c3abe22797f5eaa76ced2ad8534bd68c32983e60 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -84,6 +84,11 @@ int UnaryOpTest::inf<int>() {
   return 2147483647;
 }
 
+template <>
+int64 UnaryOpTest::inf<int64>() {
+  return 0x7FFFFFFFFFFFFFFFl;
+}
+
 template <>
 void UnaryOpTest::AbsTestHelper<complex64>() {
   XlaBuilder builder(TestName());
@@ -176,6 +181,7 @@ XLA_TEST_F(UnaryOpTest, SignTestR0) {
 
 XLA_TEST_F(UnaryOpTest, SignTestR1) {
   SignTestHelper<int>();
+  SignTestHelper<int64>();
   SignTestHelper<float>();
   SignTestHelper<complex64>();
 }
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 7944b5132f3d11cf84488acbd920cc98c084072a..3c9a01653c67203cbc962a3d3d967142f7a2102c 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -84,8 +84,8 @@ Status ParseOneProfileOutputLine(
   string match_percentage = "\\d+\\.\\d\\d%";
   string match_cycles = "(\\d+) cycles +\\( *(" + match_percentage + ")\\)";
   string match_usecs = "([0-9.]+) usec";
-  string match_flops = "([^ ]+)";
-  string match_trops = "([^ ]+)";
+  string match_flops = "([^ ]*)";
+  string match_trops = "([^ ]*)";
   string match_bytes_per_sec = "([0-9.TGMKi]+)B/s";
   string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle";
 
diff --git a/tensorflow/compiler/xla/text_literal_writer.cc b/tensorflow/compiler/xla/text_literal_writer.cc
index 6e3061b78a554f028b2ffae2e0590d91a4fe48e2..373c0d2d8d8ab05dec11e51f265d41b91e7920bf 100644
--- a/tensorflow/compiler/xla/text_literal_writer.cc
+++ b/tensorflow/compiler/xla/text_literal_writer.cc
@@ -30,7 +30,7 @@ limitations under the License.
 
 namespace xla {
 
-/* static */ tensorflow::Status TextLiteralWriter::WriteToPath(
+/* static */ Status TextLiteralWriter::WriteToPath(
     const Literal& literal, tensorflow::StringPiece path) {
   std::unique_ptr<tensorflow::WritableFile> f;
   auto s = tensorflow::Env::Default()->NewWritableFile(std::string(path), &f);
@@ -43,7 +43,7 @@ namespace xla {
     return s;
   }
 
-  tensorflow::Status status;
+  Status status;
   tensorflow::WritableFile* f_ptr = f.get();
   literal.EachCellAsString(
       [f_ptr, &status](tensorflow::gtl::ArraySlice<int64> indices,
diff --git a/tensorflow/compiler/xla/text_literal_writer.h b/tensorflow/compiler/xla/text_literal_writer.h
index 7375493f4309c9bf75fc9d724626267dff7ce5ed..0a1235b5e04675da0f412bafab6c4ecf04367787 100644
--- a/tensorflow/compiler/xla/text_literal_writer.h
+++ b/tensorflow/compiler/xla/text_literal_writer.h
@@ -37,8 +37,8 @@ namespace xla {
 // This should be readable by xla::TextLiteralReader.
 class TextLiteralWriter {
  public:
-  static tensorflow::Status WriteToPath(const Literal& literal,
-                                        tensorflow::StringPiece path);
+  static Status WriteToPath(const Literal& literal,
+                            tensorflow::StringPiece path);
 
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(TextLiteralWriter);
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 78ab2dccafc37aa4f93da0b8d5b39a779ddd5db8..415cf9c16a2613913265d0342e5ab9932de5eb19 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -36,11 +36,10 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service",
-        "//tensorflow/compiler/xla/service:session_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/core:lib",
     ],
 )
@@ -63,10 +62,9 @@ tf_cc_binary(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:interpreter_plugin",
-        "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/core:lib",
     ],
 )
@@ -84,12 +82,10 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:testing",
         "//tensorflow/compiler/xla/service:hlo_proto",
-        "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
@@ -165,12 +161,11 @@ tf_cc_binary(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:computation_tracker",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:interpreter_plugin",
-        "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/core:lib",
     ],
 )
@@ -184,12 +179,11 @@ tf_cc_binary(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:interpreter_plugin",
-        "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/core:lib",
     ],
 )
@@ -202,13 +196,12 @@ tf_cc_binary(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:interpreter_plugin",
-        "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
index 21ae8583d7cd3343230dcaff7dc17456e9e3e702..befb55453777dce30af89bcaad2ffe1647097576 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
@@ -17,7 +17,7 @@ limitations under the License.
 //
 // Dumps a graphviz URL for a snapshot computation to the command line.
 //
-// some_binary_snapshot_proto is obtained by serializing the SessionModule from
+// some_binary_snapshot_proto is obtained by serializing the HloSnapshot from
 // ServiceInterface::SnapshotComputation to disk.
 //
 // The GraphViz URL is placed into the log stderr, whereas computation
@@ -30,11 +30,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -49,10 +48,11 @@ namespace tools {
 void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
   Client* client = ClientLibrary::LocalClientOrDie();
   for (char* arg : args) {
-    SessionModule module;
+    HloSnapshot module;
     TF_CHECK_OK(
         tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
-    Computation computation = client->LoadSnapshot(module).ConsumeValueOrDie();
+    XlaComputation computation =
+        client->LoadSnapshot(module).ConsumeValueOrDie();
     DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags();
     debug_options.set_xla_generate_hlo_graph(".*");
     ComputationStats stats =
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index b82f1c81c84b487c1661af5267b9123da97bb107..cfb8f37487d6499b803438a135be54524fcf17d2 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -21,11 +21,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -66,16 +65,16 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
   LocalService* local_service =
       ClientLibrary::GetXlaService(client->platform());
   for (char* arg : args) {
-    SessionModule session_module;
+    HloSnapshot snapshot;
     TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg,
-                                            &session_module));
-    auto computation_status = client->LoadSnapshot(session_module);
+                                            &snapshot));
+    auto computation_status = client->LoadSnapshot(snapshot);
     if (!computation_status.ok()) {
       fprintf(stderr, "could not load snapshot for %s: %s\n", arg,
               computation_status.status().ToString().c_str());
       continue;
     }
-    Computation computation = computation_status.ConsumeValueOrDie();
+    XlaComputation computation = computation_status.ConsumeValueOrDie();
 
     std::unique_ptr<ProgramShape> program_shape =
         client->GetComputationShape(computation).ConsumeValueOrDie();
@@ -89,8 +88,7 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
     build_options.set_device_ordinal(0);
     build_options.set_result_layout(program_shape->result());
     StatusOr<std::unique_ptr<Executable>> executable =
-        local_service->CompileExecutable(computation.handle(), layouts,
-                                         build_options);
+        local_service->CompileExecutable(computation, layouts, build_options);
 
     const HloModule& module = executable.ValueOrDie()->module();
 
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 05c0fdf97d27c09eb2bbb0f265b5b2a5982ca7b1..b815bbf854b82b323da7879c230a1026cae96625 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -19,11 +19,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/computation_tracker.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -40,16 +39,16 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
   LocalService* local_service =
       ClientLibrary::GetXlaService(client->platform());
   for (char* arg : args) {
-    SessionModule session_module;
+    HloSnapshot snapshot;
     TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg,
-                                            &session_module));
-    auto computation_status = client->LoadSnapshot(session_module);
+                                            &snapshot));
+    auto computation_status = client->LoadSnapshot(snapshot);
     if (!computation_status.ok()) {
       fprintf(stderr, "could not load snapshot for %s: %s\n", arg,
               computation_status.status().ToString().c_str());
       continue;
     }
-    Computation computation = computation_status.ConsumeValueOrDie();
+    XlaComputation computation = computation_status.ConsumeValueOrDie();
 
     if (compile) {
       std::unique_ptr<ProgramShape> program_shape =
@@ -65,8 +64,7 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
       build_options.set_device_ordinal(0);
       build_options.set_result_layout(program_shape->result());
       StatusOr<std::unique_ptr<Executable>> executable =
-          local_service->CompileExecutable(computation.handle(), layouts,
-                                           build_options);
+          local_service->CompileExecutable(computation, layouts, build_options);
 
       const HloModule& module = executable.ValueOrDie()->module();
 
@@ -74,13 +72,11 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
               local_service->backend().platform()->Name().c_str(),
               module.ToString(HloPrintOptions::ShortParsable()).c_str());
     } else {
-      const ComputationTracker& tracker = local_service->computation_tracker();
-      UserComputation* user_computation =
-          tracker.Resolve(computation.handle()).ConsumeValueOrDie();
-      VersionedComputationHandle versioned_handle =
-          user_computation->GetVersionedHandle();
+      auto config = HloModule::CreateModuleConfigFromProto(computation.proto(),
+                                                           DebugOptions())
+                        .ConsumeValueOrDie();
       std::unique_ptr<HloModule> module =
-          tracker.BuildHloModule(versioned_handle, HloModuleConfig())
+          HloModule::CreateFromProto(computation.proto(), config)
               .ConsumeValueOrDie();
 
       fprintf(stdout, "%s\n",
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
index 51f90b07c66f7d839f587350726333b9dbe6a9f0..a5dce20456c6a2402f425ebb3d575d1bb625f839 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
@@ -28,11 +28,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -48,10 +47,11 @@ namespace tools {
 void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
   Client* client = ClientLibrary::LocalClientOrDie();
   for (char* arg : args) {
-    SessionModule module;
+    HloSnapshot module;
     TF_CHECK_OK(
         tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
-    Computation computation = client->LoadSnapshot(module).ConsumeValueOrDie();
+    XlaComputation computation =
+        client->LoadSnapshot(module).ConsumeValueOrDie();
     DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags();
     debug_options.set_xla_generate_hlo_graph(".*");
     debug_options.set_xla_hlo_dump_as_graphdef(true);
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index 3a945fb3b1b54ea92e577a6bea5f771ac0e5defd..d0e7af8844203da93dac5b45cb7e13916448dd47 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -30,6 +30,7 @@ namespace {
 
 using tensorflow::StringPiece;
 using tensorflow::gtl::optional;
+using tensorflow::str_util::Join;
 using tensorflow::str_util::Split;
 using tensorflow::str_util::SplitAndParseAsInts;
 using tensorflow::strings::Printf;
@@ -53,7 +54,7 @@ class HloParser {
   std::unique_ptr<HloModule> ConsumeHloModule() { return std::move(module_); }
 
   // Returns the error information.
-  string GetError() const { return tensorflow::str_util::Join(error_, "\n"); }
+  string GetError() const { return Join(error_, "\n"); }
 
  private:
   // ParseXXX returns false if an error occurred.
@@ -245,7 +246,7 @@ bool HloParser::Error(LocTy loc, StringPiece msg) {
   error_lines.push_back(std::string(lexer_.GetLine(loc)));
   error_lines.push_back(col == 0 ? "" : StrCat(string(col - 1, ' '), "^"));
 
-  error_.push_back(tensorflow::str_util::Join(error_lines, "\n"));
+  error_.push_back(Join(error_lines, "\n"));
   VLOG(1) << "Error: " << error_.back();
   return false;
 }
@@ -439,6 +440,10 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
   optional<OpMetadata> metadata;
   attrs["metadata"] = {/*required=*/false, AttrTy::kMetadata, &metadata};
 
+  optional<string> backend_config;
+  attrs["backend_config"] = {/*required=*/false, AttrTy::kString,
+                             &backend_config};
+
   HloInstruction* instruction;
   switch (opcode) {
     case HloOpcode::kParameter: {
@@ -476,10 +481,12 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
+    case HloOpcode::kExpm1:
     case HloOpcode::kImag:
     case HloOpcode::kIsFinite:
     case HloOpcode::kFloor:
     case HloOpcode::kLog:
+    case HloOpcode::kLog1p:
     case HloOpcode::kNot:
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
@@ -1093,8 +1100,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
 
   instruction->set_name(name);
 
-  // Add common attrs (sharding, control predecessors) to the instruction, if
-  // they were seen.
+  // Add shared attributes like metadata to the instruction, if they were seen.
   if (sharding) {
     instruction->set_sharding(
         HloSharding::FromProto(sharding.value()).ValueOrDie());
@@ -1111,6 +1117,9 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
   if (metadata) {
     instruction->set_metadata(*metadata);
   }
+  if (backend_config) {
+    instruction->set_backend_config(std::move(*backend_config));
+  }
   return AddInstruction(name, instruction, name_loc);
 }  // NOLINT(readability/fn_size)
 
@@ -1488,11 +1497,10 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr<Literal>* literal,
     std::vector<int64> elems_seen_until_dim(elems_seen_per_dim.begin(),
                                             elems_seen_per_dim.begin() + dim);
     return StrCat("[",
-                  tensorflow::str_util::Join(
-                      elems_seen_until_dim, ",",
-                      [](string* out, const int64& num_elems) {
-                        tensorflow::strings::StrAppend(out, num_elems - 1);
-                      }),
+                  Join(elems_seen_until_dim, ",",
+                       [](string* out, const int64& num_elems) {
+                         tensorflow::strings::StrAppend(out, num_elems - 1);
+                       }),
                   "]");
   };
   do {
@@ -1680,7 +1688,7 @@ bool HloParser::ParseSparseLiteralHelper(std::unique_ptr<Literal>* literal,
         return Error(
             index_loc,
             StrCat("invalid multi-dimension index for shape with rank ", rank,
-                   ": [", tensorflow::str_util::Join(index, ", "), "]"));
+                   ": [", Join(index, ", "), "]"));
       }
     }
     if (!ParseToken(TokKind::kColon,
@@ -1848,7 +1856,19 @@ bool HloParser::ParseAttributeHelper(
   }
   auto attr_it = attrs.find(name);
   if (attr_it == attrs.end()) {
-    return Error(loc, Printf("unexpected attribute %s", name.c_str()));
+    string allowed_attrs;
+    if (attrs.empty()) {
+      allowed_attrs = "No attributes are allowed here.";
+    } else {
+      allowed_attrs = StrCat(
+          "Allowed attributes: ",
+          Join(attrs, ", ",
+               [&](string* out, const std::pair<string, AttrConfig>& kv) {
+                 StrAppend(out, kv.first);
+               }));
+    }
+    return Error(loc, Printf("unexpected attribute \"%s\".  %s", name.c_str(),
+                             allowed_attrs.c_str()));
   }
   AttrTy attr_type = attr_it->second.attr_type;
   void* attr_out_ptr = attr_it->second.result;
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index 4e085bc89c6dc6021a5b9cb1c5a57f0282f41ee1..131aded95ab04c4327c275ed8cd18b8fc7ac1bd6 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -65,7 +65,7 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
 R"(HloModule constant_pred_module
 
 ENTRY %constant_pred () -> pred[] {
-  ROOT %constant = pred[] constant(true), metadata={op_type="const" op_name="\"it\'s not a problem\n" source_file="path/to/test.cc" source_line=68}
+  ROOT %constant = pred[] constant(true), metadata={op_type="const" op_name="\"it\'s not a problem\n" source_file="path/to/test.cc" source_line=68}, backend_config="foo\" bar"
 }
 
 )"
@@ -81,13 +81,14 @@ ENTRY %constant_s32 () -> s32[] {
 
 )"
 },
-// f32 constant, but the value is not a decimal
+// f32 constant, but the value is not a decimal and there is a backend
+// configuration
 {
 "ConstantF32",
 R"(HloModule ConstantF32_module
 
 ENTRY %ConstantF32.v4 () -> f32[] {
-  ROOT %constant = f32[] constant(42)
+  ROOT %constant = f32[] constant(42), backend_config="this is a configuration"
 }
 
 )"
@@ -937,13 +938,13 @@ INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserShortTest,
 TEST_F(HloParserTest, Empty) {
   const string original = "";
   auto result = Parse(original);
-  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  EXPECT_NE(Status::OK(), result.status());
 }
 
 TEST_F(HloParserTest, Garbage) {
   const string original = "HloModule thi$ str1ng makes# N0 sen$e @all!*&^%$";
   auto result = Parse(original);
-  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  EXPECT_NE(Status::OK(), result.status());
 }
 
 TEST_F(HloParserTest, WrongOpcode) {
@@ -957,7 +958,7 @@ ENTRY %blabla (x: f32[], y: f32[]) -> f32[] {
 
 )";
   auto result = Parse(original);
-  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  EXPECT_NE(Status::OK(), result.status());
 }
 
 TEST_F(HloParserTest, WrongShape) {
@@ -969,7 +970,7 @@ ENTRY %blabla (x: g32[]) -> g32[] {
 
 )";
   auto result = Parse(original);
-  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  EXPECT_NE(Status::OK(), result.status());
 }
 
 TEST_F(HloParserTest, WrongOperandsSize) {
@@ -982,7 +983,7 @@ ENTRY %blabla (x: f32[]) -> pred[] {
 
 )";
   auto result = Parse(original);
-  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  EXPECT_NE(Status::OK(), result.status());
 }
 
 TEST_F(HloParserTest, OperandNotFound) {
@@ -993,7 +994,7 @@ ENTRY %blabla (x: f32[]) -> pred[] {
 }
 )";
   auto result = Parse(original);
-  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  EXPECT_NE(Status::OK(), result.status());
 }
 
 TEST_F(HloParserTest, MoreConstants) {
@@ -1013,6 +1014,19 @@ ENTRY %SelectScalarS32True.v4 () -> s32[] {
   // but the constant names will not be exactly the same.
 }
 
+TEST_F(HloParserTest, ConfigurationField) {
+  const string original = R"(HloModule AModule
+ENTRY %configuration_test() -> s32[] {
+  %constant = s32[] constant(42), backend_config="foo bar"
+})";
+  auto result = Parse(original);
+  TF_ASSERT_OK(result.status());
+  EXPECT_EQ("foo bar", result.ValueOrDie()
+                           ->entry_computation()
+                           ->root_instruction()
+                           ->backend_config());
+}
+
 TEST_F(HloParserTest, LiteralDimensionsMismatch_1) {
   const string original = R"(HloModule some_2_module
 
@@ -1022,7 +1036,7 @@ ENTRY %some_2 () -> f32[2] {
 
 )";
   auto result = Parse(original);
-  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  EXPECT_NE(Status::OK(), result.status());
   ExpectHasSubstr(result.status().error_message(),
                   "expects nested array in rank 1, but sees larger");
 }
@@ -1036,7 +1050,7 @@ ENTRY %some_2x3 () -> f32[2,3] {
 
 )";
   auto result = Parse(original);
-  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  EXPECT_NE(Status::OK(), result.status());
   ExpectHasSubstr(result.status().error_message(),
                   "expects nested array in rank 2, but sees 1");
 }
@@ -1050,7 +1064,7 @@ ENTRY %some_2x3x2 () -> f32[2,3,2] {
 
 )";
   auto result = Parse(original);
-  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  EXPECT_NE(Status::OK(), result.status());
   ExpectHasSubstr(result.status().error_message(),
                   "expects 3 elements in the [0]th element");
 }
@@ -1065,7 +1079,7 @@ ENTRY %ConstantF16Overflow.v4 () -> f16[] {
 
 )";
   auto result = Parse(original);
-  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  EXPECT_NE(Status::OK(), result.status());
   ExpectHasSubstr(result.status().error_message(),
                   "is out of range for literal's primitive type F16");
 }
@@ -1092,7 +1106,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
   %input = f32[1,2,1]{2,1,0} parameter(0)
   %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
   %filter = f32[1,1,1]{2,1,0} parameter(1)
-  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), sharding={maximal device=1}, dim_labels=b0f_0io->b0f, window={pad=1_1 size=2}
+  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), sharding={maximal device=1}, backend_config="foo", dim_labels=b0f_0io->b0f, window={pad=1_1 size=2}
 }
 
 )";
@@ -1138,7 +1152,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 
 )";
   ExpectHasSubstr(Parse(original).status().error_message(),
-                  "unexpected attribute calls");
+                  "unexpected attribute \"calls\"");
 }
 
 TEST_F(HloParserTest, MissingAttribute) {
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index d8cedad65ea68ef86b94394a1accf2c08517c0b2..df0501386c1e4de9111fbb6b2d9e8ec372dbf41e 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -17,7 +17,7 @@ limitations under the License.
 //
 // Replays computations and shows the results on the command line.
 //
-// some_binary_snapshot_proto is obtained by serializing the SessionModule from
+// some_binary_snapshot_proto is obtained by serializing the HloSnapshot from
 // ServiceInterface::SnapshotComputation to disk.
 //
 // Computations that require arguments can be replayed using fake data by
@@ -36,14 +36,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/testing.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -76,13 +74,9 @@ struct Options {
 //
 // Similarly, infeeds fake data of shape fake_infeed_shape if it is provided;
 // otherwise, no infeed is performed.
-template <typename ModuleT>
-StatusOr<std::unique_ptr<Literal>> ReplayComputation(const ModuleT& module,
+StatusOr<std::unique_ptr<Literal>> ReplayComputation(const HloSnapshot& module,
                                                      Client* client,
                                                      const Options& opts) {
-  static_assert(std::is_same<ModuleT, HloSnapshot>::value ||
-                    std::is_same<ModuleT, SessionModule>::value,
-                "Proto must be in HloSnapshot or SessionModule format");
   TF_ASSIGN_OR_RETURN(auto computation, client->LoadSnapshot(module));
 
   std::vector<std::unique_ptr<GlobalData>> arguments;
@@ -161,40 +155,13 @@ int RealMain(tensorflow::gtl::ArraySlice<char*> args, const Options& opts) {
   for (char* arg : args) {
     HloSnapshot snapshot;
     auto status = tensorflow::ReadBinaryProto(env, arg, &snapshot);
-    if (status.ok()) {
-      StatusOr<std::unique_ptr<Literal>> result_status =
-          ReplayComputation(snapshot, client, opts);
-      if (!result_status.ok()) {
-        fprintf(stderr, "%s: error: %s\n", arg,
-                result_status.status().ToString().c_str());
-        exit_status = EXIT_FAILURE;
-        continue;
-      }
-
-      std::unique_ptr<Literal> result = result_status.ConsumeValueOrDie();
-      if (result != nullptr) {
-        fprintf(stdout, "%s: %s :: %s:%s\n", arg,
-                snapshot.hlo().hlo_module().name().c_str(),
-                ShapeUtil::HumanString(result->shape()).c_str(),
-                result->ToString().c_str());
-        if (snapshot.has_result()) {
-          std::unique_ptr<Literal> literal =
-              Literal::CreateFromProto(snapshot.result()).ConsumeValueOrDie();
-          fprintf(stdout, "was %s:%s\n",
-                  ShapeUtil::HumanString(snapshot.result().shape()).c_str(),
-                  literal->ToString().c_str());
-        }
-      }
-
+    if (!status.ok()) {
+      fprintf(stderr, "%s: is not HloSnapshot: %s.\n", arg,
+              status.ToString().c_str());
       continue;
     }
-    fprintf(stderr, "%s: is not HloSnapshot: %s. Trying as SessionModule...\n",
-            arg, status.ToString().c_str());
-
-    SessionModule module;
-    TF_CHECK_OK(tensorflow::ReadBinaryProto(env, arg, &module));
     StatusOr<std::unique_ptr<Literal>> result_status =
-        ReplayComputation(module, client, opts);
+        ReplayComputation(snapshot, client, opts);
     if (!result_status.ok()) {
       fprintf(stderr, "%s: error: %s\n", arg,
               result_status.status().ToString().c_str());
@@ -204,14 +171,15 @@ int RealMain(tensorflow::gtl::ArraySlice<char*> args, const Options& opts) {
 
     std::unique_ptr<Literal> result = result_status.ConsumeValueOrDie();
     if (result != nullptr) {
-      fprintf(stdout, "%s: %s :: %s:%s\n", arg, module.entry().name().c_str(),
+      fprintf(stdout, "%s: %s :: %s:%s\n", arg,
+              snapshot.hlo().hlo_module().name().c_str(),
               ShapeUtil::HumanString(result->shape()).c_str(),
               result->ToString().c_str());
-      if (module.has_result()) {
+      if (snapshot.has_result()) {
         std::unique_ptr<Literal> literal =
-            Literal::CreateFromProto(module.result()).ConsumeValueOrDie();
+            Literal::CreateFromProto(snapshot.result()).ConsumeValueOrDie();
         fprintf(stdout, "was %s:%s\n",
-                ShapeUtil::HumanString(module.result().shape()).c_str(),
+                ShapeUtil::HumanString(snapshot.result().shape()).c_str(),
                 literal->ToString().c_str());
       }
     }
diff --git a/tensorflow/compiler/xla/tools/show_signature.cc b/tensorflow/compiler/xla/tools/show_signature.cc
index 1f3340cbc6afa9bda8bf639d01b8185968f79a4d..4e53fafcc97ff53afc5713e7ed8ee5222fac316b 100644
--- a/tensorflow/compiler/xla/tools/show_signature.cc
+++ b/tensorflow/compiler/xla/tools/show_signature.cc
@@ -18,7 +18,7 @@ limitations under the License.
 // Shows the signature (ProgramShape) of binary snapshot proto(s) on the command
 // line.
 //
-// some_binary_snapshot_proto is obtained by serializing the SessionModule from
+// some_binary_snapshot_proto is obtained by serializing the HloSnapshot from
 // ServiceInterface::SnapshotComputation to disk.
 //
 // The output format is:
@@ -31,9 +31,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -49,13 +48,14 @@ namespace tools {
 void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
   Client* client = ClientLibrary::LocalClientOrDie();
   for (char* arg : args) {
-    SessionModule module;
+    HloSnapshot module;
     TF_CHECK_OK(
         tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
-    Computation computation = client->LoadSnapshot(module).ConsumeValueOrDie();
+    auto computation = client->LoadSnapshot(module).ConsumeValueOrDie();
     std::unique_ptr<ProgramShape> shape =
         client->GetComputationShape(computation).ConsumeValueOrDie();
-    fprintf(stdout, "%s: %s :: %s\n", arg, module.entry().name().c_str(),
+    fprintf(stdout, "%s: %s :: %s\n", arg,
+            module.hlo().hlo_module().name().c_str(),
             ShapeUtil::HumanString(*shape).c_str());
   }
 }
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 750d72d797b4f8680e13597ac02f6f9fa6e37bcd..b895ac045c361b2336e0081eadf16334d49d3bee 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -814,6 +814,12 @@ enum UnaryOperation {
 
   // Elementwise, computes clz(x).
   UNOP_CLZ = 17;
+
+  // Elementwise, computes exp(x)-1.
+  UNOP_EXPM1 = 18;
+
+  // Elementwise, computes log(x+1).
+  UNOP_LOG1P = 19;
 }
 
 message UnaryOpRequest {
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index abdbdb4cd22ff38a0fae89af10c600a178d9a3d4..0f9c80404ad33c39ae783e0bfa3cfb26e342fe3d 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -71,6 +71,7 @@ py_library(
         "//tensorflow/contrib/memory_stats:memory_stats_py",
         "//tensorflow/contrib/meta_graph_transform",
         "//tensorflow/contrib/metrics:metrics_py",
+        "//tensorflow/contrib/mixed_precision:mixed_precision",
         "//tensorflow/contrib/model_pruning",
         "//tensorflow/contrib/nccl:nccl_py",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 9f5459f41da3e5a13286f7002e4b519978bc189b..9aad772f0acd941d50d6ba238d345616195a6939 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -60,6 +60,7 @@ from tensorflow.contrib import lookup
 from tensorflow.contrib import losses
 from tensorflow.contrib import memory_stats
 from tensorflow.contrib import metrics
+from tensorflow.contrib import mixed_precision
 from tensorflow.contrib import model_pruning
 from tensorflow.contrib import nccl
 from tensorflow.contrib import nn
diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD
index 60306ebdc6cddb04e8807bfd495fa92a56e55ecd..c10179ba8b290b6209f5567d6323df4bcf711585 100644
--- a/tensorflow/contrib/android/BUILD
+++ b/tensorflow/contrib/android/BUILD
@@ -72,7 +72,7 @@ cc_binary(
         "-s",
         "-Wl,--gc-sections",
         "-Wl,--version-script",  # This line must be directly followed by LINKER_SCRIPT.
-        LINKER_SCRIPT,
+        "$(location {})".format(LINKER_SCRIPT),
     ]),
     linkshared = 1,
     linkstatic = 1,
diff --git a/tensorflow/contrib/autograph/CONTRIBUTING.md b/tensorflow/contrib/autograph/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..a7a3fe1452d2a3e9c2a37a25ae96f541f8f939e0
--- /dev/null
+++ b/tensorflow/contrib/autograph/CONTRIBUTING.md
@@ -0,0 +1,45 @@
+# How to Contribute
+
+We'd love to have your patches and contributions! Here are some guidelines. In general, we follow the [TensorFlow contributing guidelines](../../CONTRIBUTING.md), but have some [AutoGraph-specific style guidelines](STYLE_GUIDE.md). More details below.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult [GitHub
+Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+After a pull request is approved, we merge it. Note our merging process differs
+from GitHub in that we pull and submit the change into an internal version
+control system. This system automatically pushes a git commit to the GitHub
+repository (with credit to the original author) and closes the pull request.
+
+## Style
+
+See the [TensorFlow AutoGraph style guide](STYLE_GUIDE.md).
+
+## Unit tests
+
+Please include unit tests when contributing new features ([example here](converters/continue_statements_test.py)), as they help to a) prove that your code works correctly, and b) guard against future breaking
+changes to lower the maintenance cost.
+It's also helpful to check that any
+changes you propose do not break existing unit tests. You can run tests using the command,
+
+```shell
+bazel test --config=opt --copt=-O3 --copt=-march=native \
+  //tensorflow/contrib/autograph/...
+```
+
+from the root of the `tensorflow` repository. For more details see the [main TensorFlow Contributing File](../../CONTRIBUTING.md)
diff --git a/tensorflow/contrib/autograph/README.md b/tensorflow/contrib/autograph/README.md
index 0ba99c396fc1c8ee1e12fbb4fe0293ee52ed9bc9..674859bed4ec157d5d5b33b6fc015c930e54b392 100644
--- a/tensorflow/contrib/autograph/README.md
+++ b/tensorflow/contrib/autograph/README.md
@@ -1,6 +1,6 @@
 # AutoGraph
 
-IMPORTANT: AutoGraph is pre-alpha, under active development. Expect rough edges and bugs, but if you try it, we appreciate early feedback!
+IMPORTANT: AutoGraph is alpha software, and under active development. Expect rough edges and bugs, but if you try it, we appreciate early feedback! We'd also love contributions ([please see our contributing guidelines](CONTRIBUTING.md) and our [style guide](STYLE_GUIDE.md)).
 
 AutoGraph is a Python to TensorFlow compiler.
 
diff --git a/tensorflow/contrib/autograph/STYLE_GUIDE.md b/tensorflow/contrib/autograph/STYLE_GUIDE.md
new file mode 100644
index 0000000000000000000000000000000000000000..5618ec3e34499ad0f0b2a0d8b0ad04c11ee9bf9c
--- /dev/null
+++ b/tensorflow/contrib/autograph/STYLE_GUIDE.md
@@ -0,0 +1,125 @@
+# TensorFlow AutoGraph Style Guide
+
+This page contains style decisions that both developers and users of TensorFlow
+AutoGraph should follow to increase the readability of their code, reduce the
+number of errors, and promote consistency. We borrow many style principles from the TensorFlow Probability style guide.
+
+## TensorFlow Style
+
+Follow the [TensorFlow style
+guide](https://www.tensorflow.org/community/style_guide) and [documentation
+guide](https://www.tensorflow.org/community/documentation). Below are additional
+TensorFlow conventions not noted in those guides. In the future, these noted
+conventions may be moved upstream.
+
+1.  The name is TensorFlow, not Tensorflow.
+2.  The name is AutoGraph, not Autograph.
+
+## TensorFlow Code of Conduct
+Please review and follow the [TensorFlow Code of Conduct](../../CODE_OF_CONDUCT.md).
+
+## TensorFlow AutoGraph Style
+
+Below are TensorFlow AutoGraph-specific conventions. In the event of conflict,
+it supercedes all previous conventions.
+
+1.  __Importing submodule aliases.__ Use the Pythonic style 
+`from tensorflow.contrib.autograph.converters import ifexp` and `from tensorflow.contrib import autograph as ag`.
+
+2.  __Examples in Docstrings.__ Write a `#### Examples` subsection below `Args`,
+    `Returns`, `Raises`, etc. to illustrate examples. If the docstring's last
+    line is a fence bracket (\`\`\`) closing a code snippet, add an empty line
+    before closing the docstring with \"\"\". This properly displays the code
+    snippet.
+
+    Justification: Users regularly need to remind themselves of args and
+    semantics. But rarely look at examples more than the first time. But since
+    examples are usually long (which is great!) it means they have to do a lot
+    of annoying scrolling ...unless Examples follow Args/Returns/Raises.
+
+3.  __Citations in Docstrings.__ Write a `#### References` subsection at the
+    bottom of any docstring with citations. Use ICLR’s bibliography style to
+    write references; for example, order entries by the first author's last
+    name. Add a link to the paper if the publication is open source (ideally,
+    arXiv).
+
+    Write in-paragraph citations in general, e.g., [(Tran and Blei, 2018)][1].
+    Write in-text citations when the citation is a noun, e.g., [Tran and Blei
+    (2018)][1]. Write citations with more than two authors using et al., e.g.,
+    [(Tran et al., 2018)][1]. Separate multiple citations with semicolon, e.g.,
+    ([Tran and Blei, 2018][1]; [Gelman and Rubin, 1992][2]).
+
+    Examples:
+
+    ```none
+    #### References
+
+    # technical report
+    [1]: Tony Finch. Incremental calculation of weighted mean and variance.
+         _Technical Report_, 2009.
+         http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
+
+    # journal
+    [2]: Andrew Gelman and Donald B. Rubin. Inference from Iterative Simulation
+         Using Multiple Sequences. _Statistical Science_, 7(4):457-472, 1992.
+
+    # arXiv preprint
+    # use "et al." for papers with too many authors to maintain
+    [3]: Aaron van den Oord et al. Parallel WaveNet: Fast High-Fidelity Speech
+         Synthesis. _arXiv preprint arXiv:1711.10433_, 2017.
+         https://arxiv.org/abs/1711.10433
+
+    # conference
+    [4]: Yeming Wen, Paul Vicol, Jimmy Ba, Dustin Tran, and Roger Grosse.
+         Flipout: Efficient Pseudo-Independent Weight Perturbations on
+         Mini-Batches. In _International Conference on Learning
+         Representations_, 2018.
+         https://arxiv.org/abs/1803.04386
+    ```
+
+4.  When doing float math over literals eg use `1.` instead of `1` or `1.0`.
+
+    *   Using `1.` is another line of defense against an automatic casting
+        mistake. (Using `1.0` is also such a defense but is not minimal.)
+
+5.  Prefer using named args for functions' 2nd args onward.
+
+    *   Definitely use named args for 2nd args onward in docstrings.
+
+9.  Avoid LaTeX in docstrings.
+
+    *   It is not rendered in many (if not most) editors and can be hard to read
+        for both LaTeX experts and non-experts.
+
+10. Write docstring and comment math using ASCII friendly notation; python using
+    operators. E.g., `x**2` better than `x^2`, `x[i, j]` better than `x_{i,j}`,
+    `sum{ f(x[i]) : i=1...n }` better than `\sum_{i=1}^n f(x_i)` `int{sin(x) dx:
+    x in [0, 2 pi]}` better than `\int_0^{2\pi} sin(x) dx`.
+
+    *   The more we stick to python style, the more someone can
+        copy/paste/execute.
+    *   Python style is usually easier to read as ASCII.
+
+11. All public functions require docstrings with: one line description, Args,
+    Returns, Raises (if raises exceptions).
+
+    *   Returns docstrings should be in the same format as Args, eg, of the form
+        "name: Description." Part of the rationale is that we are suggesting a
+        reasonable variable name for the returned object(s).
+
+12. Regard `*args` and/or `**kwargs` as features of last resort.
+
+    *   Keyword arguments make the intention of a function call more clear.
+    *   [Possible exceptions for
+        `kwargs`](https://stackoverflow.com/questions/1415812/why-use-kwargs-in-python-what-are-some-real-world-advantages-over-using-named).
+
+18. The `__init__.py` file for modules should use TensorFlow's
+    `remove_undocumented` feature, which seals the module's methods.
+
+21. Use `"{}".format()` rather than `"" %` for string formatting.
+
+    Justification: [PEP 3101](https://www.python.org/dev/peps/pep-3101/) and
+    [Python official
+    tutorials](https://docs.python.org/3.2/tutorial/inputoutput.html#old-string-formatting):
+    "...this old style of formatting will eventually be removed from the
+    language, str.format() should generally be used."
diff --git a/tensorflow/contrib/autograph/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py
index 1be1c96dd31bf05b746fae6a2b02774e20ca0c4f..35877224b87c1abda1a270be4869e9dcfd0cf97c 100644
--- a/tensorflow/contrib/autograph/converters/break_statements.py
+++ b/tensorflow/contrib/autograph/converters/break_statements.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import gast
+
 from tensorflow.contrib.autograph.pyct import anno
 from tensorflow.contrib.autograph.pyct import templates
 from tensorflow.contrib.autograph.pyct import transformer
@@ -52,8 +54,13 @@ class BreakStatementTransformer(transformer.Base):
 
   def _guard_if_present(self, block, var_name):
     """Prevents the block from executing if var_name is set."""
+
+    # If we don't have statements that immediately depend on the break
+    # we still need to make sure that the break variable remains
+    # used, in case the break becomes useful in later stages of transformation.
+    # Not having this broke the break_in_inner_loop test.
     if not block:
-      return block
+      block = [gast.Pass()]
     template = """
         if not var_name:
           block
diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py
index 554f0471d44d54194c45c3855b1483796ae65a6a..b6ecdcb7809b1ad7e7461324cb6a110ef4180609 100644
--- a/tensorflow/contrib/autograph/converters/call_trees.py
+++ b/tensorflow/contrib/autograph/converters/call_trees.py
@@ -292,15 +292,25 @@ class CallTreeTransformer(transformer.Base):
         raise NotImplementedError(
             'py_func with return values (unknown function)')
     else:
+      if anno.hasanno(node.func, anno.Basic.QN):
+        # Special-case a few builtins that otherwise go undetected. This
+        # normally doesn't pose a problem, but the dict built-in doesn't
+        # work with inspect.getargspec which is required for dynamic functions.
+        # Note: expecting this is resilient to aliasing (e.g.
+        # dict = an_evil_dict), because in those cases the regular mechanisms
+        # process a simple user function.
+        qn = anno.getanno(node.func, anno.Basic.QN)
+        # Add items to this list as needed.
+        if str(qn) in ('dict',):
+          return node
+
       if ast_util.matches(node, 'super(_)'):
         # super() calls are preserved. The class conversion mechanism will
         # ensure that they return the correct value.
-        pass
-      elif self.context.recursive:
+        return node
+
+      if self.context.recursive:
         node = self._insert_dynamic_conversion(node)
-      else:
-        # Unresolved functions are allowed in non-recursive mode.
-        pass
     return node
 
 
diff --git a/tensorflow/contrib/autograph/converters/control_flow.py b/tensorflow/contrib/autograph/converters/control_flow.py
index 935a2786db0289c67860be2da97e3f554f12500c..d7ddbe8a04f64848d6ec21155d8d85f60e19d276 100644
--- a/tensorflow/contrib/autograph/converters/control_flow.py
+++ b/tensorflow/contrib/autograph/converters/control_flow.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Handles control flow statements: while, if."""
+"""Handles control flow statements: while, for, if."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -25,6 +25,7 @@ from tensorflow.contrib.autograph.pyct import ast_util
 from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.contrib.autograph.pyct import templates
 from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.contrib.autograph.pyct.static_analysis import cfg
 from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
@@ -47,9 +48,6 @@ class SymbolNamer(object):
 class ControlFlowTransformer(transformer.Base):
   """Transforms control flow structures like loops an conditionals."""
 
-  def __init__(self, context):
-    super(ControlFlowTransformer, self).__init__(context)
-
   def _create_cond_branch(self, body_name, aliased_orig_names,
                           aliased_new_names, body, returns):
     if aliased_orig_names:
@@ -98,30 +96,63 @@ class ControlFlowTransformer(transformer.Base):
 
     body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
     orelse_scope = anno.getanno(node, NodeAnno.ORELSE_SCOPE)
-
-    if body_scope.created - orelse_scope.created:
-      raise ValueError(
-          'The if branch creates new symbols that the else branch does not.')
-    if orelse_scope.created - body_scope.created:
-      raise ValueError(
-          'The else branch creates new symbols that the if branch does not.')
-
-    modified = tuple(body_scope.modified | orelse_scope.modified)
-    all_referenced = body_scope.referenced | orelse_scope.referenced
+    body_defs = body_scope.created | body_scope.modified
+    orelse_defs = orelse_scope.created | orelse_scope.modified
+    live = anno.getanno(node, 'live_out')
+
+    # We'll need to check if we're closing over variables that are defined
+    # elsewhere in the function
+    # NOTE: we can only detect syntactic closure in the scope
+    # of the code passed in. If the AutoGraph'd function itself closes
+    # over other variables, this analysis won't take that into account.
+    defined = anno.getanno(node, 'defined_in')
+
+    # We only need to return variables that are
+    # - modified by one or both branches
+    # - live (or has a live parent) at the end of the conditional
+    modified = []
+    for def_ in body_defs | orelse_defs:
+      def_with_parents = set((def_,)) | def_.support_set
+      if live & def_with_parents:
+        modified.append(def_)
+
+    # We need to check if live created variables are balanced
+    # in both branches
+    created = live & (body_scope.created | orelse_scope.created)
+
+    # The if statement is illegal if there are variables that are created,
+    # that are also live, but both branches don't create them.
+    if created:
+      if created != (body_scope.created & live):
+        raise ValueError(
+            'The main branch does not create all live symbols that the else '
+            'branch does.')
+      if created != (orelse_scope.created & live):
+        raise ValueError(
+            'The else branch does not create all live symbols that the main '
+            'branch does.')
 
     # Alias the closure variables inside the conditional functions
     # to avoid errors caused by the local variables created in the branch
     # functions.
-    need_alias = (
-        (body_scope.modified | orelse_scope.modified) -
-        (body_scope.created | orelse_scope.created))
-    aliased_orig_names = tuple(need_alias)
-    aliased_new_names = tuple(
-        self.context.namer.new_symbol(s.ssf(), all_referenced)
-        for s in aliased_orig_names)
-    alias_map = dict(zip(aliased_orig_names, aliased_new_names))
-    node_body = ast_util.rename_symbols(node.body, alias_map)
-    node_orelse = ast_util.rename_symbols(node.orelse, alias_map)
+    # We will alias variables independently for body and orelse scope,
+    # because different branches might write different variables.
+    aliased_body_orig_names = tuple(body_scope.modified - body_scope.created)
+    aliased_orelse_orig_names = tuple(orelse_scope.modified -
+                                      orelse_scope.created)
+    aliased_body_new_names = tuple(
+        self.context.namer.new_symbol(s.ssf(), body_scope.referenced)
+        for s in aliased_body_orig_names)
+    aliased_orelse_new_names = tuple(
+        self.context.namer.new_symbol(s.ssf(), orelse_scope.referenced)
+        for s in aliased_orelse_orig_names)
+
+    alias_body_map = dict(zip(aliased_body_orig_names, aliased_body_new_names))
+    alias_orelse_map = dict(
+        zip(aliased_orelse_orig_names, aliased_orelse_new_names))
+
+    node_body = ast_util.rename_symbols(node.body, alias_body_map)
+    node_orelse = ast_util.rename_symbols(node.orelse, alias_orelse_map)
 
     if not modified:
       # When the cond would return no value, we leave the cond called without
@@ -134,26 +165,47 @@ class ControlFlowTransformer(transformer.Base):
     else:
       results = gast.Tuple([s.ast() for s in modified], None)
 
-    body_name = self.context.namer.new_symbol('if_true', all_referenced)
-    orelse_name = self.context.namer.new_symbol('if_false', all_referenced)
+    body_name = self.context.namer.new_symbol('if_true', body_scope.referenced)
+    orelse_name = self.context.namer.new_symbol('if_false',
+                                                orelse_scope.referenced)
     if modified:
-      body_returns = tuple(
-          alias_map[s] if s in aliased_orig_names else s for s in modified)
+
+      def build_returns(aliased_names, alias_map, scope):
+        """Builds list of return variables for a branch of a conditional."""
+        returns = []
+        for s in modified:
+          if s in aliased_names:
+            returns.append(alias_map[s])
+          else:
+            if s not in scope.created | defined:
+              raise ValueError(
+                  'Attempting to return variable "%s" from the true branch of '
+                  'a conditional, but it was not closed over, or created in '
+                  'this branch.' % str(s))
+            else:
+              returns.append(s)
+        return tuple(returns)
+
+      body_returns = build_returns(aliased_body_orig_names, alias_body_map,
+                                   body_scope)
+      orelse_returns = build_returns(aliased_orelse_orig_names,
+                                     alias_orelse_map, orelse_scope)
+
     else:
-      body_returns = templates.replace('tf.ones(())')[0].value
+      body_returns = orelse_returns = templates.replace('tf.ones(())')[0].value
 
     body_def = self._create_cond_branch(
         body_name,
-        aliased_orig_names=tuple(aliased_orig_names),
-        aliased_new_names=tuple(aliased_new_names),
+        aliased_orig_names=tuple(aliased_body_orig_names),
+        aliased_new_names=tuple(aliased_body_new_names),
         body=node_body,
         returns=body_returns)
     orelse_def = self._create_cond_branch(
         orelse_name,
-        aliased_orig_names=tuple(aliased_orig_names),
-        aliased_new_names=tuple(aliased_new_names),
+        aliased_orig_names=tuple(aliased_orelse_orig_names),
+        aliased_new_names=tuple(aliased_orelse_new_names),
         body=node_orelse,
-        returns=body_returns)
+        returns=orelse_returns)
     cond_expr = self._create_cond_expr(results, node.test, body_name,
                                        orelse_name)
 
@@ -284,6 +336,7 @@ class ControlFlowTransformer(transformer.Base):
 
 
 def transform(node, context):
-  t = ControlFlowTransformer(context)
-  node = t.visit(node)
+  cfg.run_analyses(node, cfg.Liveness(context))
+  cfg.run_analyses(node, cfg.Defined(context))
+  node = ControlFlowTransformer(context).visit(node)
   return node
diff --git a/tensorflow/contrib/autograph/converters/control_flow_test.py b/tensorflow/contrib/autograph/converters/control_flow_test.py
index c5610b16b4e5de374f404307d3583660707d5e0b..1a863590f97add9bfa587d1142a09ae26a9fdb44 100644
--- a/tensorflow/contrib/autograph/converters/control_flow_test.py
+++ b/tensorflow/contrib/autograph/converters/control_flow_test.py
@@ -22,6 +22,7 @@ from tensorflow.contrib.autograph.converters import control_flow
 from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
 
@@ -95,6 +96,91 @@ class ControlFlowTest(converter_test_base.TestCase):
       with self.test_session() as sess:
         self.assertEqual(-1, sess.run(result.test_fn(constant_op.constant(1))))
 
+  def test_imbalanced_aliasing(self):
+
+    def test_fn(n):
+      if n > 0:
+        n = 3
+      return n
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node, control_flow_ops.cond) as result:
+      with self.test_session() as sess:
+        self.assertEqual(3, sess.run(result.test_fn(constant_op.constant(2))))
+        self.assertEqual(-3, sess.run(result.test_fn(constant_op.constant(-3))))
+
+  def test_ignore_unread_variable(self):
+
+    def test_fn(n):
+      b = 3  # pylint: disable=unused-variable
+      if n > 0:
+        b = 4
+      return n
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node, control_flow_ops.cond, array_ops.ones) as result:
+      with self.test_session() as sess:
+        self.assertEqual(3, sess.run(result.test_fn(constant_op.constant(3))))
+        self.assertEqual(-3, sess.run(result.test_fn(constant_op.constant(-3))))
+
+  def test_handle_temp_variable(self):
+
+    def test_fn_using_temp(x, y, w):
+      if x < y:
+        z = x + y
+      else:
+        w = 2
+        tmp = w
+        z = x - tmp
+      return z, w
+
+    node = self.parse_and_analyze(test_fn_using_temp, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node, control_flow_ops.cond, array_ops.ones) as result:
+      with self.test_session() as sess:
+        z, w = sess.run(
+            result.test_fn_using_temp(
+                constant_op.constant(-3), constant_op.constant(3),
+                constant_op.constant(3)))
+        self.assertEqual(0, z)
+        self.assertEqual(3, w)
+        z, w = sess.run(
+            result.test_fn_using_temp(
+                constant_op.constant(3), constant_op.constant(-3),
+                constant_op.constant(3)))
+        self.assertEqual(1, z)
+        self.assertEqual(2, w)
+
+    def test_fn_ignoring_temp(x, y, w):
+      if x < y:
+        z = x + y
+      else:
+        w = 2
+        tmp = w
+        z = x - tmp
+      return z
+
+    node = self.parse_and_analyze(test_fn_ignoring_temp, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node, control_flow_ops.cond, array_ops.ones) as result:
+      with self.test_session() as sess:
+        z = sess.run(
+            result.test_fn_ignoring_temp(
+                constant_op.constant(-3), constant_op.constant(3),
+                constant_op.constant(3)))
+        self.assertEqual(0, z)
+        z = sess.run(
+            result.test_fn_ignoring_temp(
+                constant_op.constant(3), constant_op.constant(-3),
+                constant_op.constant(3)))
+        self.assertEqual(1, z)
+
   def test_simple_for(self):
 
     def test_fn(l):
diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
index 5edd8e74a8899a25fb51e2a4e133f3cb7933fa26..bc61498b5422f5e130bbfeef935d0a796b4f5922 100644
--- a/tensorflow/contrib/autograph/impl/conversion_test.py
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -24,7 +24,7 @@ from tensorflow.contrib.autograph import utils
 from tensorflow.contrib.autograph.impl import api
 from tensorflow.contrib.autograph.impl import conversion
 from tensorflow.python.framework import constant_op
-from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras.engine import training
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/BUILD b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
index 83f3bafc4217649db6499566d548c1657428ad0b..8064a967cd389e88d3febbeb21cac87b0fef9e18 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
@@ -19,6 +19,7 @@ py_library(
     srcs = [
         "activity.py",
         "annos.py",
+        "cfg.py",
         "live_values.py",
         "type_info.py",
     ],
@@ -43,6 +44,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "cfg_test",
+    srcs = ["cfg_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],
+    deps = [
+        ":static_analysis",
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
+
 py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py b/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad97fdfa8e78d1fd4c38724612d83519c6609cce
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py
@@ -0,0 +1,445 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Control flow graph analysis.
+
+Given a Python AST we construct a control flow graph, with edges both to the
+next and previous statements (so it can easily walk the graph both ways). Its
+nodes contain the AST of the statements. It can then perform forward or backward
+analysis on this CFG.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import namedtuple
+import functools
+import operator
+
+import gast
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct.static_analysis import activity
+
+
+class CfgNode(object):
+  """A node in the CFG."""
+  __slots__ = ['next', 'value', 'prev']
+
+  def __init__(self, value):
+    self.next = set()
+    self.prev = set()
+    self.value = value
+
+
+class Cfg(namedtuple('Cfg', ['entry', 'exit'])):
+  """A Control Flow Graph.
+
+  Each statement is represented as a node. For control flow statements such
+  as conditionals and loops the conditional itself is a node which either
+  branches or cycles, respectively.
+  Attributes:
+    entry: The entry node, which contains the `gast.arguments` node of the
+        function definition.
+    exit: The exit node. This node is special because it has no value (i.e. no
+        corresponding AST node). This is because Python functions can have
+        multiple return statements.
+  """
+  pass
+
+
+class CfgBuilder(gast.NodeVisitor):
+  """Construct a control flow graph.
+
+  Construct a CFG starting from a FunctionDef node.
+  Usage:
+    cfg_obj = CfgBuilder().build_cfg(fndef_node)
+  """
+
+  def __init__(self):
+    # The current leaves of the CFG
+    self.current_leaves = []
+    # TODO(alexbw): generalize to break, return, continue, yield, etc.
+    # A stack of lists, tracking continue statements
+    self.continue_ = []
+    # A stack of lists tracking break nodes
+    self.break_ = []
+
+  def set_current_leaves(self, cfg_node):
+    """Link this cfg_node to the current leaves.
+
+    This is the central function for building the CFG. It links the current
+    head cfg_nodes to the passed cfg_node. It then resets the head to the
+    passed cfg_node.
+
+    Args:
+      cfg_node: A CfgNode instance.
+    """
+    for head in self.current_leaves:
+      head.next.add(cfg_node)
+      # While we're linking the CFG forward, add backlinks
+      cfg_node.prev.add(head)
+    self.current_leaves = [cfg_node]
+
+  def build_cfg(self, node):
+    """Build a CFG for a function.
+
+    Implementation of building a CFG for dataflow analysis. See, e.g.:
+    https://www.seas.harvard.edu/courses/cs252/2011sp/slides/Lec02-Dataflow.pdf
+
+    Args:
+      node: A function definition the body of which to analyze.
+    Returns:
+      A CFG object.
+    Raises:
+      TypeError: If the input is not a function definition.
+    """
+    if not isinstance(node, gast.FunctionDef):
+      raise TypeError('input must be a function definition')
+    entry_cfg_node = CfgNode(node.args)
+    self.current_leaves = [entry_cfg_node]
+    self.visit_statements(node.body)
+    exit_cfg_node = CfgNode(None)
+    self.set_current_leaves(exit_cfg_node)
+    return Cfg(entry_cfg_node, exit_cfg_node)
+
+  def visit_statements(self, nodes):
+    for node in nodes:
+      # Check for control flow
+      if isinstance(node, (gast.For, gast.While, gast.If, gast.Try, gast.Break,
+                           gast.Continue, gast.With)):
+        self.visit(node)
+      else:
+        expr = CfgNode(node)
+        self.set_current_leaves(expr)
+
+  def generic_visit(self, node):
+    raise ValueError('unknown control flow')
+
+  def visit_If(self, node):
+    # TODO(alexbw): change this to use immutable tuples instead of lists
+    # The current head will hold the conditional
+    test = CfgNode(node.test)
+    self.set_current_leaves(test)
+    # Handle the body
+    self.visit_statements(node.body)
+    body_exit = self.current_leaves
+    self.current_leaves = [test]
+    # Handle the orelse
+    self.visit_statements(node.orelse)
+    self.current_leaves.extend(body_exit)
+
+  def visit_While(self, node):
+    test = CfgNode(node.test)
+    self.set_current_leaves(test)
+    # Start a new level of nesting
+    self.break_.append([])
+    self.continue_.append([])
+    # Handle the body
+    self.visit_statements(node.body)
+    body_exit = self.current_leaves
+    self.current_leaves.extend(self.continue_.pop())
+    self.set_current_leaves(test)
+    # Handle the orelse
+    self.visit_statements(node.orelse)
+    # The break statements and the test go to the next node
+    self.current_leaves.extend(self.break_.pop())
+    # Body and orelse statements can reach out of the loop
+    self.current_leaves.extend(body_exit)
+
+  def visit_For(self, node):
+    iter_ = CfgNode(node.iter)
+    self.set_current_leaves(iter_)
+    self.break_.append([])
+    self.continue_.append([])
+    self.visit_statements(node.body)
+    body_exit = self.current_leaves
+    self.current_leaves.extend(self.continue_.pop())
+    self.set_current_leaves(iter_)
+    # Handle the orelse
+    self.visit_statements(node.orelse)
+    # The break statements and the test go to the next node
+    self.current_leaves.extend(self.break_.pop())
+    # Body and orelse statements can reach out of the loop
+    self.current_leaves.extend(body_exit)
+
+  def visit_Break(self, node):
+    self.break_[-1].extend(self.current_leaves)
+    self.current_leaves[:] = []
+
+  def visit_Continue(self, node):
+    self.continue_[-1].extend(self.current_leaves)
+    self.current_leaves[:] = []
+
+  def visit_Try(self, node):
+    self.visit_statements(node.body)
+    body = self.current_leaves
+    handlers = []
+    for handler in node.handlers:
+      self.current_leaves = body[:]
+      self.visit_statements(handler.body)
+      handlers.extend(self.current_leaves)
+    self.current_leaves = body
+    self.visit_statements(node.orelse)
+    self.current_leaves = handlers + self.current_leaves
+    self.visit_statements(node.finalbody)
+
+  def visit_With(self, node):
+    for item in node.items:
+      self.set_current_leaves(CfgNode(item))
+    self.visit_statements(node.body)
+
+
+# TODO(alexbw): once CFG analysis occurs at a block level,
+# this extra class will not be necessary
+class PropagateAnalysis(gast.NodeVisitor):
+  """Port analysis annotations from statements to their enclosing blocks."""
+
+  def __init__(self, analysis):
+    self.transfer_fn = analysis.transfer_fn
+    self.in_label = analysis.in_label
+    self.out_label = analysis.out_label
+    super(PropagateAnalysis, self).__init__()
+
+  def visit_If(self, node):
+    # Depth-first.
+    self.generic_visit(node)
+    incoming = anno.getanno(node.body[0], self.in_label)
+    incoming |= anno.getanno(node.test, self.in_label)
+    outgoing = anno.getanno(node.body[-1], self.out_label)
+    outgoing |= anno.getanno(node.test, self.out_label)
+    if node.orelse:
+      orelse_outgoing = anno.getanno(node.orelse[-1], self.out_label)
+      outgoing = self.transfer_fn(outgoing, orelse_outgoing)
+    anno.setanno(node, self.in_label, incoming)
+    anno.setanno(node, self.out_label, outgoing)
+
+  def visit_For(self, node):
+    self.generic_visit(node)
+    incoming = set(anno.getanno(node.body[0], self.in_label))
+    incoming -= set((anno.getanno(node.target, anno.Basic.QN),))
+    outgoing = anno.getanno(node.body[-1], self.out_label)
+    if node.orelse:
+      orelse_outgoing = anno.getanno(node.orelse[-1], self.out_label)
+      outgoing = self.transfer_fn(outgoing, orelse_outgoing)
+    anno.setanno(node, self.in_label, frozenset(incoming))
+    anno.setanno(node, self.out_label, outgoing)
+
+  def visit_While(self, node):
+    self.generic_visit(node)
+    incoming = anno.getanno(node.body[0], self.in_label)
+    incoming |= anno.getanno(node.test, self.in_label)
+    outgoing = anno.getanno(node.body[-1], self.out_label)
+    if node.orelse:
+      orelse_outgoing = anno.getanno(node.orelse[-1], self.out_label)
+      outgoing = self.transfer_fn(outgoing, orelse_outgoing)
+    anno.setanno(node, self.in_label, incoming)
+    anno.setanno(node, self.out_label, outgoing)
+
+  def visit_With(self, node):
+    self.generic_visit(node)
+    incoming = anno.getanno(node.body[0], self.in_label)
+    for item in node.items:
+      incoming |= anno.getanno(item, self.in_label)
+    outgoing = anno.getanno(node.body[-1], self.out_label)
+    anno.setanno(node, self.in_label, incoming)
+    anno.setanno(node, self.out_label, outgoing)
+
+
+# TODO(alexbw): Abstract the CFG walking machinery into a superclass
+# which is parameterized on which fields it selects when walking.
+# TODO(alexbw): Abstract the application of dataflow analysis
+class Forward(object):
+  """Forward analysis on CFG.
+
+  Args:
+    label: A name for this analysis e.g. 'active' for activity analysis. The AST
+      nodes in the CFG will be given annotations 'name_in', 'name_out',
+      'name_gen' and 'name_kill' which contain the incoming values, outgoing
+      values, values generated by the statement, and values deleted by the
+      statement respectively.
+    transfer_fn: Either the AND or OR operator. If the AND operator is used it
+      turns into forward must analysis (i.e. a value will only be carried
+      forward if it appears on all incoming paths). The OR operator means that
+      forward may analysis is done (i.e. the union of incoming values will be
+      taken).
+  """
+
+  def __init__(self, label, context, transfer_fn=operator.or_):
+    self.transfer_fn = transfer_fn
+    self.context = context
+    self.out_label = label + '_out'
+    self.in_label = label + '_in'
+    self.gen_label = label + '_gen'
+    self.kill_label = label + '_kill'
+
+  # TODO(alexbw): see if we can simplify by visiting breadth-first
+  def visit(self, node):
+    """Depth-first walking the CFG, applying dataflow information propagtion."""
+    # node.value is None only for the exit CfgNode.
+    if not node.value:
+      return
+
+    if anno.hasanno(node.value, self.out_label):
+      before = hash(anno.getanno(node.value, self.out_label))
+    else:
+      before = None
+    preds = [
+        anno.getanno(pred.value, self.out_label)
+        for pred in node.prev
+        if anno.hasanno(pred.value, self.out_label)
+    ]
+    if preds:
+      incoming = functools.reduce(self.transfer_fn, preds[1:], preds[0])
+    else:
+      incoming = frozenset()
+    anno.setanno(node.value, self.in_label, incoming)
+    gen, kill = self.get_gen_kill(node, incoming)
+    anno.setanno(node.value, self.gen_label, gen)
+    anno.setanno(node.value, self.kill_label, kill)
+    anno.setanno(node.value, self.out_label, (incoming - kill) | gen)
+
+    if hash(anno.getanno(node.value, self.out_label)) != before:
+      for succ in node.next:
+        self.visit(succ)
+
+  def get_gen_kill(self, cfg_node, incoming):
+    """Calculate Gen and Kill properties of a CFG node in dataflow analysis.
+
+    A function which takes the CFG node as well as a set of incoming
+    values. It must return a set of newly generated values by the statement as
+    well as a set of deleted (killed) values.
+
+    Args:
+      cfg_node: A CfgNode instance.
+      incoming:
+    """
+    raise NotImplementedError()
+
+
+class Backward(Forward):
+  """Backward analysis on CFG."""
+
+  def visit(self, cfg_node):
+    # cfg_node.value is None for the exit node, which will be visited only once
+    if not cfg_node.value:
+      for pred in cfg_node.prev:
+        self.visit(pred)
+      return
+
+    if anno.hasanno(cfg_node.value, self.in_label):
+      before = hash(anno.getanno(cfg_node.value, self.in_label))
+    else:
+      before = None
+    succs = [
+        anno.getanno(succ.value, self.in_label)
+        for succ in cfg_node.next
+        if anno.hasanno(succ.value, self.in_label)
+    ]
+    if succs:
+      incoming = functools.reduce(self.transfer_fn, succs[1:], succs[0])
+    else:
+      incoming = frozenset()
+    anno.setanno(cfg_node.value, self.out_label, incoming)
+    gen, kill = self.get_gen_kill(cfg_node, incoming)
+    anno.setanno(cfg_node.value, self.gen_label, gen)
+    anno.setanno(cfg_node.value, self.kill_label, kill)
+    anno.setanno(cfg_node.value, self.in_label, (incoming - kill) | gen)
+    if hash(anno.getanno(cfg_node.value, self.in_label)) != before:
+      for pred in cfg_node.prev:
+        self.visit(pred)
+
+
+def run_analyses(node, analyses):
+  """Perform dataflow analysis on all functions within an AST.
+
+  Args:
+    node: An AST node on which to run dataflow analysis.
+    analyses: Either an instance of the Forward or Backward dataflow analysis
+      class, or a list or tuple of them.
+
+  Returns:
+    node: The node, but now with annotations on the AST nodes containing the
+    results of the dataflow analyses.
+  """
+  if not isinstance(analyses, (tuple, list)):
+    analyses = (analyses,)
+  for analysis in analyses:
+    if not isinstance(analysis, (Forward, Backward)):
+      raise TypeError('not a valid forward analysis object')
+
+  for child_node in gast.walk(node):
+    if isinstance(child_node, gast.FunctionDef):
+      cfg_obj = CfgBuilder().build_cfg(child_node)
+      for analysis in analyses:
+        if isinstance(analysis, Backward):
+          analysis.visit(cfg_obj.exit)
+        elif isinstance(analysis, Forward):
+          analysis.visit(cfg_obj.entry)
+  for analysis in analyses:
+    PropagateAnalysis(analysis).visit(node)
+  return node
+
+
+class Liveness(Backward):
+  """Perform a liveness analysis.
+
+  Each statement is annotated with a set of variables that may be used
+  later in the program.
+  """
+
+  def __init__(self, context):
+    super(Liveness, self).__init__('live', context)
+
+  def get_gen_kill(self, node, _):
+    # A variable's parents are live if it is live
+    # e.g. x is live if x.y is live. This means gen needs to return
+    # all parents of a variable (if it's an Attribute or Subscript).
+    # This doesn't apply to kill (e.g. del x.y doesn't affect liveness of x)
+    gen = activity.get_read(node.value, self.context)
+    gen = functools.reduce(lambda left, right: left | right.support_set, gen,
+                           gen)
+    kill = activity.get_updated(node.value, self.context)
+    return gen, kill
+
+
+class ReachingDefinitions(Forward):
+  """Perform reaching definition analysis.
+
+  Each statement is annotated with a set of (variable, definition) pairs.
+  """
+
+  def __init__(self, context):
+    super(ReachingDefinitions, self).__init__('definitions', context)
+
+  def get_gen_kill(self, node, incoming):
+    definitions = activity.get_updated(node.value, self.context)
+    gen = frozenset((id_, node.value) for id_ in definitions)
+    kill = frozenset(def_ for def_ in incoming if def_[0] in definitions)
+    return gen, kill
+
+
+class Defined(Forward):
+  """Perform defined variable analysis.
+
+  Each statement is annotated with a set of variables which are guaranteed to
+  be defined at that point.
+  """
+
+  def __init__(self, context):
+    super(Defined, self).__init__('defined', context, transfer_fn=operator.and_)
+
+  def get_gen_kill(self, node, _):
+    gen = activity.get_updated(node.value, self.context)
+    return gen, frozenset()
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/cfg_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/cfg_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc07fa3447b23c0595a5893329de8a2d7055ca15
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/cfg_test.py
@@ -0,0 +1,306 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for cfg module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+import gast
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import qual_names
+from tensorflow.contrib.autograph.pyct.static_analysis import cfg
+from tensorflow.python.platform import test
+
+
+class CFGTest(test.TestCase):
+
+  def _parse_and_analyze(self, test_fn, namespace, arg_types=None):
+    arg_types = arg_types or {}
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=source,
+        source_file=None,
+        namespace=namespace,
+        arg_values=None,
+        arg_types=arg_types,
+        owner_type=None,
+        recursive=True)
+    node = qual_names.resolve(node)
+    return node, ctx
+
+  def _check_anno_matches(self, node, anno_name, var_names):
+    if isinstance(var_names, str):
+      var_names = (var_names,)
+    qual_vars = set()
+    for var_name in var_names:
+      if isinstance(var_name, str):
+        if '[' in var_name or ']' in var_name:
+          raise ValueError('Annotation matching not supported with subscript.')
+        if '.' not in var_name:
+          qual_vars.add(qual_names.QN(var_name))
+        else:
+          attrs = var_name.split('.')
+          this_qn = functools.reduce(qual_names.QN, attrs[1:],
+                                     qual_names.QN(attrs[0]))
+          qual_vars.add(this_qn)
+    self.assertEqual(anno.getanno(node, anno_name), qual_vars)
+
+  def test_reaching(self):
+
+    def f(x):
+      print(x)
+      while True:
+        x = x
+        x = x
+      return x
+
+    node, ctx = self._parse_and_analyze(f, {})
+    cfg.run_analyses(node, cfg.ReachingDefinitions(ctx))
+    body = node.body[0].body
+    # Only the argument reaches the expression
+    def_in = anno.getanno(body[0], 'definitions_in')
+    # One element, x, from arguments
+    self.assertEqual(set(type(d[1]) for d in def_in), set((gast.arguments,)))
+
+    while_body = body[1].body
+    def_in = anno.getanno(while_body[0], 'definitions_in')
+    # One definition, two possible sources.
+    # - One from an assignment (if the loop is entered)
+    # - The other from the arguments (if loop is not entered)
+    self.assertEqual(
+        set(type(d[1]) for d in def_in), set((gast.arguments, gast.Assign)))
+
+    def_in = anno.getanno(while_body[1], 'definitions_in')
+    # If we've reached this line, the only reaching definition of x is the
+    # Assign node in previous line
+    self.assertEqual(set(type(d[1]) for d in def_in), set((gast.Assign,)))
+
+    def_in = anno.getanno(body[2], 'definitions_in')
+    # Same situation as while_body[0]
+    self.assertEqual(
+        set(type(d[1]) for d in def_in), set((gast.arguments, gast.Assign)))
+
+  def test_defined(self):
+
+    def f(x):
+      if x:
+        y = 2  # pylint: disable=unused-variable
+      return x
+
+    node, ctx = self._parse_and_analyze(f, {})
+    cfg.run_analyses(node, cfg.Defined(ctx))
+    body = node.body[0].body
+    # only x is for sure defined at the end
+    self._check_anno_matches(body[1], 'defined_in', 'x')
+    # at the end of the if body both x and y are defined
+    if_body = body[0].body
+    self._check_anno_matches(if_body[0], 'defined_out', ('x', 'y'))
+
+  def _get_live_annotated_fnbody(self, f):
+    node, ctx = self._parse_and_analyze(f, {})
+    cfg.run_analyses(node, cfg.Liveness(ctx))
+    body = node.body[0].body
+    return body
+
+  def test_live_straightline(self):
+
+    def f1(x):
+      a = g(x)  # pylint: disable=undefined-variable
+      b = h(a)  # pylint: disable=undefined-variable, unused-variable
+      return x
+
+    body = self._get_live_annotated_fnbody(f1)
+    self._check_anno_matches(body[1], 'live_in', ('a', 'h', 'x'))
+    self._check_anno_matches(body[2], 'live_in', ('x'))
+    self._check_anno_matches(body[0], 'live_in', ('g', 'h', 'x'))
+    self._check_anno_matches(body[2], 'live_out', ())
+
+  def test_live_stacked_conds_with_else(self):
+
+    def f2(x, a):  # pylint: disable=unused-argument
+      if a > 0:  # x should not be live
+        x = 0
+      if a > 1:
+        x = 1
+      else:
+        x = 2
+
+    body = self._get_live_annotated_fnbody(f2)
+    self._check_anno_matches(body[0], 'live_in', ('a'))
+    self._check_anno_matches(body[1], 'live_in', ('a'))
+
+  def test_live_stacked_conds(self):
+
+    def f3(x, a):
+      if a > 0:  # x and a should be live
+        x = 0
+      if a > 1:  # x and a should be live_in
+        x = 1
+      return x  # x should be live
+
+    body = self._get_live_annotated_fnbody(f3)
+    self._check_anno_matches(body[0], 'live_in', ('a', 'x'))
+    self._check_anno_matches(body[1], 'live_in', ('a', 'x'))
+    self._check_anno_matches(body[2], 'live_in', ('x'))
+
+  def test_live_possibly_unused_cond(self):
+
+    def f4(x, a):
+      if a > 0:  # x should be live
+        x = 0
+      x += 1
+
+    body = self._get_live_annotated_fnbody(f4)
+    self._check_anno_matches(body[0], 'live_in', ('x', 'a'))
+    self._check_anno_matches(body[1], 'live_in', ('x'))
+
+  def test_live_attribute_in_cond(self):
+
+    def f5(x, a):
+      if a > 0:  # x.y should be live
+        x.y = 0
+      return x.y
+
+    body = self._get_live_annotated_fnbody(f5)
+    self._check_anno_matches(body[0], 'live_in', ('x', 'x.y', 'a'))
+
+  def test_live_noop(self):
+
+    def f6(x):
+      return x  # should this cause x.* to be live?
+
+    body = self._get_live_annotated_fnbody(f6)
+    self._check_anno_matches(body[0], 'live_in', ('x'))
+
+  def test_live_loop(self):
+
+    def f7(x, n):
+      for i in range(n):
+        x += i
+      return x
+
+    body = self._get_live_annotated_fnbody(f7)
+    self._check_anno_matches(body[0], 'live_in', ('x', 'n', 'range'))
+    self._check_anno_matches(body[1], 'live_in', ('x'))
+
+  def test_live_context_manager(self):
+
+    def f8(x, f):
+      with f:
+        x += 1
+
+    body = self._get_live_annotated_fnbody(f8)
+    self._check_anno_matches(body[0], 'live_in', ('f', 'x'))
+
+  def test_node_equality(self):
+    node_a = gast.parse('y = x').body[0]
+    node_b = gast.parse('y = x').body[0]
+    self.assertNotEqual(node_a, node_b)
+
+  def test_nested_functions_defined(self):
+
+    def f(x):
+      y = x * 2
+
+      def g(z):
+        return z + y
+
+      return g(x)
+
+    node, ctx = self._parse_and_analyze(f, {})
+    cfg.run_analyses(node, cfg.Defined(ctx))
+
+    body = node.body[0].body
+    self.assertEqual(
+        anno.getanno(body[2], 'defined_in'),
+        frozenset(map(qual_names.QN, ('g', 'x', 'y'))))
+
+    # TODO(alexbw): CFG analysis doesn't currently cross FunctionDef boundaries.
+    # NOTE: 'z' is easy to find, but 'y' is  not identified as
+    # defined, because CFG analysis is applied with each function separately.
+    # fndef_body = body[1].body
+    # self.assertEqual(
+    #     anno.getanno(fndef_body[0], 'defined_in'),
+    #     frozenset(map(qual_names.QN, ('z', 'y'))))
+
+  def test_nested_functions_dont_leak_definitions(self):
+
+    def f(x):
+      print(x)
+
+      def g():
+        y = 2
+        return y
+
+      return g()  # y is not defined here
+
+    node, ctx = self._parse_and_analyze(f, {})
+    cfg.run_analyses(node, cfg.Defined(ctx))
+    body = node.body[0].body
+    self.assertEqual(
+        anno.getanno(body[2], 'defined_in'),
+        frozenset(map(qual_names.QN, ('x', 'g'))))
+
+  def test_loop_else(self):
+
+    # Disabling useless-else-on-loop error, because 'break' and 'continue'
+    # canonicalization are a separate analysis pass, and here we test
+    # the CFG analysis in isolation.
+    def for_orelse(x):
+      y = 0
+      for i in range(len(x)):
+        x += i
+      else:  # pylint: disable=useless-else-on-loop
+        y = 1
+      return x, y
+
+    def while_orelse(x, i):
+      y = 0
+      while x < 10:
+        x += i
+      else:  # pylint: disable=useless-else-on-loop
+        y = 1
+      return x, y
+
+    for f in (for_orelse, while_orelse):
+      node, ctx = self._parse_and_analyze(f, {})
+      cfg.run_analyses(node, cfg.ReachingDefinitions(ctx))
+      body = node.body[0].body
+      return_node = body[-1]
+      reaching_defs = anno.getanno(return_node, 'definitions_in')
+
+      # Y could be defined by Assign(Num(0)) or Assign(Num(1))
+      # X could be defined as an argument or an AugAssign.
+      y_defs = [node for var, node in reaching_defs if str(var) == 'y']
+      x_defs = [node for var, node in reaching_defs if str(var) == 'x']
+
+      self.assertEqual(set((gast.Assign,)), set(type(def_) for def_ in y_defs))
+      self.assertEqual(set((0, 1)), set(def_.value.n for def_ in y_defs))
+      self.assertEqual(len(y_defs), 2)
+      self.assertEqual(
+          set((gast.arguments, gast.AugAssign)),
+          set(type(def_) for def_ in x_defs))
+      self.assertEqual(len(x_defs), 2)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index 9994c84ebdb930eea0818188225488eb5eca84eb..758754feac31f1d2cf10e69d7a9a6d288931c900 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -45,6 +45,7 @@ from tensorflow.python.training import training_util
 
 _DNN_LEARNING_RATE = 0.001
 
+
 def _get_optimizer(optimizer):
   if callable(optimizer):
     return optimizer()
@@ -73,6 +74,7 @@ def _dnn_tree_combined_model_fn(features,
                                 dnn_input_layer_partitioner=None,
                                 dnn_input_layer_to_tree=True,
                                 dnn_steps_to_train=10000,
+                                predict_with_tree_only=False,
                                 tree_feature_columns=None,
                                 tree_center_bias=False,
                                 use_core_versions=False):
@@ -108,6 +110,8 @@ def _dnn_tree_combined_model_fn(features,
     as a feature to the tree.
     dnn_steps_to_train: Number of steps to train dnn for before switching
       to gbdt.
+    predict_with_tree_only: Whether to use only the tree model output as the
+      final prediction.
     tree_feature_columns: An iterable containing all the feature columns
       used by the model's boosted trees. If dnn_input_layer_to_tree is
       set to True, these features are in addition to dnn_feature_columns.
@@ -132,8 +136,7 @@ def _dnn_tree_combined_model_fn(features,
   dnn_parent_scope = "dnn"
   dnn_partitioner = dnn_input_layer_partitioner or (
       partitioned_variables.min_max_variable_partitioner(
-          max_partitions=config.num_ps_replicas,
-          min_slice_size=64 << 20))
+          max_partitions=config.num_ps_replicas, min_slice_size=64 << 20))
 
   with variable_scope.variable_scope(
       dnn_parent_scope,
@@ -171,8 +174,7 @@ def _dnn_tree_combined_model_fn(features,
       _add_hidden_layer_summary(net, hidden_layer_scope.name)
       previous_layer = net
     with variable_scope.variable_scope(
-        "logits",
-        values=(previous_layer,)) as logits_scope:
+        "logits", values=(previous_layer,)) as logits_scope:
       dnn_logits = layers.fully_connected(
           previous_layer,
           head.logits_dimension,
@@ -190,8 +192,7 @@ def _dnn_tree_combined_model_fn(features,
           optimizer=_get_optimizer(dnn_optimizer),
           name=dnn_parent_scope,
           variables=ops.get_collection(
-              ops.GraphKeys.TRAINABLE_VARIABLES,
-              scope=dnn_parent_scope),
+              ops.GraphKeys.TRAINABLE_VARIABLES, scope=dnn_parent_scope),
           # Empty summaries to prevent optimizers from logging training_loss.
           summaries=[])
 
@@ -230,7 +231,10 @@ def _dnn_tree_combined_model_fn(features,
         update_op = state_ops.assign_add(global_step, 1).op
         return update_op
 
-  tree_train_logits = dnn_logits + tree_logits
+  if predict_with_tree_only:
+    tree_train_logits = tree_logits
+  else:
+    tree_train_logits = dnn_logits + tree_logits
 
   def _no_train_op_fn(loss):
     """Returns a no-op."""
@@ -288,10 +292,10 @@ def _dnn_tree_combined_model_fn(features,
   finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()
 
   model_fn_ops.training_hooks.extend([
-      trainer_hooks.SwitchTrainOp(
-          dnn_train_op, dnn_steps_to_train, tree_train_op),
-      trainer_hooks.StopAfterNTrees(
-          num_trees, attempted_trees, finalized_trees)])
+      trainer_hooks.SwitchTrainOp(dnn_train_op, dnn_steps_to_train,
+                                  tree_train_op),
+      trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, finalized_trees)
+  ])
 
   return model_fn_ops
 
@@ -318,6 +322,7 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
                dnn_input_layer_partitioner=None,
                dnn_input_layer_to_tree=True,
                dnn_steps_to_train=10000,
+               predict_with_tree_only=False,
                tree_feature_columns=None,
                tree_center_bias=False,
                use_core_versions=False):
@@ -360,6 +365,8 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
       as a feature to the tree.
       dnn_steps_to_train: Number of steps to train dnn for before switching
         to gbdt.
+      predict_with_tree_only: Whether to use only the tree model output as the
+        final prediction.
       tree_feature_columns: An iterable containing all the feature columns
         used by the model's boosted trees. If dnn_input_layer_to_tree is
         set to True, these features are in addition to dnn_feature_columns.
@@ -377,16 +384,32 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
 
     def _model_fn(features, labels, mode, config):
       return _dnn_tree_combined_model_fn(
-          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
-          tree_learner_config, num_trees, tree_examples_per_layer, config,
-          dnn_optimizer, dnn_activation_fn, dnn_dropout,
-          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
-          dnn_steps_to_train, tree_feature_columns, tree_center_bias,
-          use_core_versions)
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          dnn_hidden_units=dnn_hidden_units,
+          dnn_feature_columns=dnn_feature_columns,
+          tree_learner_config=tree_learner_config,
+          num_trees=num_trees,
+          tree_examples_per_layer=tree_examples_per_layer,
+          config=config,
+          dnn_optimizer=dnn_optimizer,
+          dnn_activation_fn=dnn_activation_fn,
+          dnn_dropout=dnn_dropout,
+          dnn_input_layer_partitioner=dnn_input_layer_partitioner,
+          dnn_input_layer_to_tree=dnn_input_layer_to_tree,
+          dnn_steps_to_train=dnn_steps_to_train,
+          predict_with_tree_only=predict_with_tree_only,
+          tree_feature_columns=tree_feature_columns,
+          tree_center_bias=tree_center_bias,
+          use_core_versions=use_core_versions)
 
     super(DNNBoostedTreeCombinedClassifier, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir,
-        config=config, feature_engineering_fn=feature_engineering_fn)
+        model_fn=_model_fn,
+        model_dir=model_dir,
+        config=config,
+        feature_engineering_fn=feature_engineering_fn)
 
 
 class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
@@ -410,6 +433,7 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
                dnn_input_layer_partitioner=None,
                dnn_input_layer_to_tree=True,
                dnn_steps_to_train=10000,
+               predict_with_tree_only=False,
                tree_feature_columns=None,
                tree_center_bias=False,
                use_core_versions=False):
@@ -452,6 +476,8 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
       as a feature to the tree.
       dnn_steps_to_train: Number of steps to train dnn for before switching
         to gbdt.
+      predict_with_tree_only: Whether to use only the tree model output as the
+        final prediction.
       tree_feature_columns: An iterable containing all the feature columns
         used by the model's boosted trees. If dnn_input_layer_to_tree is
         set to True, these features are in addition to dnn_feature_columns.
@@ -474,16 +500,32 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
 
     def _model_fn(features, labels, mode, config):
       return _dnn_tree_combined_model_fn(
-          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
-          tree_learner_config, num_trees, tree_examples_per_layer, config,
-          dnn_optimizer, dnn_activation_fn, dnn_dropout,
-          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
-          dnn_steps_to_train, tree_feature_columns, tree_center_bias,
-          use_core_versions)
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          dnn_hidden_units=dnn_hidden_units,
+          dnn_feature_columns=dnn_feature_columns,
+          tree_learner_config=tree_learner_config,
+          num_trees=num_trees,
+          tree_examples_per_layer=tree_examples_per_layer,
+          config=config,
+          dnn_optimizer=dnn_optimizer,
+          dnn_activation_fn=dnn_activation_fn,
+          dnn_dropout=dnn_dropout,
+          dnn_input_layer_partitioner=dnn_input_layer_partitioner,
+          dnn_input_layer_to_tree=dnn_input_layer_to_tree,
+          dnn_steps_to_train=dnn_steps_to_train,
+          predict_with_tree_only=predict_with_tree_only,
+          tree_feature_columns=tree_feature_columns,
+          tree_center_bias=tree_center_bias,
+          use_core_versions=use_core_versions)
 
     super(DNNBoostedTreeCombinedRegressor, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir,
-        config=config, feature_engineering_fn=feature_engineering_fn)
+        model_fn=_model_fn,
+        model_dir=model_dir,
+        config=config,
+        feature_engineering_fn=feature_engineering_fn)
 
 
 class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
@@ -508,6 +550,7 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
                dnn_input_layer_partitioner=None,
                dnn_input_layer_to_tree=True,
                dnn_steps_to_train=10000,
+               predict_with_tree_only=False,
                tree_feature_columns=None,
                tree_center_bias=False,
                use_core_versions=False):
@@ -545,6 +588,8 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
       as a feature to the tree.
       dnn_steps_to_train: Number of steps to train dnn for before switching
         to gbdt.
+      predict_with_tree_only: Whether to use only the tree model output as the
+        final prediction.
       tree_feature_columns: An iterable containing all the feature columns
         used by the model's boosted trees. If dnn_input_layer_to_tree is
         set to True, these features are in addition to dnn_feature_columns.
@@ -553,15 +598,32 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
       use_core_versions: Whether feature columns and loss are from the core (as
         opposed to contrib) version of tensorflow.
     """
+
     def _model_fn(features, labels, mode, config):
       return _dnn_tree_combined_model_fn(
-          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
-          tree_learner_config, num_trees, tree_examples_per_layer, config,
-          dnn_optimizer, dnn_activation_fn, dnn_dropout,
-          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
-          dnn_steps_to_train, tree_feature_columns, tree_center_bias,
-          use_core_versions)
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          dnn_hidden_units=dnn_hidden_units,
+          dnn_feature_columns=dnn_feature_columns,
+          tree_learner_config=tree_learner_config,
+          num_trees=num_trees,
+          tree_examples_per_layer=tree_examples_per_layer,
+          config=config,
+          dnn_optimizer=dnn_optimizer,
+          dnn_activation_fn=dnn_activation_fn,
+          dnn_dropout=dnn_dropout,
+          dnn_input_layer_partitioner=dnn_input_layer_partitioner,
+          dnn_input_layer_to_tree=dnn_input_layer_to_tree,
+          dnn_steps_to_train=dnn_steps_to_train,
+          predict_with_tree_only=predict_with_tree_only,
+          tree_feature_columns=tree_feature_columns,
+          tree_center_bias=tree_center_bias,
+          use_core_versions=use_core_versions)
 
     super(DNNBoostedTreeCombinedEstimator, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir,
-        config=config, feature_engineering_fn=feature_engineering_fn)
+        model_fn=_model_fn,
+        model_dir=model_dir,
+        config=config,
+        feature_engineering_fn=feature_engineering_fn)
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 44a8ffaf4b2f5a9c11b3abc46ce55a18c80ad318..04e32267cc4a00b3169c3abbcbf549805a0fb462 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -422,6 +422,10 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp {
               GradientStats(*gradients_t, *hessians_t, bucket_idx);
         }
         present_gradient_stats *= normalizer_ratio;
+        GradientStats not_present =
+            root_gradient_stats - present_gradient_stats;
+        // If there was (almost) no sparsity, fix the default direction to LEFT.
+        bool fixed_default_direction = not_present.IsAlmostZero();
 
         GradientStats left_gradient_stats;
         for (int64 element_idx = start_index; element_idx < end_index;
@@ -441,6 +445,7 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp {
           // backward pass gradients.
           GradientStats right_gradient_stats =
               present_gradient_stats - left_gradient_stats;
+
           {
             NodeStats left_stats_default_left =
                 ComputeNodeStats(root_gradient_stats - right_gradient_stats);
@@ -457,7 +462,9 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp {
               best_dimension_idx = dimension_id;
             }
           }
-          {
+          // Consider calculating the default direction only when there were
+          // enough missing examples.
+          if (!fixed_default_direction) {
             NodeStats left_stats_default_right =
                 ComputeNodeStats(left_gradient_stats);
             NodeStats right_stats_default_right =
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index 9d6cc9245aa463d0c8cfc7ad209736357b6c0323..f06b73c00d0bebb2717a79b7894e2addf914daba 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -501,11 +501,18 @@ def sparse_make_stats_update(
         example_partition_ids)
 
     # Compute aggregate stats for each partition.
+    # Since unsorted_segment_sum can be numerically unstable, use 64bit
+    # operation.
+    gradients64 = math_ops.cast(gradients, dtypes.float64)
+    hessians64 = math_ops.cast(hessians, dtypes.float64)
     per_partition_gradients = math_ops.unsorted_segment_sum(
-        gradients, mapped_partitions, array_ops.size(unique_partitions))
+        gradients64, mapped_partitions, array_ops.size(unique_partitions))
     per_partition_hessians = math_ops.unsorted_segment_sum(
-        hessians, mapped_partitions, array_ops.size(unique_partitions))
-
+        hessians64, mapped_partitions, array_ops.size(unique_partitions))
+    per_partition_gradients = math_ops.cast(per_partition_gradients,
+                                            dtypes.float32)
+    per_partition_hessians = math_ops.cast(per_partition_hessians,
+                                           dtypes.float32)
     # Prepend a bias feature per partition that accumulates the stats for all
     # examples in that partition.
     bias_feature_ids = array_ops.fill(
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
index 28834ef55bf8e1f32cc8f2380a4be3bf3824d8e1..5cd37ec67ec3bdefb6ea19049a7a12249162d45a 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import random
+
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.boosted_trees.proto import split_info_pb2
 from tensorflow.contrib.boosted_trees.python.ops import split_handler_ops
@@ -399,6 +401,65 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
     self.assertAllClose(0.6, split_node.split.threshold)
 
+  def testMakeSparseSplitDefaultDirectionIsStable(self):
+    """Tests default direction is stable when no sparsity."""
+    random.seed(1123)
+    for _ in range(50):
+      with self.test_session() as sess:
+        grad = random.random()
+        hessian = random.random()
+        # The data looks like the following (divide by the num of steps 2).
+        # Gradients       | Partition | bucket ID       |
+        # (grad, hessian) |  0        | -1              |
+        # And then 100 buckets of
+        # (grad/100, hessian/100), so there is no sparsity.
+        n_buckets = 100
+
+        # 1 for the overall sum, and 100 buckets.
+        partition_ids = array_ops.constant(
+            [0] * (n_buckets + 1), dtype=dtypes.int32)
+        # We have only 1 dimension in our sparse feature column.
+
+        bucket_ids = [-1] + [n for n in range(100)]
+        bucket_ids = array_ops.constant(bucket_ids, dtype=dtypes.int64)
+        dimension_ids = array_ops.constant(
+            [0] * (n_buckets + 1), dtype=dtypes.int64)
+        bucket_ids = array_ops.stack([bucket_ids, dimension_ids], axis=1)
+
+        gradients = [grad] + [grad / n_buckets] * n_buckets
+        gradients = array_ops.constant(gradients)
+        hessians = [hessian] + [hessian / n_buckets] * n_buckets
+        hessians = array_ops.constant(hessians)
+
+        boundaries = [x * 1 for x in range(n_buckets + 1)]
+        bucket_boundaries = array_ops.constant(boundaries, dtype=dtypes.float32)
+
+        partitions, gains, splits = (
+            split_handler_ops.build_sparse_inequality_splits(
+                num_minibatches=2,
+                partition_ids=partition_ids,
+                bucket_ids=bucket_ids,
+                gradients=gradients,
+                hessians=hessians,
+                bucket_boundaries=bucket_boundaries,
+                l1_regularization=0,
+                l2_regularization=2,
+                tree_complexity_regularization=0,
+                min_node_weight=0,
+                feature_column_group_id=0,
+                bias_feature_id=-1,
+                class_id=-1,
+                multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+        partitions, gains, splits = (sess.run([partitions, gains, splits]))
+      self.assertAllEqual([0], partitions)
+      self.assertEqual(1, len(splits))
+
+      split_info = split_info_pb2.SplitInfo()
+      split_info.ParseFromString(splits[0])
+      self.assertTrue(
+          split_info.split_node.HasField(
+              'sparse_float_binary_split_default_left'))
+
   def testMakeMulticlassSparseSplit(self):
     """Tests split handler op."""
     with self.test_session() as sess:
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 1b184d296b329cee481db67992e77d1e33e18035..50cc00afdcc77fedc9bf8c94a9a6fcf2a28ebde9 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -187,7 +187,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
       stamp_token: Expected current token.
       next_stamp_token: Next value for the token.
     Returns:
-      A list of quantiles or approximate boundaries.
+      The flush operation.
     """
     return gen_quantile_ops.quantile_accumulator_flush(
         quantile_accumulator_handle=self._quantile_accumulator_handle,
diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py
index d2c30f121539f8eae5d5f921bd7a1507a81f6e29..af8df72618b7255e182e98e6e4b96a0333b3dce6 100644
--- a/tensorflow/contrib/checkpoint/__init__.py
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -14,22 +14,29 @@
 # ==============================================================================
 """Tools for working with object-based checkpoints.
 
-
-For creating and managing dependencies:
-@@CheckpointableObjectGraph
+Visualization and inspection:
 @@dot_graph_from_checkpoint
 @@object_metadata
+
+Creating and managing dependencies:
+@@Checkpointable
+@@CheckpointableObjectGraph
+@@NoDependency
 @@split_dependency
+@@UniqueNameTracker
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.checkpoint.python.containers import UniqueNameTracker
 from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency
 from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint
 from tensorflow.core.protobuf.checkpointable_object_graph_pb2 import CheckpointableObjectGraph
-from tensorflow.python.training.checkpointable_utils import object_metadata
+from tensorflow.python.training.checkpointable.base import Checkpointable
+from tensorflow.python.training.checkpointable.base import NoDependency
+from tensorflow.python.training.checkpointable.util import object_metadata
 
 from tensorflow.python.util.all_util import remove_undocumented
 
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
index a5681ffa61d07ef29d0a0862db9736a210c8e26e..53f4e97f9932104933b3ecf80142e5af82cd487a 100644
--- a/tensorflow/contrib/checkpoint/python/BUILD
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -8,11 +8,34 @@ py_library(
     name = "checkpoint",
     srcs_version = "PY2AND3",
     deps = [
+        ":containers",
         ":split_dependency",
         ":visualize",
     ],
 )
 
+py_library(
+    name = "containers",
+    srcs = ["containers.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = ["//tensorflow/python/training/checkpointable:base"],
+)
+
+py_test(
+    name = "containers_test",
+    srcs = ["containers_test.py"],
+    deps = [
+        ":containers",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/training/checkpointable:base",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "split_dependency",
     srcs = ["split_dependency.py"],
diff --git a/tensorflow/contrib/checkpoint/python/containers.py b/tensorflow/contrib/checkpoint/python/containers.py
new file mode 100644
index 0000000000000000000000000000000000000000..9807abae1f5106bb84f858c3725f096aaa4eaca9
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/containers.py
@@ -0,0 +1,77 @@
+"""Checkpointable data structures."""
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.training.checkpointable import base as checkpointable_lib
+
+
+class UniqueNameTracker(checkpointable_lib.CheckpointableBase):
+  """Adds dependencies on checkpointable objects with name hints.
+
+  Useful for creating dependencies with locally unique names.
+
+  Example usage:
+  ```python
+  class SlotManager(tf.contrib.checkpoint.Checkpointable):
+
+    def __init__(self):
+      # Create a dependency named "slotdeps" on the container.
+      self.slotdeps = tf.contrib.checkpoint.UniqueNameTracker()
+      slotdeps = self.slotdeps
+      slots = []
+      slots.append(slotdeps.track(tfe.Variable(3.), "x"))  # Named "x"
+      slots.append(slotdeps.track(tfe.Variable(4.), "y"))
+      slots.append(slotdeps.track(tfe.Variable(5.), "x"))  # Named "x_1"
+  ```
+  """
+
+  def __init__(self):
+    self._maybe_initialize_checkpointable()
+    self._name_counts = {}
+
+  def track(self, checkpointable, base_name):
+    """Add a dependency on `checkpointable`.
+
+    Args:
+      checkpointable: An object to add a checkpoint dependency on.
+      base_name: A name hint, which is uniquified to determine the dependency
+        name.
+    Returns:
+      `checkpointable`, for chaining.
+    Raises:
+      ValueError: If `checkpointable` is not a checkpointable object.
+    """
+
+    if not isinstance(checkpointable, checkpointable_lib.CheckpointableBase):
+      raise ValueError(
+          ("Expected a checkpointable value, got %s which does not inherit "
+           "from CheckpointableBase.") % (checkpointable,))
+
+    def _format_name(prefix, number):
+      if number > 0:
+        return "%s_%d" % (prefix, number)
+      else:
+        return prefix
+
+    count = self._name_counts.get(base_name, 0)
+    candidate = _format_name(base_name, count)
+    while self._lookup_dependency(candidate) is not None:
+      count += 1
+      candidate = _format_name(base_name, count)
+    self._name_counts[base_name] = count + 1
+    return self._track_checkpointable(checkpointable, name=candidate)
diff --git a/tensorflow/contrib/checkpoint/python/containers_test.py b/tensorflow/contrib/checkpoint/python/containers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..851a80058852bd917aec075b4bf63264318603a7
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/containers_test.py
@@ -0,0 +1,99 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import six
+
+from tensorflow.contrib.checkpoint.python import containers
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
+
+
+class UniqueNameTrackerTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNames(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    x1 = resource_variable_ops.ResourceVariable(2.)
+    x2 = resource_variable_ops.ResourceVariable(3.)
+    x3 = resource_variable_ops.ResourceVariable(4.)
+    y = resource_variable_ops.ResourceVariable(5.)
+    slots = containers.UniqueNameTracker()
+    slots.track(x1, "x")
+    slots.track(x2, "x")
+    slots.track(x3, "x_1")
+    slots.track(y, "y")
+    self.evaluate((x1.initializer, x2.initializer, x3.initializer,
+                   y.initializer))
+    save_root = checkpointable_utils.Checkpoint(slots=slots)
+    save_path = save_root.save(checkpoint_prefix)
+
+    restore_slots = checkpointable.Checkpointable()
+    restore_root = checkpointable_utils.Checkpoint(
+        slots=restore_slots)
+    status = restore_root.restore(save_path)
+    restore_slots.x = resource_variable_ops.ResourceVariable(0.)
+    restore_slots.x_1 = resource_variable_ops.ResourceVariable(0.)
+    restore_slots.x_1_1 = resource_variable_ops.ResourceVariable(0.)
+    restore_slots.y = resource_variable_ops.ResourceVariable(0.)
+    status.assert_consumed().run_restore_ops()
+    self.assertEqual(2., self.evaluate(restore_slots.x))
+    self.assertEqual(3., self.evaluate(restore_slots.x_1))
+    self.assertEqual(4., self.evaluate(restore_slots.x_1_1))
+    self.assertEqual(5., self.evaluate(restore_slots.y))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testExample(self):
+    class SlotManager(checkpointable.Checkpointable):
+
+      def __init__(self):
+        self.slotdeps = containers.UniqueNameTracker()
+        slotdeps = self.slotdeps
+        slots = []
+        slots.append(slotdeps.track(
+            resource_variable_ops.ResourceVariable(3.), "x"))
+        slots.append(slotdeps.track(
+            resource_variable_ops.ResourceVariable(4.), "y"))
+        slots.append(slotdeps.track(
+            resource_variable_ops.ResourceVariable(5.), "x"))
+        self.slots = slots
+
+    manager = SlotManager()
+    self.evaluate([v.initializer for v in manager.slots])
+    checkpoint = checkpointable_utils.Checkpoint(slot_manager=manager)
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpoint.save(checkpoint_prefix)
+    metadata = checkpointable_utils.object_metadata(save_path)
+    dependency_names = []
+    for node in metadata.nodes:
+      for child in node.children:
+        dependency_names.append(child.local_name)
+    six.assertCountEqual(
+        self,
+        dependency_names,
+        ["x", "x_1", "y", "slot_manager", "slotdeps", "save_counter"])
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency.py b/tensorflow/contrib/checkpoint/python/split_dependency.py
index 3aec8c96e90440d6da00d95cffc34bd53ec7164f..7e77453f3d848c2e321ed2ba66917a742d95459a 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 import functools
 
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.training import checkpointable as checkpointable
 from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training.checkpointable import base as checkpointable
 
 
 class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
index f1d9d19b047ee69281cf8bdba38a28dc87947e38..69dc0b9be2d5548852c37552a64a0d31c9557b43 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency_test.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
@@ -23,8 +23,8 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training import checkpointable
-from tensorflow.python.training import checkpointable_utils
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 
 def _split_variable_closure(variable):
diff --git a/tensorflow/contrib/checkpoint/python/visualize.py b/tensorflow/contrib/checkpoint/python/visualize.py
index 9a3b23bb2c30ee601f5f94da31ad182399a04e4f..bac071c4cff383f60b707b6e42c13faf5e0ac948 100644
--- a/tensorflow/contrib/checkpoint/python/visualize.py
+++ b/tensorflow/contrib/checkpoint/python/visualize.py
@@ -18,8 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.training import checkpointable
-from tensorflow.python.training import checkpointable_utils
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 
 def dot_graph_from_checkpoint(save_path):
diff --git a/tensorflow/contrib/checkpoint/python/visualize_test.py b/tensorflow/contrib/checkpoint/python/visualize_test.py
index 1d9ab789235cb964521315b4864563f89745ae75..583e3bc442893d825c337d73fb999d1e586738a1 100644
--- a/tensorflow/contrib/checkpoint/python/visualize_test.py
+++ b/tensorflow/contrib/checkpoint/python/visualize_test.py
@@ -24,11 +24,11 @@ from tensorflow.contrib.checkpoint.python import visualize
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
-from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import adam
-from tensorflow.python.training import checkpointable_utils
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 try:
   import pydot  # pylint: disable=g-import-not-at-top
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 1403483d287041b02dfbf538f7e7ddee11662f47..880fca4ea65608472838baee234e468bef37afb3 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -36,6 +36,8 @@ except ImportError:
 
 
 _GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'
+_DEFAULT_ENV_VARIABLE = 'TPU_NAME'
+_DISCOVERY_SERVICE_URL_ENV_VARIABLE = 'TPU_API_DISCOVERY_URL'
 
 
 class TPUClusterResolver(ClusterResolver):
@@ -70,6 +72,16 @@ class TPUClusterResolver(ClusterResolver):
   def _gkeMaster():
     return os.environ[_GKE_ENV_VARIABLE].split(',')[0]
 
+  @staticmethod
+  def _envVarFallback():
+    if _DEFAULT_ENV_VARIABLE in os.environ:
+      return os.environ[_DEFAULT_ENV_VARIABLE]
+    return None
+
+  @staticmethod
+  def _discoveryUrl():
+    return os.environ.get(_DISCOVERY_SERVICE_URL_ENV_VARIABLE)
+
   def __init__(self,
                tpu=None,
                zone=None,
@@ -78,7 +90,8 @@ class TPUClusterResolver(ClusterResolver):
                coordinator_name=None,
                coordinator_address=None,
                credentials='default',
-               service=None):
+               service=None,
+               discovery_url=None):
     """Creates a new TPUClusterResolver object.
 
     The ClusterResolver will then use the parameters to query the Cloud TPU APIs
@@ -108,6 +121,11 @@ class TPUClusterResolver(ClusterResolver):
       service: The GCE API object returned by the googleapiclient.discovery
         function. If you specify a custom service object, then the credentials
         parameter will be ignored.
+      discovery_url: A URL template that points to the location of
+        the discovery service. It should have two parameters {api} and
+        {apiVersion} that when filled in produce an absolute URL to the
+        discovery document for that service. The environment variable
+        'TPU_API_DISCOVERY_URL' will override this.
 
     Raises:
       ImportError: If the googleapiclient is not installed.
@@ -123,8 +141,11 @@ class TPUClusterResolver(ClusterResolver):
 
     in_gke = self._inGke()
     # When using GKE with Cloud TPUs, the env variable will be set.
-    if tpu is None and in_gke:
-      tpu = self._gkeMaster()
+    if tpu is None:
+      if in_gke:
+        tpu = self._gkeMaster()
+      else:
+        tpu = self._envVarFallback()
 
     self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
     self._job_name = job_name
@@ -154,9 +175,16 @@ class TPUClusterResolver(ClusterResolver):
                           '--upgrade google-api-python-client` to install with '
                           'pip.')
 
-      self._service = discovery.build(
-          'tpu', 'v1alpha1',
-          credentials=self._credentials)
+      final_discovery_url = self._discoveryUrl() or discovery_url
+      if final_discovery_url:
+        self._service = discovery.build(
+            'tpu', 'v1alpha1',
+            credentials=self._credentials,
+            discoveryServiceUrl=final_discovery_url)
+      else:
+        self._service = discovery.build(
+            'tpu', 'v1alpha1',
+            credentials=self._credentials)
     else:
       self._service = service
 
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index 5b3f9be5a11237f9dceebefa1db294efaf7e482d..5fac55fd027fa2d100621e08a09e05cdb3a1b941 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -367,6 +367,10 @@ class TPUClusterResolverTest(test.TestCase):
         compat.as_bytes(TPUClusterResolver._gkeMaster()))
     del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
 
+  def testDiscoveryUrl(self):
+    os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}'
+    self.assertEqual('https://{api}.internal/{apiVersion}',
+                     TPUClusterResolver._discoveryUrl())
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 6468bed4979253be5c20666d26bf24fa479d64a0..fece56c4127de4deebc1404f0eff9747f99ba89f 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -32,52 +32,13 @@ tensorflow/python/feature_column
 tensorflow/python/framework
 tensorflow/python/grappler
 tensorflow/python/keras
-tensorflow/python/keras/activations
 tensorflow/python/keras/applications
-tensorflow/python/keras/applications/densenet
-tensorflow/python/keras/applications/inception_resnet_v2
-tensorflow/python/keras/applications/inception_v3
-tensorflow/python/keras/applications/mobilenet
-tensorflow/python/keras/applications/nasnet
-tensorflow/python/keras/applications/resnet50
-tensorflow/python/keras/applications/vgg16
-tensorflow/python/keras/applications/vgg19
-tensorflow/python/keras/applications/xception
-tensorflow/python/keras/backend
-tensorflow/python/keras/callbacks
-tensorflow/python/keras/constraints
 tensorflow/python/keras/datasets
-tensorflow/python/keras/datasets/boston_housing
-tensorflow/python/keras/datasets/cifar10
-tensorflow/python/keras/datasets/cifar100
-tensorflow/python/keras/datasets/fashion_mnist
-tensorflow/python/keras/datasets/imdb
-tensorflow/python/keras/datasets/mnist
-tensorflow/python/keras/datasets/reuters
-tensorflow/python/keras/estimator
-tensorflow/python/keras/initializers
+tensorflow/python/keras/engine
 tensorflow/python/keras/layers
-tensorflow/python/keras/losses
-tensorflow/python/keras/metrics
-tensorflow/python/keras/models
-tensorflow/python/keras/optimizers
 tensorflow/python/keras/preprocessing
-tensorflow/python/keras/preprocessing/image
-tensorflow/python/keras/preprocessing/sequence
-tensorflow/python/keras/preprocessing/text
-tensorflow/python/keras/regularizers
 tensorflow/python/keras/utils
 tensorflow/python/keras/wrappers
-tensorflow/python/keras/wrappers/scikit_learn
-tensorflow/python/keras/_impl
-tensorflow/python/keras/_impl/keras
-tensorflow/python/keras/_impl/keras/applications
-tensorflow/python/keras/_impl/keras/datasets
-tensorflow/python/keras/_impl/keras/engine
-tensorflow/python/keras/_impl/keras/layers
-tensorflow/python/keras/_impl/keras/preprocessing
-tensorflow/python/keras/_impl/keras/utils
-tensorflow/python/keras/_impl/keras/wrappers
 tensorflow/python/kernel_tests
 tensorflow/python/kernel_tests/boosted_trees
 tensorflow/python/kernel_tests/distributions
@@ -100,6 +61,7 @@ tensorflow/python/summary
 tensorflow/python/summary/writer
 tensorflow/python/tools
 tensorflow/python/training
+tensorflow/python/training/checkpointable
 tensorflow/python/user_ops
 tensorflow/python/util
 tensorflow/python/util/protobuf
@@ -333,6 +295,8 @@ tensorflow/contrib/metrics
 tensorflow/contrib/metrics/python
 tensorflow/contrib/metrics/python/metrics
 tensorflow/contrib/metrics/python/ops
+tensorflow/contrib/mixed_precision
+tensorflow/contrib/mixed_precision/python
 tensorflow/contrib/mpi_collectives/python
 tensorflow/contrib/mpi_collectives/python/ops
 tensorflow/contrib/model_pruning
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
index d63c41db844af243f0c6600b1565635ac9b91cac..cf1ee2ad76f2cc9f58dbe90182a3e17f1edc7ed3 100644
--- a/tensorflow/contrib/cmake/python_protos.txt
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -11,7 +11,6 @@ tensorflow/contrib/mpi
 tensorflow/contrib/mpi_collectives
 tensorflow/contrib/session_bundle
 tensorflow/contrib/tensor_forest/proto
-tensorflow/contrib/tensorboard/graph_explorer/proto
 tensorflow/contrib/tensorboard/plugins/projector
 tensorflow/contrib/tensorboard/plugins/trace
 tensorflow/contrib/tpu/proto
diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake
index c6a15f2ca075c8de96786a580c7ddb89541df5bc..a06bdf78fb011b288d5d7af6488ec6802ff34c35 100644
--- a/tensorflow/contrib/cmake/tf_c.cmake
+++ b/tensorflow/contrib/cmake/tf_c.cmake
@@ -22,8 +22,6 @@ set(tf_c_srcs
     "${tensorflow_source_dir}/tensorflow/c/eager/c_api.cc"
     "${tensorflow_source_dir}/tensorflow/c/eager/c_api.h"
     "${tensorflow_source_dir}/tensorflow/c/eager/tape.h"
-    "${tensorflow_source_dir}/tensorflow/c/eager/runtime.cc"
-    "${tensorflow_source_dir}/tensorflow/c/eager/runtime.h"
     "${tensorflow_source_dir}/tensorflow/c/checkpoint_reader.cc"
     "${tensorflow_source_dir}/tensorflow/c/checkpoint_reader.h"
     "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.cc"
@@ -38,13 +36,15 @@ add_dependencies(
   tf_core_lib
   tf_protos_cc)
 
-add_library(tf_c_python_api OBJECT
-  "${tensorflow_source_dir}/tensorflow/c/python_api.cc"
-  "${tensorflow_source_dir}/tensorflow/c/python_api.h"
-)
-add_dependencies(
-  tf_c_python_api
-  tf_c
-  tf_core_lib
-  tf_core_framework
-  tf_protos_cc)
+if(tensorflow_BUILD_PYTHON_BINDINGS)
+  add_library(tf_c_python_api OBJECT
+    "${tensorflow_source_dir}/tensorflow/c/python_api.cc"
+    "${tensorflow_source_dir}/tensorflow/c/python_api.h"
+  )
+  add_dependencies(
+    tf_c_python_api
+    tf_c
+    tf_core_lib
+    tf_core_framework
+    tf_protos_cc)
+endif()
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index f73da0b8ab18af1eca4c2bd577604595f8b8ec6d..6c90cf398c69c8c1b22ea75e0c407f258e2535f9 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -155,7 +155,7 @@ if (WIN32)
     set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib")
   endif()
 else (WIN32)
-  set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so")
+  set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}")
 endif (WIN32)
 add_custom_target(tf_extension_ops)
 
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 1505d3e2083b5a3446a7f85d59c73816e65e1a2a..2d76bf530a2100b2afa80a16a5d64b6ec51ffc68 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -68,6 +68,7 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/csv_dataset_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc"
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index c4bdb69d828b269e6246777e74c3756ba1c4b96f..894b1ead7688bd951c5c78f613fdb7aae226fe65 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -244,13 +244,11 @@ add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
 # tf_python_op_gen_main library
 ########################################################
 set(tf_python_op_gen_main_srcs
-    "${tensorflow_source_dir}/tensorflow/python/eager/python_eager_op_gen.h"
-    "${tensorflow_source_dir}/tensorflow/python/eager/python_eager_op_gen.cc"
     "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.cc"
-    "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.cc"
-    "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_main.cc"
     "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.h"
+    "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_internal.cc"
     "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_internal.h"
+    "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_main.cc"
 )
 
 add_library(tf_python_op_gen_main OBJECT ${tf_python_op_gen_main_srcs})
@@ -464,12 +462,12 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/eager/pywrap_tfe_src.cc"
     "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.h"
     "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.cc"
-    "${tensorflow_source_dir}/tensorflow/python/eager/python_eager_op_gen.h"
-    "${tensorflow_source_dir}/tensorflow/python/eager/python_eager_op_gen.cc"
     "${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.h"
     "${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.cc"
     "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.h"
     "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.cc"
+    "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_internal.h"
+    "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_internal.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/bfloat16.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/bfloat16.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/numpy.h"
@@ -715,7 +713,7 @@ if(WIN32)
   endif()
 else()
   add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}
                                      ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
 endif()
 
@@ -791,7 +789,6 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
 add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/)
-
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index cffe069aa352f8a6f2c436bc70b62f54e2336ac6..4f957f1e0b46fde5daacbc59657af994e13c42d5 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -44,7 +44,8 @@ UNDNAME = "undname.exe"
 DUMPBIN = "dumpbin.exe"
 
 # Exclude if matched
-EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::")
+EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::|Internal|"
+                        r"python_op_gen_internal|grappler")
 
 # Include if matched before exclude
 INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|"
@@ -56,6 +57,10 @@ INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|"
                            r"tensorflow::ops::internal::Enter|"
                            r"tensorflow::strings::internal::AppendPieces|"
                            r"tensorflow::strings::internal::CatPieces|"
+                           r"tensorflow::errors::Internal|"
+                           r"tensorflow::Tensor::CopyFromInternal|"
+                           r"tensorflow::kernel_factory::"
+                           r"OpKernelRegistrar::InitInternal|"
                            r"tensorflow::io::internal::JoinPathImpl")
 
 # Include if matched after exclude
@@ -64,7 +69,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|"
                         r"tensorflow::|"
                         r"functor::|"
                         r"\?nsync_|"
-                        r"perftools::gputools")
+                        r"stream_executor::")
 
 # We want to identify data members explicitly in the DEF file, so that no one
 # can implicitly link against the DLL if they use one of the variables exported
diff --git a/tensorflow/contrib/coder/kernels/range_coder_ops_test.cc b/tensorflow/contrib/coder/kernels/range_coder_ops_test.cc
index ae4d9d2836a0f89a9765004a85bc3c292b0e484f..81b36ca902b82220d9c5282a1ec72324a6d95922 100644
--- a/tensorflow/contrib/coder/kernels/range_coder_ops_test.cc
+++ b/tensorflow/contrib/coder/kernels/range_coder_ops_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/fake_input.h"
-#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op.h"
diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck.py b/tensorflow/contrib/coder/python/layers/entropybottleneck.py
index f039cb0f5265b920200f63c5bd5ebeb4e23826be..0fbe3081af0b4de7f116918b3f49efe91a2d83bd 100644
--- a/tensorflow/contrib/coder/python/layers/entropybottleneck.py
+++ b/tensorflow/contrib/coder/python/layers/entropybottleneck.py
@@ -28,7 +28,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import engine
+from tensorflow.python.keras import engine
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import init_ops
diff --git a/tensorflow/contrib/compiler/jit_test.py b/tensorflow/contrib/compiler/jit_test.py
index b2f678fb29cedd3ec32f0460354cc4ac18fb63d3..a56a01b16356e12b83344474c7fbe427530f0c74 100644
--- a/tensorflow/contrib/compiler/jit_test.py
+++ b/tensorflow/contrib/compiler/jit_test.py
@@ -24,7 +24,6 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -170,7 +169,6 @@ class JITTest(test.TestCase):
       self.assertEqual(b"jit_scope_0", func_attrs["_XlaScope"].s)
 
 
-@test_util.with_c_api
 class CompilationEnabledInGradientTest(test.TestCase):
 
   def testCompilationInGradient(self):
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 33ddfb8dee1c446f22c7d0071f9a0e2bbac6bdad..8285ea04926d3a24e9c22bd6d69eb7a48f5e3a85 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -54,11 +54,11 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import adagrad
 from tensorflow.python.training import adam
-from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import momentum
 from tensorflow.python.training import rmsprop
 from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 
 CUDNN_LSTM = cudnn_rnn_ops.CUDNN_LSTM
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 73a961992e19fabec5d0f75be1b52dbba20eb7af..ed0a26bbd87eeb5bd005de8f9d054d315e378529 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -24,7 +24,7 @@ from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.keras._impl.keras.engine import base_layer
+from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import init_ops
@@ -33,8 +33,8 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.training import checkpointable as checkpointable_lib
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import base as checkpointable_lib
 
 CUDNN_RNN_UNIDIRECTION = "unidirectional"
 CUDNN_RNN_BIDIRECTION = "bidirectional"
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 077cbba9d2ae41a83f6c358a63ae27aec5741e2c..a25aa85251083c24ca6685c4ffef267955f66f63 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -23,6 +23,8 @@ removing existing functionality.
 See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 
 @@Counter
+@@CheckpointInputPipelineHook
+@@CsvDataset
 @@SqlDataset
 
 @@assert_element_shape
@@ -72,8 +74,10 @@ from tensorflow.contrib.data.python.ops.grouping import group_by_window
 from tensorflow.contrib.data.python.ops.interleave_ops import parallel_interleave
 from tensorflow.contrib.data.python.ops.interleave_ops import sample_from_datasets
 from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave
+from tensorflow.contrib.data.python.ops.iterator_ops import CheckpointInputPipelineHook
 from tensorflow.contrib.data.python.ops.iterator_ops import make_saveable_from_iterator
 from tensorflow.contrib.data.python.ops.prefetching_ops import prefetch_to_device
+from tensorflow.contrib.data.python.ops.readers import CsvDataset
 from tensorflow.contrib.data.python.ops.readers import make_batched_features_dataset
 from tensorflow.contrib.data.python.ops.readers import make_csv_dataset
 from tensorflow.contrib.data.python.ops.readers import read_batch_features
diff --git a/tensorflow/contrib/data/kernels/BUILD b/tensorflow/contrib/data/kernels/BUILD
index c56910c7833d4c54fa8db27cd061b404013f3f54..7b69e10441eba3e38c979d5715c16699ac2710ed 100644
--- a/tensorflow/contrib/data/kernels/BUILD
+++ b/tensorflow/contrib/data/kernels/BUILD
@@ -29,6 +29,16 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "csv_dataset_op",
+    srcs = ["csv_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
 cc_library(
     name = "ignore_errors_dataset_op",
     srcs = ["ignore_errors_dataset_op.cc"],
@@ -63,6 +73,7 @@ cc_library(
 cc_library(
     name = "dataset_kernels",
     deps = [
+        ":csv_dataset_op",
         ":directed_interleave_dataset_op",
         ":ignore_errors_dataset_op",
         ":prefetching_kernels",
diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..76e54a284e07ec1bab9b0f364a44997a39bce78a
--- /dev/null
+++ b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
@@ -0,0 +1,508 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/parsing_ops.cc.
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
+#include "tensorflow/core/lib/io/random_inputstream.h"
+
+namespace tensorflow {
+namespace {
+
+class CSVDatasetOp : public DatasetOpKernel {
+ public:
+  explicit CSVDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    const Tensor* filenames_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("filenames", &filenames_tensor));
+    OP_REQUIRES(
+        ctx, filenames_tensor->dims() <= 1,
+        errors::InvalidArgument("`filenames` must be a scalar or a vector."));
+
+    OpInputList record_defaults_list;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input_list("record_defaults", &record_defaults_list));
+    for (int i = 0; i < record_defaults_list.size(); ++i) {
+      OP_REQUIRES(ctx, record_defaults_list[i].NumElements() < 2,
+                  errors::InvalidArgument(
+                      "There should only be 1 default per field but field ", i,
+                      " has ", record_defaults_list[i].NumElements()));
+    }
+
+    const Tensor* select_cols_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("select_cols", &select_cols_tensor));
+    OP_REQUIRES(ctx, select_cols_tensor->dims() == 1,
+                errors::InvalidArgument("`select_cols` must be a vector."));
+
+    int64 buffer_size;
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
+    OP_REQUIRES(ctx, buffer_size > 0,
+                errors::InvalidArgument("buffer_size should be positive"));
+
+    string delim;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<string>(ctx, "field_delim", &delim));
+    OP_REQUIRES(ctx, delim.size() == 1,
+                errors::InvalidArgument("field_delim should be only 1 char"));
+
+    bool header;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<bool>(ctx, "header", &header));
+
+    bool use_quote_delim;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<bool>(ctx, "use_quote_delim",
+                                                  &use_quote_delim));
+    string na_value;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<string>(ctx, "na_value", &na_value));
+
+    std::vector<Tensor> record_defaults;
+    record_defaults.reserve(record_defaults_list.size());
+    for (const Tensor& t : record_defaults_list) {
+      record_defaults.push_back(t);
+    }
+
+    std::vector<string> filenames;
+    filenames.reserve(filenames_tensor->NumElements());
+    for (int i = 0; i < filenames_tensor->NumElements(); ++i) {
+      filenames.push_back(filenames_tensor->flat<string>()(i));
+    }
+
+    std::vector<int64> select_cols;
+    select_cols.reserve(select_cols_tensor->NumElements());
+    for (int i = 0; i < select_cols_tensor->NumElements(); ++i) {
+      select_cols.push_back(select_cols_tensor->flat<int64>()(i));
+    }
+    OP_REQUIRES(
+        ctx, output_types_.size() == select_cols.size() || select_cols.empty(),
+        errors::InvalidArgument("select_cols should match output size"));
+    for (int i = 1; i < select_cols.size(); i++) {
+      OP_REQUIRES(ctx, select_cols[i - 1] < select_cols[i],
+                  errors::InvalidArgument(
+                      "select_cols should be strictly increasing indices"));
+    }
+    OP_REQUIRES(
+        ctx, select_cols.empty() || select_cols.front() >= 0,
+        errors::InvalidArgument("select_cols should be non-negative indices"));
+    bool select_all_cols = select_cols.empty();
+
+    *output = new Dataset(
+        ctx, std::move(filenames), header, buffer_size, output_types_,
+        output_shapes_, std::move(record_defaults), std::move(select_cols),
+        select_all_cols, use_quote_delim, delim[0], std::move(na_value));
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, std::vector<string> filenames, bool header,
+            int64 buffer_size, const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes,
+            std::vector<Tensor> record_defaults, std::vector<int64> select_cols,
+            bool select_all_cols, bool use_quote_delim, char delim,
+            string na_value)
+        : GraphDatasetBase(ctx),
+          filenames_(std::move(filenames)),
+          header_(header),
+          buffer_size_(buffer_size),
+          out_type_(output_types),
+          output_shapes_(output_shapes),
+          record_defaults_(std::move(record_defaults)),
+          select_cols_(std::move(select_cols)),
+          select_all_cols_(select_all_cols),
+          use_quote_delim_(use_quote_delim),
+          delim_(delim),
+          na_value_(std::move(na_value)) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::CSV")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override { return out_type_; }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override { return "CSVDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      // TODO(rachelim): Implement this
+      std::vector<Node*> input_tensors;
+      TF_RETURN_IF_ERROR(b->AddDataset(this, input_tensors, output));
+      return errors::Unimplemented("CSVDataset: AsGraphDefInternal");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          // We are currently processing a file, so try to read the next record
+          if (buffered_input_stream_) {
+            Status s = ReadRecord(ctx, out_tensors);
+            if (s.ok() || !errors::IsOutOfRange(s)) {
+              // Not at the end of file, return OK or non-EOF errors to caller.
+              *end_of_sequence = false;
+              return s;
+            }
+            // We have reached the end of the current file, so maybe
+            // move on to next file.
+            ResetStreamsLocked();
+            ++current_file_index_;
+          }
+          // Iteration ends when there are no more files to process.
+          if (current_file_index_ == dataset()->filenames_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+        } while (true);
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        // TODO(rachelim): Implement save
+        return errors::Unimplemented("CSVDataset: SaveInternal");
+      }
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        // TODO(rachelim): Implement restore
+        return errors::Unimplemented("CSVDataset: RestoreInternal");
+      }
+
+     private:
+      // Reads a record by parsing the input buffer, and converting extracted
+      // fields to output tensors as we go.
+      Status ReadRecord(IteratorContext* ctx, std::vector<Tensor>* out_tensors)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        // Extracts fields from line(s) from the buffered input stream.
+        out_tensors->reserve(dataset()->record_defaults_.size());
+
+        string input;
+        TF_RETURN_IF_ERROR(buffered_input_stream_->ReadLine(&input));
+
+        size_t current_idx = 0;
+        size_t num_fields_parsed = 0;
+        size_t selector_idx = 0;  // Keep track of index into select_cols
+
+        while (current_idx < input.size()) {
+          // In each iteration, parse one field
+          if (input[current_idx] == '\n' || input[current_idx] == '\r') {
+            // This should never happen, because buffered input reader splits
+            // input on newlines.
+            return errors::InvalidArgument("Parsing error.");
+          }
+
+          bool quoted = false;
+          bool include =
+              (dataset()->select_all_cols_ ||
+               dataset()->select_cols_[selector_idx] == num_fields_parsed);
+
+          if (dataset()->use_quote_delim_ && input[current_idx] == '"') {
+            quoted = true;
+            current_idx++;
+          }
+
+          // Parse the body of the field
+          string field;
+          if (!quoted) {
+            while (current_idx < input.size() &&
+                   input[current_idx] != dataset()->delim_) {
+              if ((dataset()->use_quote_delim_ && input[current_idx] == '"') ||
+                  input[current_idx] == '\n' || input[current_idx] == '\r') {
+                return errors::InvalidArgument(
+                    "Unquoted fields cannot have quotes/CRLFs inside");
+              }
+              if (include) field += input[current_idx];
+              current_idx++;
+            }  // Exit condition: end of input, or current index at delim
+
+            // Go to next field or the end
+            current_idx++;
+          } else {
+            // Quoted field needs to be ended with '"' and delim or end
+            while (true) {
+              if (current_idx >= input.size() - 1 || input.empty()) {
+                if (current_idx == input.size() - 1 &&
+                    input[current_idx] == '"') {
+                  // We're at the end of the input, and the quote terminates the
+                  // record. Go to end.
+                  current_idx++;
+                  break;
+                }
+                // If there's no terminating quote, it means our buffered record
+                // line reader split a record up. This can happen if there is a
+                // newline encased in quotes. The next line is also part of the
+                // record, so we read it and reset the index.
+                if (include && current_idx == input.size() - 1) {
+                  // TODO(rachelim): Instead of building up a string, keep track
+                  //  of terminal indices (or starting char* and length)
+                  // Also look into using /lib/strings/Scanner
+                  field += input[current_idx];
+                }
+                if (include) {
+                  field += '\n';
+                }
+                current_idx = 0;
+                Status s = buffered_input_stream_->ReadLine(&input);
+                if (!s.ok()) {
+                  return errors::InvalidArgument(
+                      "Quoted field has to end with quote followed by delim, "
+                      "CRLF, or EOF");
+                }
+              } else if (input[current_idx] == '"' &&
+                         input[current_idx + 1] == dataset()->delim_) {
+                // End of field, go to next field or end
+                current_idx += 2;
+                break;
+              } else if (input[current_idx] == '"') {
+                // Current char is a quote. Since we're not at end of field,
+                // the next character must also be a quote.
+                if (input[current_idx + 1] != '"') {
+                  return errors::InvalidArgument(
+                      "Quote inside a string has to be escaped by another "
+                      "quote");
+                }
+                if (include) field += '"';
+                current_idx += 2;
+              } else {
+                if (include) field += input[current_idx];
+                current_idx++;
+              }
+            }
+          }
+
+          num_fields_parsed++;
+
+          if (include) {
+            // Add the tensor to the result
+            TF_RETURN_IF_ERROR(FieldToOutput(ctx, std::move(field),
+                                             selector_idx, out_tensors));
+            selector_idx++;
+            // Terminate early if we have all the fields we want
+            if (selector_idx == dataset()->select_cols_.size())
+              return Status::OK();
+          }
+        }  // Exit condition: current_idx has reached the end of record
+
+        // Check if the last field is empty, and include it if necessary
+        bool include =
+            (dataset()->select_all_cols_ ||
+             dataset()->select_cols_[selector_idx] == num_fields_parsed);
+        if (include && !input.empty() &&
+            input[input.size() - 1] == dataset()->delim_) {
+          TF_RETURN_IF_ERROR(
+              FieldToOutput(ctx, string(), selector_idx, out_tensors));
+        }
+
+        // Check that number of fields matches
+        if (out_tensors->size() != dataset()->out_type_.size()) {
+          return errors::InvalidArgument("Expect ", dataset()->out_type_.size(),
+                                         " fields but have ",
+                                         out_tensors->size(), " in record");
+        }
+        return Status::OK();
+      }
+
+      // Given a string field, and its index in the output,
+      // converts it to a Tensor of the right type and adds it to the
+      // out_tensors vector.
+      Status FieldToOutput(IteratorContext* ctx, string field,
+                           size_t output_idx,
+                           std::vector<Tensor>* out_tensors) {
+        if (output_idx >= dataset()->out_type_.size()) {
+          // We can get here if we're selecting all columns, but the number of
+          // fields exceeds the number of defaults provided
+          return errors::InvalidArgument("Expect ", dataset()->out_type_.size(),
+                                         " fields but have more in record");
+        }
+        const DataType& dtype = dataset()->out_type_[output_idx];
+        Tensor component(ctx->allocator({}), dtype, {});
+        if ((field.empty() || field == dataset()->na_value_) &&
+            dataset()->record_defaults_[output_idx].NumElements() != 1) {
+          // If the field is empty or NA value, and default is not given,
+          // report error.
+          return errors::InvalidArgument("Field ", output_idx,
+                                         " is required but missing in record!");
+        }
+
+        switch (dtype) {
+          // For each case, if the field is empty, we use the default.
+          // Otherwise, we convert it to the right type.
+          case DT_INT32: {
+            if (field.empty() || field == dataset()->na_value_) {
+              component.scalar<int32>()() =
+                  dataset()->record_defaults_[output_idx].flat<int32>()(0);
+            } else {
+              int32 value;
+              if (!strings::safe_strto32(field, &value)) {
+                return errors::InvalidArgument(
+                    "Field ", output_idx,
+                    " in record is not a valid int32: ", field);
+              }
+              component.scalar<int32>()() = value;
+            }
+            break;
+          }
+          case DT_INT64: {
+            if (field.empty() || field == dataset()->na_value_) {
+              component.scalar<int64>()() =
+                  dataset()->record_defaults_[output_idx].flat<int64>()(0);
+            } else {
+              int64 value;
+              if (!strings::safe_strto64(field, &value)) {
+                return errors::InvalidArgument(
+                    "Field ", output_idx,
+                    " in record is not a valid int64: ", field);
+              }
+              component.scalar<int64>()() = value;
+            }
+            break;
+          }
+          case DT_FLOAT: {
+            if (field.empty() || field == dataset()->na_value_) {
+              component.scalar<float>()() =
+                  dataset()->record_defaults_[output_idx].flat<float>()(0);
+            } else {
+              float value;
+              if (!strings::safe_strtof(field.c_str(), &value)) {
+                return errors::InvalidArgument(
+                    "Field ", output_idx,
+                    " in record is not a valid float: ", field);
+              }
+              component.scalar<float>()() = value;
+            }
+            break;
+          }
+          case DT_DOUBLE: {
+            if (field.empty() || field == dataset()->na_value_) {
+              component.scalar<double>()() =
+                  dataset()->record_defaults_[output_idx].flat<double>()(0);
+            } else {
+              double value;
+              if (!strings::safe_strtod(field.c_str(), &value)) {
+                return errors::InvalidArgument(
+                    "Field ", output_idx,
+                    " in record is not a valid double: ", field);
+              }
+              component.scalar<double>()() = value;
+            }
+            break;
+          }
+          case DT_STRING: {
+            if (field.empty() || field == dataset()->na_value_) {
+              component.scalar<string>()() =
+                  dataset()->record_defaults_[output_idx].flat<string>()(0);
+            } else {
+              component.scalar<string>()() = std::move(field);
+            }
+            break;
+          }
+          default:
+            return errors::InvalidArgument("csv: data type ", dtype,
+                                           " not supported in field ",
+                                           output_idx);
+        }
+        out_tensors->push_back(std::move(component));
+        return Status::OK();
+      }
+
+      // Sets up reader streams to read from the file at `current_file_index_`.
+      Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (current_file_index_ >= dataset()->filenames_.size()) {
+          return errors::InvalidArgument(
+              "current_file_index_:", current_file_index_,
+              " >= filenames_.size():", dataset()->filenames_.size());
+        }
+
+        // Actually move on to next file.
+        TF_RETURN_IF_ERROR(env->NewRandomAccessFile(
+            dataset()->filenames_[current_file_index_], &file_));
+        input_stream_.reset(
+            new io::RandomAccessInputStream(file_.get(), false));
+        // TODO(rachelim): Maintain our own buffer so we don't read every record
+        //   twice
+        buffered_input_stream_.reset(new io::BufferedInputStream(
+            input_stream_.get(), dataset()->buffer_size_, false));
+        if (dataset()->header_) {
+          // Ignore header line
+          string str;
+          Status s = buffered_input_stream_->ReadLine(&str);
+          if (errors::IsOutOfRange(s)) {
+            return errors::InvalidArgument("Can't read header of empty file");
+          }
+        }
+        return Status::OK();
+      }
+
+      // Resets all reader streams.
+      void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        input_stream_.reset();
+        buffered_input_stream_.reset();
+        file_.reset();
+      }
+
+      mutex mu_;
+      std::unique_ptr<io::RandomAccessInputStream> input_stream_
+          GUARDED_BY(mu_);
+      std::unique_ptr<io::BufferedInputStream> buffered_input_stream_
+          GUARDED_BY(mu_);
+      size_t current_file_index_ GUARDED_BY(mu_) = 0;
+      std::unique_ptr<RandomAccessFile> file_
+          GUARDED_BY(mu_);  // must outlive input_stream_
+    };                      // class Iterator
+
+    const std::vector<string> filenames_;
+    const bool header_;
+    const int64 buffer_size_;
+    const DataTypeVector out_type_;
+    const std::vector<PartialTensorShape> output_shapes_;
+    const std::vector<Tensor> record_defaults_;
+    const std::vector<int64> select_cols_;
+    const bool select_all_cols_;
+    const bool use_quote_delim_;
+    const char delim_;
+    const string na_value_;
+  };  // class Dataset
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};  // class CSVDatasetOp
+
+// Register the kernel implementation for CSVDataset.
+REGISTER_KERNEL_BUILDER(Name("CSVDataset").Device(DEVICE_CPU), CSVDatasetOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
index 137deb63527f0bdde7da8d5be83ed038f430e581..f271d269ab1b9339de4657e459dcbbd462890f0a 100644
--- a/tensorflow/contrib/data/ops/dataset_ops.cc
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -34,6 +34,40 @@ data_input_datasets: `N` datasets with the same type that will be interleaved
   according to the values of `selector_input_dataset`.
 )doc");
 
+REGISTER_OP("CSVDataset")
+    .Input("filenames: string")
+    .Input("buffer_size: int64")
+    .Input("header: bool")
+    .Input("field_delim: string")
+    .Input("use_quote_delim: bool")
+    .Input("na_value: string")
+    .Input("select_cols: int64")
+    .Input("record_defaults: output_types")
+    .Output("handle: variant")
+    .Attr("output_types: list({float,double,int32,int64,string}) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      // `buffer_size`, `header`, `field_delim`, `use_quote_delim`,
+      // `na_value` must be scalars
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      // `select_cols` must be a vector
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 1, &unused));
+      // `record_defaults` must be a list of scalars...?
+      for (size_t i = 7; i < c->num_inputs(); ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &unused));
+      }
+      return shape_inference::ScalarShape(c);
+    });
+
 REGISTER_OP("IgnoreErrorsDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 6017e27e731e3e8bcdee516ea291b17cd0782e63..f5082228e885d065e659abf208ca7b94bb4999a5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -11,7 +11,10 @@ py_test(
     size = "medium",
     srcs = ["batch_dataset_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_oss",  # (b/79552534)
+        "no_pip",
+    ],
     deps = [
         ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:batching",
@@ -117,6 +120,19 @@ py_library(
     ],
 )
 
+py_test(
+    name = "csv_dataset_op_test",
+    size = "small",
+    srcs = ["csv_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":dataset_serialization_test",
+        "//tensorflow/contrib/data/python/ops:readers",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "filter_dataset_op_test",
     size = "small",
@@ -287,6 +303,7 @@ py_test(
     name = "reader_dataset_ops_test",
     size = "medium",
     srcs = ["reader_dataset_ops_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
@@ -301,6 +318,7 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lib",
         "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:iterator_ops",
         "//third_party/py/numpy",
@@ -411,6 +429,7 @@ py_test(
     srcs = ["sql_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:readers",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 6588fd04acb02790f5002c2e983253c3ed0504cf..2568b899d7ea1be685036ad8af93f584f861c951 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -427,7 +427,9 @@ class BatchDatasetTest(test.TestCase):
     self.assertEqual([None], dataset.output_shapes[1][0].as_list())
     self.assertEqual([None, 30], dataset.output_shapes[1][1].as_list())
 
-  def _testMapAndBatchDatasetHelper(self, num_parallel_batches=1):
+  def _testMapAndBatchDatasetHelper(self,
+                                    num_parallel_calls=None,
+                                    num_parallel_batches=None):
     """Test a dataset that maps a TF function across its input elements."""
     # The pipeline is TensorSliceDataset ->
     # RepeatDataset(count) -> MapAndBatchDataset(square_3, batch_size).
@@ -446,6 +448,7 @@ class BatchDatasetTest(test.TestCase):
             batching.map_and_batch(
                 map_func=_map_fn,
                 batch_size=batch_size,
+                num_parallel_calls=num_parallel_calls,
                 num_parallel_batches=num_parallel_batches))
         .make_initializable_iterator())
     init_op = iterator.initializer
@@ -497,12 +500,18 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(init_op, feed_dict={count: 14, batch_size: 0})
 
-  def testMapAndBatchDataset(self):
+  def testMapAndBatch(self):
     return self._testMapAndBatchDatasetHelper()
 
-  def testMapAndBatchDatasetWithParallelBatching(self):
+  def testMapAndBatchWithParallelBatches(self):
     return self._testMapAndBatchDatasetHelper(num_parallel_batches=10)
 
+  def testMapAndBatchWithSequentialCalls(self):
+    return self._testMapAndBatchDatasetHelper(num_parallel_calls=1)
+
+  def testMapAndBatchWithParallelCalls(self):
+    return self._testMapAndBatchDatasetHelper(num_parallel_calls=2)
+
   def _testMapAndBatchPartialBatchHelper(self, drop_remainder=False):
     iterator = (
         dataset_ops.Dataset.range(10).apply(
@@ -682,7 +691,7 @@ class UnbatchDatasetSerializationTest(
 class MapAndBatchDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
-  def testSerializationCore(self):
+  def testNumParallelBatches(self):
     range_size = 11
     num_repeats = 2
     batch_size = 5
@@ -709,6 +718,33 @@ class MapAndBatchDatasetSerializationTest(
     self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
                         num_outputs_drop_remainder)
 
+  def testNumParallelCalls(self):
+    range_size = 11
+    num_repeats = 2
+    batch_size = 5
+    total_outputs = range_size * num_repeats
+    num_outputs_drop_remainder = total_outputs // batch_size
+    num_outputs_keep_remainder = int(math.ceil(total_outputs / batch_size))
+    num_parallel_calls = 7
+
+    def build_ds(range_start, drop_remainder=False):
+
+      def _map_fn(x):
+        return math_ops.square(x)
+
+      return dataset_ops.Dataset.range(
+          range_start, range_start + range_size).repeat(num_repeats).apply(
+              batching.map_and_batch(
+                  map_func=_map_fn,
+                  batch_size=batch_size,
+                  num_parallel_calls=num_parallel_calls,
+                  drop_remainder=drop_remainder))
+
+    self.run_core_tests(lambda: build_ds(10), lambda: build_ds(15),
+                        num_outputs_keep_remainder)
+    self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
+                        num_outputs_drop_remainder)
+
 
 class PaddedBatchDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
diff --git a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..641a389c033687ebe081963182390b00230e4cb5
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
@@ -0,0 +1,378 @@
+#  Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for CsvDatasetOp."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+import time
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import readers
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import readers as core_readers
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_parsing_ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
+
+
+class CsvDatasetOpTest(test.TestCase):
+
+  def _assert_datasets_equal(self, g, ds1, ds2):
+    assert ds1.output_shapes == ds2.output_shapes, ('output_shapes differ: %s, '
+                                                    '%s') % (ds1.output_shapes,
+                                                             ds2.output_shapes)
+    assert ds1.output_types == ds2.output_types
+    assert ds1.output_classes == ds2.output_classes
+    next1 = ds1.make_one_shot_iterator().get_next()
+    next2 = ds2.make_one_shot_iterator().get_next()
+    with self.test_session(graph=g) as sess:
+      # Run through datasets and check that outputs match, or errors match.
+      while True:
+        try:
+          op1 = sess.run(next1)
+        except (errors.OutOfRangeError, ValueError) as e:
+          # If op1 throws an exception, check that op2 throws same exception.
+          with self.assertRaises(type(e)):
+            sess.run(next2)
+          break
+        op2 = sess.run(next2)
+        self.assertAllEqual(op1, op2)
+
+  def setup_files(self, inputs):
+    filenames = []
+    for i, ip in enumerate(inputs):
+      fn = os.path.join(self.get_temp_dir(), 'temp_%d.txt' % i)
+      with open(fn, 'w') as f:
+        f.write('\n'.join(ip))
+      filenames.append(fn)
+    return filenames
+
+  def _make_test_datasets(self, inputs, **kwargs):
+    # Test by comparing its output to what we could get with map->decode_csv
+    filenames = self.setup_files(inputs)
+    dataset_expected = core_readers.TextLineDataset(filenames)
+    dataset_expected = dataset_expected.map(
+        lambda l: gen_parsing_ops.decode_csv(l, **kwargs))
+    dataset_actual = readers.CsvDataset(filenames, **kwargs)
+    return (dataset_actual, dataset_expected)
+
+  def _test_by_comparison(self, inputs, **kwargs):
+    """Checks that CsvDataset is equiv to TextLineDataset->map(decode_csv)."""
+    with ops.Graph().as_default() as g:
+      dataset_actual, dataset_expected = self._make_test_datasets(
+          inputs, **kwargs)
+      self._assert_datasets_equal(g, dataset_actual, dataset_expected)
+
+  def _test_dataset(self,
+                    inputs,
+                    expected_output=None,
+                    expected_err_re=None,
+                    **kwargs):
+    """Checks that elements produced by CsvDataset match expected output."""
+    # Convert str type because py3 tf strings are bytestrings
+    filenames = self.setup_files(inputs)
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = readers.CsvDataset(filenames, **kwargs)
+        nxt = dataset.make_one_shot_iterator().get_next()
+        if expected_err_re is None:
+          # Verify that output is expected, without errors
+          expected_output = [[
+              v.encode('utf-8') if isinstance(v, str) else v for v in op
+          ] for op in expected_output]
+          for value in expected_output:
+            op = sess.run(nxt)
+            self.assertAllEqual(op, value)
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(nxt)
+        else:
+          # Verify that OpError is produced as expected
+          with self.assertRaisesOpError(expected_err_re):
+            while True:
+              try:
+                sess.run(nxt)
+              except errors.OutOfRangeError:
+                break
+
+  def testCsvDataset_floatRequired(self):
+    record_defaults = [[]] * 4
+    inputs = [['1,2,3,4']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+  def testCsvDataset_int(self):
+    record_defaults = [[0]] * 4
+    inputs = [['1,2,3,4', '5,6,7,8']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+  def testCsvDataset_float(self):
+    record_defaults = [[0.0]] * 4
+    inputs = [['1.0,2.1,3.2,4.3', '5.4,6.5,7.6,8.7']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+  def testCsvDataset_string(self):
+    record_defaults = [['']] * 4
+    inputs = [['1.0,2.1,hello,4.3', '5.4,6.5,goodbye,8.7']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+  def testCsvDataset_withQuoted(self):
+    record_defaults = [['']] * 4
+    inputs = [['1.0,2.1,"hello, it is me",4.3', '5.4,6.5,goodbye,8.7']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+  def testCsvDataset_mixedTypes(self):
+    record_defaults = [
+        constant_op.constant([], dtype=dtypes.int32),
+        constant_op.constant([], dtype=dtypes.float32),
+        constant_op.constant([], dtype=dtypes.string),
+        constant_op.constant([], dtype=dtypes.float64)
+    ]
+    inputs = [['1,2.1,3.2,4.3', '5,6.5,7.6,8.7']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+  def testCsvDataset_withUseQuoteDelimFalse(self):
+    record_defaults = [['']] * 4
+    inputs = [['1,2,"3,4"', '"5,6",7,8']]
+    self._test_by_comparison(
+        inputs, record_defaults=record_defaults, use_quote_delim=False)
+
+  def testCsvDataset_withFieldDelim(self):
+    record_defaults = [[0]] * 4
+    inputs = [['1:2:3:4', '5:6:7:8']]
+    self._test_by_comparison(
+        inputs, record_defaults=record_defaults, field_delim=':')
+
+  def testCsvDataset_withEmptyValues(self):
+    record_defaults = [[0]] * 4
+    inputs = [['1,,3,4', ',6,7,8']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+  def testCsvDataset_withNaValue(self):
+    record_defaults = [[0]] * 4
+    inputs = [['1,NA,3,4', 'NA,6,7,8']]
+    self._test_by_comparison(
+        inputs, record_defaults=record_defaults, na_value='NA')
+
+  def testCsvDataset_withSelectCols(self):
+    record_defaults = [[0]] * 2
+    inputs = [['1,2,3,4', '5,6,7,8']]
+    self._test_by_comparison(
+        inputs, record_defaults=record_defaults, select_cols=[1, 2])
+
+  def testCsvDataset_withSelectColsTooHigh(self):
+    record_defaults = [[0]] * 2
+    inputs = [['1,2,3,4', '5,6,7,8']]
+    self._test_dataset(
+        inputs,
+        expected_err_re='Expect 2 fields but have 1 in record',
+        record_defaults=record_defaults,
+        select_cols=[3, 4])
+
+  def testCsvDataset_withMultipleFiles(self):
+    record_defaults = [[0]] * 4
+    inputs = [['1,2,3,4', '5,6,7,8'], ['5,6,7,8']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+  def testCsvDataset_withNewLine(self):
+    # In this case, we expect it to behave differently from
+    # TextLineDataset->map(decode_csv) since that flow has bugs
+    record_defaults = [['']] * 4
+    inputs = [['a,b,"""c""\n0","d\ne"', 'f,g,h,i']]
+    expected = [['a', 'b', '"c"\n0', 'd\ne'], ['f', 'g', 'h', 'i']]
+    self._test_dataset(inputs, expected, record_defaults=record_defaults)
+
+  def testCsvDataset_withMultipleNewLines(self):
+    # In this case, we expect it to behave differently from
+    # TextLineDataset->map(decode_csv) since that flow has bugs
+    record_defaults = [['']] * 4
+    inputs = [['a,"b\n\nx","""c""\n \n0","d\ne"', 'f,g,h,i']]
+    expected = [['a', 'b\n\nx', '"c"\n \n0', 'd\ne'], ['f', 'g', 'h', 'i']]
+    self._test_dataset(inputs, expected, record_defaults=record_defaults)
+
+  def testCsvDataset_withLeadingAndTrailingSpaces(self):
+    record_defaults = [[0.0]] * 4
+    inputs = [['0, 1, 2, 3']]
+    expected = [[0.0, 1.0, 2.0, 3.0]]
+    self._test_dataset(inputs, expected, record_defaults=record_defaults)
+
+  def testCsvDataset_errorWithMissingDefault(self):
+    record_defaults = [[]] * 2
+    inputs = [['0,']]
+    self._test_dataset(
+        inputs,
+        expected_err_re='Field 1 is required but missing in record!',
+        record_defaults=record_defaults)
+
+  def testCsvDataset_errorWithFewerDefaultsThanFields(self):
+    record_defaults = [[0.0]] * 2
+    inputs = [['0,1,2,3']]
+    self._test_dataset(
+        inputs,
+        expected_err_re='Expect 2 fields but have more in record',
+        record_defaults=record_defaults)
+
+  def testCsvDataset_errorWithMoreDefaultsThanFields(self):
+    record_defaults = [[0.0]] * 5
+    inputs = [['0,1,2,3']]
+    self._test_dataset(
+        inputs,
+        expected_err_re='Expect 5 fields but have 4 in record',
+        record_defaults=record_defaults)
+
+  def testCsvDataset_withHeader(self):
+    record_defaults = [[0]] * 2
+    inputs = [['col1,col2', '1,2']]
+    expected = [[1, 2]]
+    self._test_dataset(
+        inputs,
+        expected,
+        record_defaults=record_defaults,
+        header=True,
+    )
+
+  def testCsvDataset_withHeaderAndNoRecords(self):
+    record_defaults = [[0]] * 2
+    inputs = [['col1,col2']]
+    expected = []
+    self._test_dataset(
+        inputs,
+        expected,
+        record_defaults=record_defaults,
+        header=True,
+    )
+
+  def testCsvDataset_errorWithHeaderEmptyFile(self):
+    record_defaults = [[0]] * 2
+    inputs = [[]]
+    self._test_dataset(
+        inputs,
+        expected_err_re="Can't read header of empty file",
+        record_defaults=record_defaults,
+        header=True,
+    )
+
+  def testCsvDataset_withEmptyFile(self):
+    record_defaults = [['']] * 2
+    inputs = [['']]  # Empty file
+    self._test_dataset(
+        inputs, expected_output=[], record_defaults=record_defaults)
+
+  def testCsvDataset_errorWithEmptyRecord(self):
+    record_defaults = [['']] * 2
+    inputs = [['', '1,2']]  # First record is empty
+    self._test_dataset(
+        inputs,
+        expected_err_re='Expect 2 fields but have 0 in record',
+        record_defaults=record_defaults)
+
+  def testCsvDataset_withChainedOps(self):
+    # Testing that one dataset can create multiple iterators fine.
+    # `repeat` creates multiple iterators from the same C++ Dataset.
+    record_defaults = [[0]] * 4
+    inputs = [['1,,3,4', '5,6,,8']]
+    ds_actual, ds_expected = self._make_test_datasets(
+        inputs, record_defaults=record_defaults)
+    with ops.Graph().as_default() as g:
+      self._assert_datasets_equal(g,
+                                  ds_actual.repeat(5).prefetch(1),
+                                  ds_expected.repeat(5).prefetch(1))
+
+  def testCsvDataset_withTypeDefaults(self):
+    # Testing using dtypes as record_defaults for required fields
+    record_defaults = [dtypes.float32, dtypes.float32]
+    inputs = [['1.0,2.0', '3.0,4.0']]
+    self._test_dataset(
+        inputs,
+        [[1.0, 2.0], [3.0, 4.0]],
+        record_defaults=record_defaults,
+    )
+
+
+class CsvDatasetBenchmark(test.Benchmark):
+  """Benchmarks for the various ways of creating a dataset from CSV files.
+  """
+
+  def _setUp(self):
+    # Since this isn't test.TestCase, have to manually create a test dir
+    gfile.MakeDirs(googletest.GetTempDir())
+    self._temp_dir = tempfile.mkdtemp(dir=googletest.GetTempDir())
+
+    self._num_cols = [4, 64, 256]
+    self._batch_size = 500
+    self._filenames = []
+    for n in self._num_cols:
+      fn = os.path.join(self._temp_dir, 'file%d.csv' % n)
+      with open(fn, 'w') as f:
+        # Just write 10 rows and use `repeat`...
+        row = ','.join(['1.23456E12' for _ in range(n)])
+        f.write('\n'.join([row for _ in range(10)]))
+      self._filenames.append(fn)
+
+  def _tearDown(self):
+    gfile.DeleteRecursively(self._temp_dir)
+
+  def _runBenchmark(self, dataset, num_cols, prefix):
+    next_element = dataset.make_one_shot_iterator().get_next()
+    with session.Session() as sess:
+      for _ in range(5):
+        sess.run(next_element)
+      deltas = []
+      for _ in range(10):
+        start = time.time()
+        sess.run(next_element)
+        end = time.time()
+        deltas.append(end - start)
+    median_wall_time = np.median(deltas) / 100
+    print('%s num_cols: %d Median wall time: %f' % (prefix, num_cols,
+                                                    median_wall_time))
+    self.report_benchmark(
+        iters=self._batch_size,
+        wall_time=median_wall_time,
+        name='%s_with_cols_%d' % (prefix, num_cols))
+
+  def benchmarkBatchThenMap(self):
+    self._setUp()
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [[0.0]] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = dataset.map(lambda l: gen_parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
+      dataset = dataset.batch(self._batch_size)
+      self._runBenchmark(dataset, num_cols, 'csv_map_then_batch')
+    self._tearDown()
+
+  def benchmarkCsvDataset(self):
+    self._setUp()
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [[0.0]] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat()  # pylint: disable=cell-var-from-loop
+      dataset = dataset.batch(self._batch_size)
+      self._runBenchmark(dataset, num_cols, 'csv_fused_dataset')
+    self._tearDown()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 1075302bae96ca2e0111efbacdf5e919ea76897d..e0237198b7d47eb98eeffe88d28bf9681b2722c6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -36,6 +36,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
@@ -256,6 +257,29 @@ class TFRecordDatasetSerializationTest(
         lambda: self._build_iterator_graph(num_epochs * 2), num_outputs)
 
 
+def _interleave(iterators, cycle_length):
+  pending_iterators = iterators
+  open_iterators = []
+  num_open = 0
+  for i in range(cycle_length):
+    if pending_iterators:
+      open_iterators.append(pending_iterators.pop(0))
+      num_open += 1
+
+  while num_open:
+    for i in range(min(cycle_length, len(open_iterators))):
+      if open_iterators[i] is None:
+        continue
+      try:
+        yield next(open_iterators[i])
+      except StopIteration:
+        if pending_iterators:
+          open_iterators[i] = pending_iterators.pop(0)
+        else:
+          open_iterators[i] = None
+          num_open -= 1
+
+
 class ReadBatchFeaturesTest(test.TestCase):
 
   def setUp(self):
@@ -355,8 +379,8 @@ class ReadBatchFeaturesTest(test.TestCase):
           yield j, i
 
     def _next_record_interleaved(file_indices, cycle_length):
-      return self._interleave([_next_record([i]) for i in file_indices],
-                              cycle_length)
+      return _interleave([_next_record([i]) for i in file_indices],
+                         cycle_length)
 
     file_batch = []
     keywords_batch_indices = []
@@ -397,28 +421,6 @@ class ReadBatchFeaturesTest(test.TestCase):
           [len(file_batch), keywords_batch_max_len], record_batch
       ]
 
-  def _interleave(self, iterators, cycle_length):
-    pending_iterators = iterators
-    open_iterators = []
-    num_open = 0
-    for i in range(cycle_length):
-      if pending_iterators:
-        open_iterators.append(pending_iterators.pop(0))
-        num_open += 1
-
-    while num_open:
-      for i in range(min(cycle_length, len(open_iterators))):
-        if open_iterators[i] is None:
-          continue
-        try:
-          yield next(open_iterators[i])
-        except StopIteration:
-          if pending_iterators:
-            open_iterators[i] = pending_iterators.pop(0)
-          else:
-            open_iterators[i] = None
-            num_open -= 1
-
   def _verify_records(self,
                       sess,
                       batch_size,
@@ -620,14 +622,12 @@ class MakeCsvDatasetTest(test.TestCase):
     f.close()
     return fn
 
-  def _create_file(self, fileno, header=True, comment=True):
+  def _create_file(self, fileno, header=True):
     rows = []
     if header:
       rows.append(self.COLUMNS)
     for recno in range(self._num_records):
       rows.append(self._csv_values(fileno, recno))
-      if comment:
-        rows.append("# Some comment goes here. Ignore me.")
     return self._write_file("csv_file%d.csv" % fileno, rows)
 
   def _create_files(self):
@@ -648,9 +648,7 @@ class MakeCsvDatasetTest(test.TestCase):
       shuffle=False,
       shuffle_seed=None,
       header=True,
-      comment="#",
       na_value="",
-      default_float_type=dtypes.float32,
   ):
     return readers.make_csv_dataset(
         filenames,
@@ -662,9 +660,7 @@ class MakeCsvDatasetTest(test.TestCase):
         shuffle=shuffle,
         shuffle_seed=shuffle_seed,
         header=header,
-        comment=comment,
         na_value=na_value,
-        default_float_type=default_float_type,
         select_columns=select_cols,
     )
 
@@ -786,29 +782,6 @@ class MakeCsvDatasetTest(test.TestCase):
             num_epochs=10,
             label_name=None)
 
-  def testMakeCSVDataset_withNoComments(self):
-    """Tests that datasets can be created from CSV files with no header line.
-    """
-    defaults = self.DEFAULTS
-    file_without_header = self._create_file(
-        len(self._test_filenames), comment=False)
-    with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
-        dataset = self._make_csv_dataset(
-            file_without_header,
-            defaults,
-            batch_size=2,
-            num_epochs=10,
-            comment=None,
-        )
-        self._verify_records(
-            sess,
-            dataset,
-            [len(self._test_filenames)],
-            batch_size=2,
-            num_epochs=10,
-        )
-
   def testMakeCSVDataset_withNoHeader(self):
     """Tests that datasets can be created from CSV files with no header line.
     """
@@ -876,7 +849,7 @@ class MakeCsvDatasetTest(test.TestCase):
 
     In that case, we should infer the types from the first N records.
     """
-    # Test that it works with standard test files (with comments, header, etc)
+    # Test that it works with standard test files (with header, etc)
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         dataset = self._make_csv_dataset(
@@ -889,7 +862,9 @@ class MakeCsvDatasetTest(test.TestCase):
             num_epochs=10,
             defaults=[[], [], [], [], [""]])
 
-    # Test on a deliberately tricky file
+  def testMakeCSVDataset_withTypeInferenceTricky(self):
+    # Test on a deliberately tricky file (type changes as we read more rows, and
+    # there are null values)
     fn = os.path.join(self.get_temp_dir(), "file.csv")
     expected_dtypes = [
         dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float32,
@@ -914,20 +889,29 @@ class MakeCsvDatasetTest(test.TestCase):
             column_names=None,
             label_name=None,
             na_value="NAN",
-            default_float_type=dtypes.float32,
         )
         features = dataset.make_one_shot_iterator().get_next()
         # Check that types match
         for i in range(len(expected_dtypes)):
+          print(features["col%d" % i].dtype, expected_dtypes[i])
           assert features["col%d" % i].dtype == expected_dtypes[i]
         for i in range(len(rows)):
           assert sess.run(features) == dict(zip(col_names, expected[i]))
 
-    # With float64 as default type for floats
+  def testMakeCSVDataset_withTypeInferenceAllTypes(self):
+    # Test that we make the correct inference for all types with fallthrough
+    fn = os.path.join(self.get_temp_dir(), "file.csv")
     expected_dtypes = [
-        dtypes.int32, dtypes.int64, dtypes.float64, dtypes.float64,
+        dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64,
         dtypes.string, dtypes.string
     ]
+    col_names = ["col%d" % i for i in range(len(expected_dtypes))]
+    rows = [[1, 2**31 + 1, 1.0, 4e40, "abc", ""]]
+    expected = [[
+        1, 2**31 + 1, 1.0, 4e40, "abc".encode("utf-8"), "".encode("utf-8")
+    ]]
+    self._write_file("file.csv", [col_names] + rows)
+
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         dataset = self._make_csv_dataset(
@@ -936,7 +920,6 @@ class MakeCsvDatasetTest(test.TestCase):
             column_names=None,
             label_name=None,
             na_value="NAN",
-            default_float_type=dtypes.float64,
         )
         features = dataset.make_one_shot_iterator().get_next()
         # Check that types match
@@ -1086,5 +1069,189 @@ class MakeCsvDatasetTest(test.TestCase):
           self.assertFalse(all_equal)
 
 
+class MakeTFRecordDatasetTest(TFRecordDatasetTestBase):
+
+  def _next_expected_batch(self,
+                           file_indices,
+                           batch_size,
+                           num_epochs,
+                           cycle_length,
+                           drop_final_batch,
+                           use_parser_fn):
+
+    def _next_record(file_indices):
+      for j in file_indices:
+        for i in range(self._num_records):
+          yield j, i
+
+    def _next_record_interleaved(file_indices, cycle_length):
+      return _interleave([_next_record([i]) for i in file_indices],
+                         cycle_length)
+
+    record_batch = []
+    batch_index = 0
+    for _ in range(num_epochs):
+      if cycle_length == 1:
+        next_records = _next_record(file_indices)
+      else:
+        next_records = _next_record_interleaved(file_indices, cycle_length)
+      for f, r in next_records:
+        record = self._record(f, r)
+        if use_parser_fn:
+          record = record[1:]
+        record_batch.append(record)
+        batch_index += 1
+        if len(record_batch) == batch_size:
+          yield record_batch
+          record_batch = []
+          batch_index = 0
+    if record_batch and not drop_final_batch:
+      yield record_batch
+
+  def _verify_records(self,
+                      sess,
+                      outputs,
+                      batch_size,
+                      file_index,
+                      num_epochs,
+                      interleave_cycle_length,
+                      drop_final_batch,
+                      use_parser_fn):
+    if file_index is not None:
+      file_indices = [file_index]
+    else:
+      file_indices = range(self._num_files)
+
+    for expected_batch in self._next_expected_batch(
+        file_indices, batch_size, num_epochs, interleave_cycle_length,
+        drop_final_batch, use_parser_fn):
+      actual_batch = sess.run(outputs)
+      self.assertAllEqual(expected_batch, actual_batch)
+
+  def _read_test(self, batch_size, num_epochs, file_index=None,
+                 num_parallel_reads=1, drop_final_batch=False, parser_fn=False):
+    if file_index is None:
+      file_pattern = self.test_filenames
+    else:
+      file_pattern = self.test_filenames[file_index]
+
+    if parser_fn:
+      fn = lambda x: string_ops.substr(x, 1, 999)
+    else:
+      fn = None
+
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        outputs = readers.make_tf_record_dataset(
+            file_pattern=file_pattern,
+            num_epochs=num_epochs,
+            batch_size=batch_size,
+            parser_fn=fn,
+            num_parallel_reads=num_parallel_reads,
+            drop_final_batch=drop_final_batch,
+            shuffle=False).make_one_shot_iterator().get_next()
+        self._verify_records(
+            sess, outputs, batch_size, file_index, num_epochs=num_epochs,
+            interleave_cycle_length=num_parallel_reads,
+            drop_final_batch=drop_final_batch, use_parser_fn=parser_fn)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(outputs)
+
+  def testRead(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 3]:
+        # Basic test: read from file 0.
+        self._read_test(batch_size, num_epochs, 0)
+
+        # Basic test: read from file 1.
+        self._read_test(batch_size, num_epochs, 1)
+
+        # Basic test: read from both files.
+        self._read_test(batch_size, num_epochs)
+
+        # Basic test: read from both files, with parallel reads.
+        self._read_test(batch_size, num_epochs, num_parallel_reads=8)
+
+  def testDropFinalBatch(self):
+    for batch_size in [1, 2, 10]:
+      for num_epochs in [1, 3]:
+        # Read from file 0.
+        self._read_test(batch_size, num_epochs, 0, drop_final_batch=True)
+
+        # Read from both files.
+        self._read_test(batch_size, num_epochs, drop_final_batch=True)
+
+        # Read from both files, with parallel reads.
+        self._read_test(batch_size, num_epochs, num_parallel_reads=8,
+                        drop_final_batch=True)
+
+  def testParserFn(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 3]:
+        for drop_final_batch in [False, True]:
+          self._read_test(batch_size, num_epochs, parser_fn=True,
+                          drop_final_batch=drop_final_batch)
+          self._read_test(batch_size, num_epochs, num_parallel_reads=8,
+                          parser_fn=True, drop_final_batch=drop_final_batch)
+
+  def _shuffle_test(self, batch_size, num_epochs, num_parallel_reads=1,
+                    seed=None):
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = readers.make_tf_record_dataset(
+            file_pattern=self.test_filenames,
+            num_epochs=num_epochs,
+            batch_size=batch_size,
+            num_parallel_reads=num_parallel_reads,
+            shuffle=True,
+            shuffle_seed=seed)
+        iterator = dataset.make_initializable_iterator()
+        next_element = iterator.get_next()
+
+        sess.run(iterator.initializer)
+        first_batches = []
+        try:
+          while True:
+            first_batches.append(sess.run(next_element))
+        except errors.OutOfRangeError:
+          pass
+
+        sess.run(iterator.initializer)
+        second_batches = []
+        try:
+          while True:
+            second_batches.append(sess.run(next_element))
+        except errors.OutOfRangeError:
+          pass
+
+        self.assertEqual(len(first_batches), len(second_batches))
+        if seed is not None:
+          # if you set a seed, should get the same results
+          for i in range(len(first_batches)):
+            self.assertAllEqual(first_batches[i], second_batches[i])
+
+        expected = []
+        for f in range(self._num_files):
+          for r in range(self._num_records):
+            expected.extend([self._record(f, r)] * num_epochs)
+
+        for batches in (first_batches, second_batches):
+          actual = []
+          for b in batches:
+            actual.extend(b)
+          self.assertAllEqual(sorted(expected), sorted(actual))
+
+  def testShuffle(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 3]:
+        for num_parallel_reads in [1, 2]:
+          # Test that all expected elements are produced
+          self._shuffle_test(batch_size, num_epochs, num_parallel_reads)
+          # Test that elements are produced in a consistent order if
+          # you specify a seed.
+          self._shuffle_test(batch_size, num_epochs, num_parallel_reads,
+                             seed=21345)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py
index e26cef8ec522c7e69a0c19b2b30a969bbfc0ad78..4148addf2878c99f47ebe1454edf69ad7f38dfbc 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py
@@ -22,6 +22,7 @@ import os
 
 import sqlite3
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import readers
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -29,7 +30,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class SqlDatasetTest(test.TestCase):
+class SqlDatasetTestBase(test.TestCase):
 
   def _createSqlDataset(self, output_types, num_repeats=1):
     dataset = readers.SqlDataset(self.driver_name, self.data_source_name,
@@ -92,6 +93,9 @@ class SqlDatasetTest(test.TestCase):
     conn.commit()
     conn.close()
 
+
+class SqlDatasetTest(SqlDatasetTestBase):
+
   # Test that SqlDataset can read from a database table.
   def testReadResultSet(self):
     init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
@@ -652,5 +656,27 @@ class SqlDatasetTest(test.TestCase):
         sess.run(get_next)
 
 
+class SqlDatasetSerializationTest(
+    SqlDatasetTestBase,
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_dataset(self, num_repeats):
+    data_source_name = os.path.join(test.get_temp_dir(), "tftest.sqlite")
+    driver_name = array_ops.placeholder_with_default(
+        array_ops.constant("sqlite", dtypes.string), shape=[])
+    query = ("SELECT first_name, last_name, motto FROM students ORDER BY "
+             "first_name DESC")
+    output_types = (dtypes.string, dtypes.string, dtypes.string)
+    return readers.SqlDataset(driver_name, data_source_name, query,
+                              output_types).repeat(num_repeats)
+
+  def testSQLSaveable(self):
+    num_repeats = 4
+    num_outputs = num_repeats * 2
+    self.run_core_tests(lambda: self._build_dataset(num_repeats),
+                        lambda: self._build_dataset(num_repeats // 2),
+                        num_outputs)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 7a3e42cc72755c67b910db99c0238f6ba780a942..eceecfd1744d0ae28953a4504450653efa473569 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -45,6 +45,27 @@ py_library(
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
+
+py_test(
+    name = "iterator_ops_test",
+    size = "small",
+    srcs = ["iterator_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":iterator_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:model_fn",
     ],
 )
 
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index 42ec2b0b017973c60efb4c2e1c99a7a2292da58b..b9393de4e90ae2597045b29070934b94e18cfcbd 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -466,14 +466,14 @@ def assert_element_shape(expected_shapes):
 class _MapAndBatchDataset(dataset_ops.MapDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
-  def __init__(self, input_dataset, map_func, batch_size, num_parallel_batches,
+  def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls,
                drop_remainder):
     """See `Dataset.map()` for details."""
     super(_MapAndBatchDataset, self).__init__(input_dataset, map_func)
     self._batch_size_t = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
-    self._num_parallel_batches_t = ops.convert_to_tensor(
-        num_parallel_batches, dtype=dtypes.int64, name="num_parallel_batches")
+    self._num_parallel_calls_t = ops.convert_to_tensor(
+        num_parallel_calls, dtype=dtypes.int64, name="num_parallel_calls")
     self._drop_remainder_t = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
@@ -483,12 +483,12 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
     input_resource = self._input_dataset._as_variant_tensor()
-    return gen_dataset_ops.map_and_batch_dataset(
+    return gen_dataset_ops.map_and_batch_dataset_v2(
         input_resource,
         self._map_func.captured_inputs,
         f=self._map_func,
         batch_size=self._batch_size_t,
-        num_parallel_batches=self._num_parallel_batches_t,
+        num_parallel_calls=self._num_parallel_calls_t,
         drop_remainder=self._drop_remainder_t,
         output_types=nest.flatten(
             sparse.as_dense_types(self.output_types, self.output_classes)),
@@ -511,8 +511,9 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
 
 def map_and_batch(map_func,
                   batch_size,
-                  num_parallel_batches=1,
-                  drop_remainder=False):
+                  num_parallel_batches=None,
+                  drop_remainder=False,
+                  num_parallel_calls=None):
   """Fused implementation of `map` and `batch`.
 
   Maps `map_func` across `batch_size` consecutive elements of this dataset
@@ -528,21 +529,37 @@ def map_and_batch(map_func,
       nested structure of tensors.
     batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
       consecutive elements of this dataset to combine in a single batch.
-    num_parallel_batches: A `tf.int64` scalar `tf.Tensor`, representing the
-      number of batches to create in parallel. On one hand, higher values can
-      help mitigate the effect of stragglers. On the other hand, higher values
-      can increase contention if CPU is scarce.
-    drop_remainder: A `tf.bool` scalar `tf.Tensor`, representing whether the
-      last batch should be dropped in case its size is smaller than desired;
-      the default behavior is not to drop the smaller batch.
+    num_parallel_batches: (Optional.) A `tf.int64` scalar `tf.Tensor`,
+      representing the number of batches to create in parallel. On one hand,
+      higher values can help mitigate the effect of stragglers. On the other
+      hand, higher values can increase contention if CPU is scarce.
+    drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
+      whether the last batch should be dropped in case its size is smaller than
+      desired; the default behavior is not to drop the smaller batch.
+    num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
+        representing the number of elements to process in parallel. If not
+        specified, `batch_size * num_parallel_batches` elements will be
+        processed in parallel.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
     @{tf.data.Dataset.apply}.
+
+  Raises:
+    ValueError: If both `num_parallel_batches` and `num_parallel_calls` are
+      specified.
   """
 
+  if num_parallel_batches is None and num_parallel_calls is None:
+    num_parallel_calls = batch_size
+  elif num_parallel_batches is not None and num_parallel_calls is None:
+    num_parallel_calls = batch_size * num_parallel_batches
+  elif num_parallel_batches is not None and num_parallel_calls is not None:
+    raise ValueError("The `num_parallel_batches` and `num_parallel_calls` "
+                     "arguments are mutually exclusive.")
+
   def _apply_fn(dataset):
     return _MapAndBatchDataset(dataset, map_func, batch_size,
-                               num_parallel_batches, drop_remainder)
+                               num_parallel_calls, drop_remainder)
 
   return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/iterator_ops.py b/tensorflow/contrib/data/python/ops/iterator_ops.py
index d736029fb035e573b70e8b19570e4e8ceca3c005..0d71be66018eeebe60de9deff24ceb6854d209d9 100644
--- a/tensorflow/contrib/data/python/ops/iterator_ops.py
+++ b/tensorflow/contrib/data/python/ops/iterator_ops.py
@@ -16,10 +16,12 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.training import saver
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import session_run_hook
 
 
 def make_saveable_from_iterator(iterator):
@@ -60,14 +62,14 @@ def make_saveable_from_iterator(iterator):
   return _Saveable(iterator._iterator_resource)  # pylint: disable=protected-access
 
 
-class _Saveable(saver.BaseSaverBuilder.SaveableObject):
+class _Saveable(saver_lib.BaseSaverBuilder.SaveableObject):
   """SaveableObject for saving/restoring iterator state."""
 
   def __init__(self, iterator_resource):
     serialized_iterator = gen_dataset_ops.serialize_iterator(iterator_resource)
     specs = [
-        saver.BaseSaverBuilder.SaveSpec(serialized_iterator, "",
-                                        iterator_resource.name + "-state")
+        saver_lib.BaseSaverBuilder.SaveSpec(serialized_iterator, "",
+                                            iterator_resource.name + "-state")
     ]
     super(_Saveable, self).__init__(iterator_resource, specs,
                                     iterator_resource.name)
@@ -75,3 +77,182 @@ class _Saveable(saver.BaseSaverBuilder.SaveableObject):
   def restore(self, restored_tensors, unused_restored_shapes):
     with ops.colocate_with(self.op):
       return gen_dataset_ops.deserialize_iterator(self.op, restored_tensors[0])
+
+
+class CheckpointInputPipelineHook(session_run_hook.SessionRunHook):
+  """Checkpoints input pipeline state every N steps or seconds.
+
+  This hook saves the state of the iterators in the `Graph` so that when
+  training is resumed the input pipeline continues from where it left off.
+  This could potentially avoid overfitting in certain pipelines where the
+  number of training steps per eval are small compared to the dataset
+  size or if the training pipeline is pre-empted.
+
+  Differences from `CheckpointSaverHook`:
+  1. Saves only the input pipelines in the "iterators" collection and not the
+     global variables or other saveable objects.
+  2. Does not write the `GraphDef` and `MetaGraphDef` to the summary.
+
+  Example of checkpointing the training pipeline:
+
+  ```python
+  est = tf.estimator.Estimator(model_fn)
+  while True:
+    est.train(
+        train_input_fn,
+        hooks=[tf.contrib.data.CheckpointInputPipelineHook(est)],
+        steps=train_steps_per_eval)
+    # Note: We do not pass the hook here.
+    metrics = est.evaluate(eval_input_fn)
+    if should_stop_the_training(metrics):
+      break
+  ```
+
+  This hook should be used if the input pipeline state needs to be saved
+  separate from the model checkpoint. Doing so may be useful for a few reasons:
+  1. The input pipeline checkpoint may be large, if there are large shuffle
+     or prefetch buffers for instance, and may bloat the checkpoint size.
+  2. If the input pipeline is shared between training and validation, restoring
+     the checkpoint during validation may override the validation input
+     pipeline.
+
+  For saving the input pipeline checkpoint alongside the model weights use
+  @{tf.contrib.data.make_saveable_from_iterator} directly to create a
+  `SaveableObject` and add to the `SAVEABLE_OBJECTS` collection. Note, however,
+  that you will need to be careful not to restore the training iterator during
+  eval. You can do that by not adding the iterator to the SAVEABLE_OBJECTS
+  collector when building the eval graph.
+  """
+
+  def __init__(self, estimator):
+    """Initializes a `CheckpointInputPipelineHook`.
+
+    Args:
+      estimator: Estimator.
+
+    Raises:
+      ValueError: One of `save_steps` or `save_secs` should be set.
+      ValueError: At most one of saver or scaffold should be set.
+    """
+    # `checkpoint_basename` is "input.ckpt" for non-distributed pipelines or
+    # of the form "input_<task_type>_<task_id>.ckpt" for distributed pipelines.
+    # Note: The default `checkpoint_basename` used by `CheckpointSaverHook` is
+    # "model.ckpt". We intentionally choose the input pipeline checkpoint prefix
+    # to be different to avoid conflicts with the model checkpoint.
+
+    # pylint: disable=protected-access
+    checkpoint_prefix = "input"
+    if estimator._config.num_worker_replicas > 1:
+      # Distributed setting.
+      suffix = "_{}_{}".format(estimator._config.task_type,
+                               estimator._config.task_id)
+      checkpoint_prefix += suffix
+    # pylint: enable=protected-access
+
+    # We use a composition paradigm instead of inheriting from
+    # `CheckpointSaverHook` because `Estimator` does an `isinstance` check
+    # to check whether a `CheckpointSaverHook` is already present in the list
+    # of hooks and if not, adds one. Inheriting from `CheckpointSaverHook`
+    # would thwart this behavior. This hook checkpoints *only the iterators*
+    # and not the graph variables.
+    self._checkpoint_saver_hook = basic_session_run_hooks.CheckpointSaverHook(
+        estimator.model_dir,
+        save_secs=estimator._config.save_checkpoints_secs,  # pylint: disable=protected-access
+        save_steps=estimator._config.save_checkpoints_steps,  # pylint: disable=protected-access
+        checkpoint_basename=checkpoint_prefix + ".ckpt")
+
+    # Name for the protocol buffer file that will contain the list of most
+    # recent checkpoints stored as a `CheckpointState` protocol buffer.
+    # This file, kept in the same directory as the checkpoint files, is
+    # automatically managed by the `Saver` to keep track of recent checkpoints.
+    # The default name used by the `Saver` for this file is "checkpoint". Here
+    # we use the name "checkpoint_<checkpoint_prefix>" so that in case the
+    # `checkpoint_dir` is the same as the model checkpoint directory, there are
+    # no conflicts during restore.
+    self._latest_filename = "checkpoint_" + checkpoint_prefix
+    self._first_run = True
+
+  def begin(self):
+    # Build a Saver that saves all iterators in the `GLOBAL_ITERATORS`
+    # collection if no `Saver` or `Scaffold` is provided.
+    # pylint: disable=protected-access
+    if (self._checkpoint_saver_hook._saver is None and
+        self._checkpoint_saver_hook._scaffold is None):
+      iterators = ops.get_collection(iterator_ops.GLOBAL_ITERATORS)
+      saveables = [_Saveable(i) for i in iterators]
+      self._checkpoint_saver_hook._saver = _CustomSaver(saveables,
+                                                        self._latest_filename)
+    # pylint: enable=protected-access
+    self._checkpoint_saver_hook.begin()
+
+  def _restore_or_save_initial_ckpt(self, session):
+    # Ideally this should be run in after_create_session but is not for the
+    # following reason:
+    # Currently there is no way of enforcing an order of running the
+    # `SessionRunHooks`. Hence it is possible that the `_DatasetInitializerHook`
+    # is run *after* this hook. That is troublesome because
+    # 1. If a checkpoint exists and this hook restores it, the initializer hook
+    #    will override it.
+    # 2. If no checkpoint exists, this hook will try to save an initialized
+    #    iterator which will result in an exception.
+    #
+    # As a temporary fix we enter the following implicit contract between this
+    # hook and the _DatasetInitializerHook.
+    # 1. The _DatasetInitializerHook initializes the iterator in the call to
+    #    after_create_session.
+    # 2. This hook saves the iterator on the first call to `before_run()`, which
+    #    is guaranteed to happen after `after_create_session()` of all hooks
+    #    have been run.
+
+    # Check if there is an existing checkpoint. If so, restore from it.
+    # pylint: disable=protected-access
+    latest_checkpoint_path = saver_lib.latest_checkpoint(
+        self._checkpoint_saver_hook._checkpoint_dir,
+        latest_filename=self._latest_filename)
+    if latest_checkpoint_path:
+      self._checkpoint_saver_hook._get_saver().restore(session,
+                                                       latest_checkpoint_path)
+    else:
+      # The checkpoint saved here is the state at step "global_step".
+      # Note: We do not save the GraphDef or MetaGraphDef here.
+      global_step = session.run(self._checkpoint_saver_hook._global_step_tensor)
+      self._checkpoint_saver_hook._save(session, global_step)
+      self._checkpoint_saver_hook._timer.update_last_triggered_step(global_step)
+    # pylint: enable=protected-access
+
+  def before_run(self, run_context):
+    if self._first_run:
+      self._restore_or_save_initial_ckpt(run_context.session)
+      self._first_run = False
+    return self._checkpoint_saver_hook.before_run(run_context)
+
+  def after_run(self, run_context, run_values):
+    self._checkpoint_saver_hook.after_run(run_context, run_values)
+
+  def end(self, session):
+    self._checkpoint_saver_hook.end(session)
+
+
+class _CustomSaver(saver_lib.Saver):
+  """`Saver` with a different default `latest_filename`.
+
+  This is used in the `CheckpointInputPipelineHook` to avoid conflicts with
+  the model ckpt saved by the `CheckpointSaverHook`.
+  """
+
+  def __init__(self, var_list, latest_filename):
+    super(_CustomSaver, self).__init__(var_list)
+    self._latest_filename = latest_filename
+
+  def save(self,
+           sess,
+           save_path,
+           global_step=None,
+           latest_filename=None,
+           meta_graph_suffix="meta",
+           write_meta_graph=True,
+           write_state=True,
+           strip_default_attrs=False):
+    return super(_CustomSaver, self).save(
+        sess, save_path, global_step, latest_filename or self._latest_filename,
+        meta_graph_suffix, write_meta_graph, write_state, strip_default_attrs)
diff --git a/tensorflow/contrib/data/python/ops/iterator_ops_test.py b/tensorflow/contrib/data/python/ops/iterator_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..30a993b1f7056b9726f524b2279131339c80c5eb
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/iterator_ops_test.py
@@ -0,0 +1,123 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for experimental iterator_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.ops import iterator_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import training_util
+
+
+class CheckpointInputPipelineHookTest(test.TestCase):
+
+  @staticmethod
+  def _model_fn(features, labels, mode, config):
+    del labels
+    del mode
+    del config
+    global_step = training_util.get_or_create_global_step()
+    update_global_step_op = global_step.assign_add(1)
+    latest_feature = variables.Variable(
+        0, name='latest_feature', dtype=dtypes.int64)
+    store_latest_feature_op = latest_feature.assign(features)
+    ops.add_to_collection('my_vars', global_step)
+    ops.add_to_collection('my_vars', latest_feature)
+    return model_fn.EstimatorSpec(
+        mode='train',
+        train_op=control_flow_ops.group(
+            [update_global_step_op, store_latest_feature_op]),
+        loss=constant_op.constant(2.0))
+
+  def _read_vars(self, model_dir):
+    """Returns (global_step, latest_feature)."""
+    with ops.Graph().as_default() as g:
+      ckpt_path = saver_lib.latest_checkpoint(model_dir)
+      meta_filename = ckpt_path + '.meta'
+      saver_lib.import_meta_graph(meta_filename)
+      saver = saver_lib.Saver()
+      with self.test_session(graph=g) as sess:
+        saver.restore(sess, ckpt_path)
+        return sess.run(ops.get_collection('my_vars'))
+
+  def _build_iterator_saver_hook(self, est):
+    return iterator_ops.CheckpointInputPipelineHook(est)
+
+  def testReturnDatasetFromInputFn(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.range(10)
+
+    est = estimator.Estimator(model_fn=self._model_fn)
+
+    est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
+    self.assertSequenceEqual(self._read_vars(est.model_dir), (2, 1))
+    est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
+    self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
+
+  def testBuildIteratorInInputFn(self):
+
+    def _input_fn():
+      ds = dataset_ops.Dataset.range(10)
+      iterator = ds.make_one_shot_iterator()
+      return iterator.get_next()
+
+    est = estimator.Estimator(model_fn=self._model_fn)
+
+    est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
+    self.assertSequenceEqual(self._read_vars(est.model_dir), (2, 1))
+    est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
+    self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
+
+  def testDoNotRestore(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.range(10)
+
+    est = estimator.Estimator(model_fn=self._model_fn)
+
+    est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
+    self.assertSequenceEqual(self._read_vars(est.model_dir), (2, 1))
+    est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
+    self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
+    # Hook not provided, input pipeline was not restored.
+    est.train(_input_fn, steps=2)
+    self.assertSequenceEqual(self._read_vars(est.model_dir), (6, 1))
+
+  def testRaiseErrorIfNoIterator(self):
+
+    def _input_fn():
+      return constant_op.constant(1, dtype=dtypes.int64)
+
+    est = estimator.Estimator(model_fn=self._model_fn)
+
+    with self.assertRaises(ValueError):
+      est.train(
+          _input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index bbb808fbd7730002e48cab47fa8d0fe09e2124d2..75c31a944a09462f534f6ae3e3204c812ecf28d9 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -18,15 +18,16 @@ from __future__ import division
 from __future__ import print_function
 
 import csv
-from math import ceil
 
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import batching
+from tensorflow.contrib.data.python.ops import gen_dataset_ops as contrib_gen_dataset_ops
 from tensorflow.contrib.data.python.ops import interleave_ops
 from tensorflow.contrib.data.python.ops import shuffle_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
+from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -34,9 +35,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.util import deprecation
 
@@ -68,7 +67,7 @@ def _is_valid_float(str_val, float_dtype):
     return False
 
 
-def _infer_type(str_val, na_value, prev_type, float_dtype):
+def _infer_type(str_val, na_value, prev_type):
   """Given a string, infers its tensor type.
 
   Infers the type of a value by picking the least 'permissive' type possible,
@@ -79,29 +78,34 @@ def _infer_type(str_val, na_value, prev_type, float_dtype):
     na_value: Additional string to recognize as a NA/NaN CSV value.
     prev_type: Type previously inferred based on values of this column that
       we've seen up till now.
-    float_dtype: Either `tf.float32` or `tf.float64`. Denotes what float type
-      to parse float strings as.
   Returns:
     Inferred dtype.
   """
   if str_val in ("", na_value):
+    # If the field is null, it gives no extra information about its type
     return prev_type
 
-  if _is_valid_int32(str_val) and prev_type in (None, dtypes.int32):
-    return dtypes.int32
+  type_list = [
+      dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64, dtypes.string
+  ]  # list of types to try, ordered from least permissive to most
 
-  if _is_valid_int64(str_val) and prev_type in (None, dtypes.int32,
-                                                dtypes.int64):
-    return dtypes.int64
+  type_functions = [
+      _is_valid_int32,
+      _is_valid_int64,
+      lambda str_val: _is_valid_float(str_val, dtypes.float32),
+      lambda str_val: _is_valid_float(str_val, dtypes.float64),
+      lambda str_val: True,
+  ]  # Corresponding list of validation functions
 
-  if _is_valid_float(str_val, float_dtype) and prev_type != dtypes.string:
-    return float_dtype
+  for i in range(len(type_list)):
+    validation_fn = type_functions[i]
+    if validation_fn(str_val) and (prev_type is None or
+                                   prev_type in type_list[:i + 1]):
+      return type_list[i]
 
-  return dtypes.string
 
-
-def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
-                  comment):
+def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header):
+  """Generator that yields rows of CSV file(s) in order."""
   for fn in filenames:
     with file_io.FileIO(fn, "r") as f:
       rdr = csv.reader(
@@ -112,9 +116,6 @@ def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
         next(rdr)  # Skip header lines
 
       for csv_row in rdr:
-        if comment is not None and csv_row[0].startswith(comment):
-          continue  # Skip comment lines
-
         if len(csv_row) != num_cols:
           raise ValueError(
               "Problem inferring types: CSV row has different number of fields "
@@ -123,22 +124,21 @@ def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
 
 
 def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim,
-                           na_value, header, comment, float_dtype,
-                           num_rows_for_inference, select_columns):
+                           na_value, header, num_rows_for_inference,
+                           select_columns):
   """Infers column types from the first N valid CSV records of files."""
   if select_columns is None:
     select_columns = range(num_cols)
   inferred_types = [None] * len(select_columns)
 
   for i, csv_row in enumerate(
-      _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
-                    comment)):
+      _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header)):
     if num_rows_for_inference is not None and i >= num_rows_for_inference:
       break
 
     for j, col_index in enumerate(select_columns):
       inferred_types[j] = _infer_type(csv_row[col_index], na_value,
-                                      inferred_types[j], float_dtype)
+                                      inferred_types[j])
 
   # Replace None's with a default type
   inferred_types = [t or dtypes.string for t in inferred_types]
@@ -198,6 +198,112 @@ def _get_sorted_col_indices(select_columns, column_names):
   return result
 
 
+def _maybe_shuffle_and_repeat(
+    dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed):
+  """Optionally shuffle and repeat dataset, as requested."""
+  if num_epochs != 1 and shuffle:
+    # Use shuffle_and_repeat for perf
+    return dataset.apply(
+        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
+                                       shuffle_seed))
+  elif shuffle:
+    return dataset.shuffle(shuffle_buffer_size, shuffle_seed)
+  elif num_epochs != 1:
+    return dataset.repeat(num_epochs)
+  return dataset
+
+
+def make_tf_record_dataset(
+    file_pattern,
+    batch_size,
+    parser_fn=None,
+    num_epochs=None,
+    shuffle=True,
+    shuffle_buffer_size=None,
+    shuffle_seed=None,
+    prefetch_buffer_size=None,
+    num_parallel_reads=None,
+    num_parallel_parser_calls=None,
+    drop_final_batch=False):
+  """Reads and optionally parses TFRecord files into a dataset.
+
+  Provides common functionality such as batching, optional parsing, shuffling,
+  and performant defaults.
+
+  Args:
+    file_pattern: List of files or patterns of TFRecord file paths.
+      See @{tf.gfile.Glob} for pattern rules.
+    batch_size: An int representing the number of records to combine
+      in a single batch.
+    parser_fn: (Optional.) A function accepting string input to parse
+      and process the record contents. This function must map records
+      to components of a fixed shape, so they may be batched. By
+      default, uses the record contents unmodified.
+    num_epochs: (Optional.) An int specifying the number of times this
+      dataset is repeated.  If None (the default), cycles through the
+      dataset forever.
+    shuffle: (Optional.) A bool that indicates whether the input
+      should be shuffled. Defaults to `True`.
+    shuffle_buffer_size: (Optional.) Buffer size to use for
+      shuffling. A large buffer size ensures better shuffling, but
+      increases memory usage and startup time.
+    shuffle_seed: (Optional.) Randomization seed to use for shuffling.
+    prefetch_buffer_size: (Optional.) An int specifying the number of
+      feature batches to prefetch for performance improvement.
+      Defaults to auto-tune. Set to 0 to disable prefetching.
+    num_parallel_reads: (Optional.) Number of threads used to read
+      records from files. By default or if set to a value >1, the
+      results will be interleaved.
+    num_parallel_parser_calls: (Optional.) Number of parallel
+      records to parse in parallel. Defaults to an automatic selection.
+    drop_final_batch: (Optional.) Whether the last batch should be
+      dropped in case its size is smaller than `batch_size`; the
+      default behavior is not to drop the smaller batch.
+
+  Returns:
+    A dataset, where each element matches the output of `parser_fn`
+    except it will have an additional leading `batch-size` dimension,
+    or a `batch_size`-length 1-D tensor of strings if `parser_fn` is
+    unspecified.
+  """
+  files = dataset_ops.Dataset.list_files(
+      file_pattern, shuffle=shuffle, seed=shuffle_seed)
+
+  if num_parallel_reads is None:
+    # Note: We considered auto-tuning this value, but there is a concern
+    # that this affects the mixing of records from different files, which
+    # could affect training convergence/accuracy, so we are defaulting to
+    # a constant for now.
+    num_parallel_reads = 24
+  dataset = core_readers.TFRecordDataset(
+      files, num_parallel_reads=num_parallel_reads)
+
+  if shuffle_buffer_size is None:
+    # TODO(josh11b): Auto-tune this value when not specified
+    shuffle_buffer_size = 10000
+  dataset = _maybe_shuffle_and_repeat(
+      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)
+
+  if parser_fn is None:
+    if drop_final_batch:
+      dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
+    else:
+      dataset = dataset.batch(batch_size)
+  else:
+    # TODO(josh11b): if num_parallel_parser_calls is None, use some function
+    # of num cores instead of map_and_batch's default behavior of one batch.
+    dataset = dataset.apply(batching.map_and_batch(
+        parser_fn, batch_size, num_parallel_calls=num_parallel_parser_calls,
+        drop_remainder=drop_final_batch))
+
+  if prefetch_buffer_size is None:
+    prefetch_buffer_size = -1  # tf.config.data.AUTOTUNE
+  if prefetch_buffer_size == 0:
+    return dataset
+  else:
+    return dataset.prefetch(buffer_size=prefetch_buffer_size)
+
+
 def make_csv_dataset(
     file_pattern,
     batch_size,
@@ -209,7 +315,6 @@ def make_csv_dataset(
     use_quote_delim=True,
     na_value="",
     header=True,
-    comment=None,
     num_epochs=None,
     shuffle=True,
     shuffle_buffer_size=10000,
@@ -218,7 +323,6 @@ def make_csv_dataset(
     num_parallel_reads=1,
     num_parallel_parser_calls=2,
     sloppy=False,
-    default_float_type=dtypes.float32,
     num_rows_for_inference=100,
 ):
   """Reads CSV files into a dataset.
@@ -231,8 +335,8 @@ def make_csv_dataset(
   Args:
     file_pattern: List of files or patterns of file paths containing CSV
       records. See @{tf.gfile.Glob} for pattern rules.
-    batch_size: An int representing the number of consecutive elements of this
-      dataset to combine in a single batch.
+    batch_size: An int representing the number of records to combine
+      in a single batch.
     column_names: An optional list of strings that corresponds to the CSV
       columns, in order. One per column of the input record. If this is not
       provided, infers the column names from the first row of the records.
@@ -272,15 +376,11 @@ def make_csv_dataset(
     header: A bool that indicates whether the first rows of provided CSV files
       correspond to header lines with column names, and should not be included
       in the data.
-    comment: An optional character string that marks lines that should not be
-      parsed as csv records. If this is provided, all lines that start with
-      this character will not be parsed.
     num_epochs: An int specifying the number of times this dataset is repeated.
       If None, cycles through the dataset forever.
     shuffle: A bool that indicates whether the input should be shuffled.
     shuffle_buffer_size: Buffer size to use for shuffling. A large buffer size
-      ensures better shuffling, but would increase memory usage and startup
-      time.
+      ensures better shuffling, but increases memory usage and startup time.
     shuffle_seed: Randomization seed to use for shuffling.
     prefetch_buffer_size: An int specifying the number of feature batches to
       prefetch for performance improvement. Recommended value is the number of
@@ -294,8 +394,6 @@ def make_csv_dataset(
       produced is deterministic prior to shuffling (elements are still
       randomized if `shuffle=True`. Note that if the seed is set, then order
       of elements after shuffling is deterministic). Defaults to `False`.
-    default_float_type: Either `tf.float32` or `tf.float64`. If defaults are
-      not provided, float-like strings are interpreted to be this type.
     num_rows_for_inference: Number of rows of a file to use for type inference
       if record_defaults is not provided. If None, reads all the rows of all
       the files. Defaults to 100.
@@ -317,8 +415,6 @@ def make_csv_dataset(
     dataset = dataset.shuffle(len(filenames), shuffle_seed)
 
   # Clean arguments; figure out column names and defaults
-  if comment is not None and len(comment) != 1:
-    raise ValueError("`comment` arg must be a single-character string or None")
 
   if column_names is None:
     if not header:
@@ -341,8 +437,7 @@ def make_csv_dataset(
     # construction time
     column_defaults = _infer_column_defaults(
         filenames, len(column_names), field_delim, use_quote_delim, na_value,
-        header, comment, default_float_type, num_rows_for_inference,
-        select_columns)
+        header, num_rows_for_inference, select_columns)
 
   if select_columns is not None and len(column_defaults) != len(select_columns):
     raise ValueError(
@@ -356,71 +451,189 @@ def make_csv_dataset(
   if label_name is not None and label_name not in column_names:
     raise ValueError("`label_name` provided must be one of the columns.")
 
-  # Define map and filter functions
-  def filter_fn(line):
-    return math_ops.not_equal(string_ops.substr(line, 0, 1), comment)
-
   def filename_to_dataset(filename):
-    ds = core_readers.TextLineDataset(filename)
-    if header:
-      ds = ds.skip(1)
-    if comment is not None:
-      ds = ds.filter(filter_fn)
-    return ds
+    return CsvDataset(
+        filename,
+        record_defaults=column_defaults,
+        field_delim=field_delim,
+        use_quote_delim=use_quote_delim,
+        na_value=na_value,
+        select_cols=select_columns,
+        header=header)
 
-  def decode_csv(line):
-    """Decodes CSV line into features.
+  def map_fn(*columns):
+    """Organizes columns into a features dictionary.
 
     Args:
-      line: String tensor corresponding to one csv record.
+      *columns: list of `Tensor`s corresponding to one csv record.
     Returns:
       A dictionary of feature names to values for that particular record. If
       label_name is provided, extracts the label feature to be returned as the
       second element of the tuple.
     """
-    columns = parsing_ops.decode_csv(
-        line,
-        column_defaults,
-        field_delim=field_delim,
-        use_quote_delim=use_quote_delim,
-        na_value=na_value,
-        select_cols=select_columns,
-    )
     features = dict(zip(column_names, columns))
     if label_name is not None:
       label = features.pop(label_name)
       return features, label
     return features
 
-  # Read files sequentially or in parallel
+  # Read files sequentially (if num_parallel_reads=1) or in parallel
   dataset = dataset.apply(
       interleave_ops.parallel_interleave(
           filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy))
 
-  if num_epochs != 1 and shuffle:
-    # Use shuffle_and_repeat for perf
-    dataset = dataset.apply(
-        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
-                                       shuffle_seed))
-  elif shuffle:
-    dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
-  elif num_epochs != 1:
-    dataset = dataset.repeat(num_epochs)
-
-  # Use map_and_batch for perf
-  # TODO(b/76425672): use num_parallel_calls for better performance tuning when
-  # that is added
-  dataset = dataset.apply(
-      batching.map_and_batch(
-          map_func=decode_csv,
-          batch_size=batch_size,
-          num_parallel_batches=int(
-              ceil(num_parallel_parser_calls / batch_size))))
+  dataset = _maybe_shuffle_and_repeat(
+      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)
 
+  # Apply batch before map for perf, because map has high overhead relative
+  # to the size of the computation in each map
+  dataset = dataset.batch(batch_size=batch_size)
+  dataset = dataset.map(map_fn, num_parallel_calls=num_parallel_parser_calls)
   dataset = dataset.prefetch(prefetch_buffer_size)
+
   return dataset
 
 
+_DEFAULT_READER_BUFFER_SIZE_BYTES = 4 * 1024 * 1024  # 4 MB
+
+
+class CsvDataset(dataset_ops.Dataset):
+  """A Dataset comprising lines from one or more CSV files."""
+
+  def __init__(self,
+               filenames,
+               record_defaults,
+               buffer_size=None,
+               header=False,
+               field_delim=",",
+               use_quote_delim=True,
+               na_value="",
+               select_cols=None):
+    """Creates a `CsvDataset` by reading and decoding CSV files.
+
+    The elements of this dataset correspond to records from the file(s).
+    RFC 4180 format is expected for CSV files
+    (https://tools.ietf.org/html/rfc4180)
+    Note that we allow leading and trailing spaces with int or float field.
+
+
+    For example, suppose we have a file 'my_file0.csv' with four CSV columns of
+    different data types:
+    ```
+    abcdefg,4.28E10,5.55E6,12
+    hijklmn,-5.3E14,,2
+    ```
+
+    We can construct a CsvDataset from it as follows:
+    ```python
+    dataset = tf.contrib.data.CsvDataset(
+      "my_file*.csv",
+      [tf.float32,  # Required field, use dtype or empty tensor
+       tf.constant([0.0], dtype=tf.float32),  # Optional field, default to 0.0
+       tf.int32,  # Required field, use dtype or empty tensor
+       ],
+      select_cols=[1,2,3]  # Only parse last three columns
+    )
+    ```
+
+    The expected output of its iterations is:
+    ```python
+    next = dataset.make_one_shot_iterator().get_next()
+    with tf.Session() as sess:
+      while True:
+        try:
+          print(sess.run(nxt))
+        except tf.errors.OutOfRangeError:
+          break
+
+    >> (4.28e10, 5.55e6, 12)
+    >> (-5.3e14, 0.0, 2)
+    ```
+
+    Args:
+      filenames: A `tf.string` tensor containing one or more filenames.
+      record_defaults: A list of default values for the CSV fields. Each item in
+        the list is either a valid CSV `DType` (float32, float64, int32, int64,
+        string), or a `Tensor` object with one of the above types. One per
+        column of CSV data, with either a scalar `Tensor` default value for the
+        column if it is optional, or `DType` or empty `Tensor` if required. If
+        both this and `select_columns` are specified, these must have the same
+        lengths, and `column_defaults` is assumed to be sorted in order of
+        increasing column index.
+      buffer_size: (Optional.) A `tf.int64` scalar denoting the number of bytes
+        to buffer while reading files. Defaults to 4MB.
+      header: (Optional.) A `tf.bool` scalar indicating whether the CSV file(s)
+        have header line(s) that should be skipped when parsing. Defaults to
+        `False`.
+      field_delim: (Optional.) A `tf.string` scalar containing the delimiter
+        character that separates fields in a record. Defaults to `","`.
+      use_quote_delim: (Optional.) A `tf.bool` scalar. If `False`, treats
+        double quotation marks as regular characters inside of string fields
+        (ignoring RFC 4180, Section 2, Bullet 5). Defaults to `True`.
+      na_value: (Optional.) A `tf.string` scalar indicating a value that will
+        be treated as NA/NaN.
+      select_cols: (Optional.) A sorted list of column indices to select from
+        the input data. If specified, only this subset of columns will be
+        parsed. Defaults to parsing all columns.
+    """
+    super(CsvDataset, self).__init__()
+    self._filenames = ops.convert_to_tensor(
+        filenames, dtype=dtypes.string, name="filenames")
+    record_defaults = [
+        constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
+        for x in record_defaults
+    ]
+    self._record_defaults = ops.convert_n_to_tensor(
+        record_defaults, name="record_defaults")
+    self._buffer_size = convert.optional_param_to_tensor(
+        "buffer_size", buffer_size, _DEFAULT_READER_BUFFER_SIZE_BYTES)
+    self._header = ops.convert_to_tensor(
+        header, dtype=dtypes.bool, name="header")
+    self._field_delim = ops.convert_to_tensor(
+        field_delim, dtype=dtypes.string, name="field_delim")
+    self._use_quote_delim = ops.convert_to_tensor(
+        use_quote_delim, dtype=dtypes.bool, name="use_quote_delim")
+    self._na_value = ops.convert_to_tensor(
+        na_value, dtype=dtypes.string, name="na_value")
+    self._select_cols = convert.optional_param_to_tensor(
+        "select_cols",
+        select_cols,
+        argument_default=[],
+        argument_dtype=dtypes.int64,
+    )
+    self._output_shapes = tuple(
+        tensor_shape.scalar() for _ in range(len(record_defaults)))
+    self._output_types = tuple(d.dtype for d in self._record_defaults)
+    self._output_classes = tuple(
+        ops.Tensor for _ in range(len(record_defaults)))
+
+  def _as_variant_tensor(self):
+    # Constructs graph node for the dataset op.
+    return contrib_gen_dataset_ops.csv_dataset(
+        filenames=self._filenames,
+        record_defaults=self._record_defaults,
+        buffer_size=self._buffer_size,
+        header=self._header,
+        output_shapes=self._output_shapes,
+        field_delim=self._field_delim,
+        use_quote_delim=self._use_quote_delim,
+        na_value=self._na_value,
+        select_cols=self._select_cols,
+    )
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_classes(self):
+    return self._output_classes
+
+
 def make_batched_features_dataset(file_pattern,
                                   batch_size,
                                   features,
@@ -480,8 +693,8 @@ def make_batched_features_dataset(file_pattern,
   Args:
     file_pattern: List of files or patterns of file paths containing
       `Example` records. See `tf.gfile.Glob` for pattern rules.
-    batch_size: An int representing the number of consecutive elements of this
-      dataset to combine in a single batch.
+    batch_size: An int representing the number of records to combine
+      in a single batch.
     features: A `dict` mapping feature keys to `FixedLenFeature` or
       `VarLenFeature` values. See `tf.parse_example`.
     reader: A function or class that can be
@@ -537,16 +750,8 @@ def make_batched_features_dataset(file_pattern,
     dataset = dataset.map(lambda _, v: v)
 
   # Apply dataset repeat and shuffle transformations.
-  repeat_dataset = (num_epochs != 1)
-  if repeat_dataset and shuffle:
-    # Used fused shuffle_and_repeat operation for better performance
-    dataset = dataset.apply(
-        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
-                                       shuffle_seed))
-  elif repeat_dataset:
-    dataset = dataset.repeat(num_epochs)
-  elif shuffle:
-    dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
+  dataset = _maybe_shuffle_and_repeat(
+      dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)
 
   if drop_final_batch:
     dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
@@ -620,8 +825,8 @@ def read_batch_features(file_pattern,
   Args:
     file_pattern: List of files or patterns of file paths containing
       `Example` records. See `tf.gfile.Glob` for pattern rules.
-    batch_size: An int representing the number of consecutive elements of this
-      dataset to combine in a single batch.
+    batch_size: An int representing the number of records to combine
+      in a single batch.
     features: A `dict` mapping feature keys to `FixedLenFeature` or
       `VarLenFeature` values. See `tf.parse_example`.
     reader: A function or class that can be
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 8dfcaf6032e1602ed76a8a995553c5d398c4a778..64a77bbed1d55c3d95329d9c7783c2b468bde745 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -26,7 +26,6 @@ py_library(
         "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/contrib/eager/python:datasets",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:checkpointable",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:device_util",
         "//tensorflow/python:distribute",
@@ -34,6 +33,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/training/checkpointable:base",
         "@six_archive//:six",
     ],
 )
@@ -151,6 +151,7 @@ py_library(
         ":one_device_strategy",
         ":tpu_strategy",
         "//tensorflow/contrib/optimizer_v2:training",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
@@ -469,24 +470,24 @@ py_library(
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "cross_tower_ops_test",
     srcs = ["cross_tower_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
-    deps = [
+    additional_deps = [
         ":combinations",
         ":cross_tower_ops",
         ":values",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
-        "@absl_py//absl/testing:parameterized",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
     ],
 )
 
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 946310aa6fc2101d75e86d3ff2e9f3284e6c6625..15935817b0283ebc04b95304afe41d8690a11442 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -51,6 +51,7 @@ from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.training import adam
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.util import tf_inspect
 
@@ -69,26 +70,31 @@ def generate(combinations):
    -- there should always be a "mode" argument.  Accepted values are "eager"
       and "graph".
    -- arguments of the test method must match by name to get the corresponding
-      value of the combination.  Tests must accept all arguments (except "mode",
-      which is optional).
-   -- distribution argument is special.  It is meant for passing instances of
-      DistributionStrategy.  Each instance is to be passed as `(<int>,
-      <DistributionStrategy>)` tuple, where <int> is the number of required
-      GPUs.  If the required number of GPUs for the DistributionStrategy isn't
-      available then the test case is going to be skipped.
+      value of the combination.  Tests must accept all arguments except the
+      "mode", "required_tpu" and "required_gpus".
+   -- "distribution" argument is special and optional.  It is meant for passing
+      instances of DistributionStrategy.  Each instance is to be passed as via
+      `NamedDistribution`.  If using "distribution", "required_gpus" and
+      "required_tpu" should be specified via the NamedDistribution instance,
+      rather than as separate arguments.
+   -- "required_tpu" argument is special and optional.  If not `None`, then the
+      test will be skipped if TPUs aren't available.
+   -- "required_gpus" argument is special and optional.  If not `None`, then the
+      test will be skipped if the specified number of GPUs aren't available.
 
   Returns:
     a decorator that will cause the test method to be run under the specified
     conditions.
 
   Raises:
-    ValueError - if "mode" argument wasn't either "eager" or "graph.
+    ValueError - if "mode" argument wasn't either "eager" or "graph".
   """
 
   def decorator(test_function):
     """The decorator to be returned."""
 
     # Generate good test names that can be used with --test_filter.
+    named_combinations = []
     for combination in combinations:
       # We use OrderedDicts in `combine()` and `times()` to ensure stable
       # order of keys in each dictionary.
@@ -99,30 +105,46 @@ def generate(combinations):
               "".join(filter(str.isalnum, str(value))))
           for key, value in combination.items()
       ])
-      combination.update({"testcase_name": "_test{}".format(name)})
+      named_combinations.append(
+          OrderedDict(
+              list(combination.items()) + [("testcase_name",
+                                            "_test{}".format(name))]))
 
-    @parameterized.named_parameters(*combinations)
+    @parameterized.named_parameters(*named_combinations)
     def decorated(self, **kwargs):
       """A wrapped test method that sets up `test_function`."""
       assert "mode" in kwargs
       mode = kwargs["mode"]
 
-      if "distribution" in kwargs:
-        distribution = kwargs["distribution"]
-        kwargs["distribution"] = distribution.strategy
-        if distribution.required_tpu and not TPU_TEST:
-          self.skipTest("Test requires a TPU, but it's not available.")
-        if not distribution.required_tpu and TPU_TEST:
-          self.skipTest("Test that doesn't require a TPU.")
-
-        if not distribution.required_gpus:
-          if GPU_TEST:
-            self.skipTest("Test that doesn't require GPUs.")
-        elif context.num_gpus() < distribution.required_gpus:
-          self.skipTest(
-              "{} GPUs are not available for this test. {} GPUs are available".
-              format(distribution.required_gpus, context.num_gpus()))
+      distribution = kwargs.pop("distribution", None)
+      required_tpu = kwargs.pop("required_tpu", False)
+      required_gpus = kwargs.pop("required_gpus", None)
 
+      if distribution:
+        assert required_gpus is None, (
+            "Do not use `required_gpus` and `distribution` together.")
+        assert required_tpu is False, (
+            "Do not use `required_tpu` and `distribution` together.")
+        kwargs["distribution"] = distribution.strategy
+        required_gpus = distribution.required_gpus
+        required_tpu = distribution.required_tpu
+
+      if required_tpu and not TPU_TEST:
+        self.skipTest("Test requires a TPU, but it's not available.")
+      if not required_tpu and TPU_TEST:
+        self.skipTest("Test that doesn't require a TPU.")
+
+      if not required_gpus:
+        if GPU_TEST:
+          self.skipTest("Test that doesn't require GPUs.")
+      elif context.num_gpus() < required_gpus:
+        self.skipTest(
+            "{} GPUs are not available for this test. {} GPUs are available".
+            format(required_gpus, context.num_gpus()))
+
+      # At this point, `kwargs` doesn't have `required_gpus` or `required_tpu`
+      # that the user might have specified.  `kwargs` still has `mode`, which
+      # the test is allowed to accept or ignore.
       requested_arguments = tf_inspect.getfullargspec(test_function).args
       missing_arguments = set(list(kwargs.keys()) + ["self"]).difference(
           set(requested_arguments + ["mode"]))
@@ -159,7 +181,8 @@ def combine(**kwargs):
   can be computed using `times()`.
 
   Args:
-    **kwargs: keyword arguments of form `option=[possibilities, ...]`.
+    **kwargs: keyword arguments of form `option=[possibilities, ...]`
+         or `option=the_only_possibility`.
 
   Returns:
     a list of dictionaries for each combination. Keys in the dictionaries are
@@ -178,6 +201,8 @@ def combine(**kwargs):
 
   key = first[0]
   values = first[1]
+  if not isinstance(values, list):
+    values = [values]
 
   return [
       OrderedDict(sorted(list(combined.items()) + [(key, v)], key=sort_by_key))
@@ -262,21 +287,31 @@ class NamedDistribution(object):
     return self._required_tpu
 
 
+default_strategy = NamedDistribution(
+    "Default",
+    distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
+    required_gpus=None)
 one_device_strategy = NamedDistribution(
     "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"),
-    None)
+    required_gpus=None)
+tpu_strategy_single_iteration = NamedDistribution(
+    "TPUSingleIteration",
+    tpu_strategy.TPUStrategy(iterations_per_step=1),
+    required_tpu=True)
 tpu_strategy = NamedDistribution(
     "TPU", tpu_strategy.TPUStrategy(), required_tpu=True)
+# Note that we disable prefetching for testing since prefetching makes
+# the input non-deterministic.
 mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
     "MirroredCPUAndGPU",
-    mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"]), 1)
-mirrored_strategy_without_prefetch = NamedDistribution(
-    "MirroredCPUAndGPUNoPrefetch",
     mirrored_strategy.MirroredStrategy(
-        ["/gpu:0", "/cpu:0"], prefetch_on_device=False), 1)
+        ["/gpu:0", "/cpu:0"], prefetch_on_device=False),
+    required_gpus=1)
 mirrored_strategy_with_two_gpus = NamedDistribution(
     "Mirrored2GPUs",
-    mirrored_strategy.MirroredStrategy(["/gpu:0", "/gpu:1"]), 2)
+    mirrored_strategy.MirroredStrategy(
+        ["/gpu:0", "/gpu:1"], prefetch_on_device=False),
+    required_gpus=2)
 
 adam_optimizer_v1_fn = NamedObject(
     "AdamV1", lambda: adam.AdamOptimizer(0.2, epsilon=1))
diff --git a/tensorflow/contrib/distribute/python/combinations_test.py b/tensorflow/contrib/distribute/python/combinations_test.py
index 219b24160f3902fcfa5363cc39a8fc5b30d00308..184bcf27e59d68b82d28e8f01890c04f214c017c 100644
--- a/tensorflow/contrib/distribute/python/combinations_test.py
+++ b/tensorflow/contrib/distribute/python/combinations_test.py
@@ -41,6 +41,15 @@ class TestingCombinationsTest(test.TestCase):
         "b": 3
     }], combinations.combine(a=[1, 2], b=[2, 3]))
 
+  def test_combine_single_parameter(self):
+    self.assertEqual([{
+        "a": 1,
+        "b": 2
+    }, {
+        "a": 2,
+        "b": 2
+    }], combinations.combine(a=[1, 2], b=2))
+
   def test_add(self):
     self.assertEqual(
         [{
diff --git a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
index b87224251ca3844fc81c6f32a893d2c71664a955..2b05884b9b93470ef9a764cbedbc91bd3912c611 100644
--- a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
+++ b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
@@ -12,11 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""An example tf.keras model that is trained using MirroredStrategy."""
+"""An example of training tf.keras Model using MirroredStrategy."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from sys import argv
+
+import sys
+
 import numpy as np
 import tensorflow as tf
 
@@ -33,30 +35,37 @@ def input_fn():
 
 def main(args):
   if len(args) < 2:
-    print('You must specify  model_dir for checkpoints such as'
-          ' /tmp/tfkeras_example./')
+    print('You must specify model_dir for checkpoints such as'
+          ' /tmp/tfkeras_example/.')
     return
 
-  print('Using %s to store checkpoints.' % args[1])
-
-  strategy = tf.contrib.distribute.MirroredStrategy(
-      ['/device:GPU:0', '/device:GPU:1'])
-  config = tf.estimator.RunConfig(train_distribute=strategy)
-  optimizer = tf.train.GradientDescentOptimizer(0.2)
+  model_dir = args[1]
+  print('Using %s to store checkpoints.' % model_dir)
 
+  # Define tf.keras Model.
   model = tf.keras.Sequential()
   model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(10,)))
   model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
 
+  # Compile tf.keras Model.
+  optimizer = tf.train.GradientDescentOptimizer(0.2)
   model.compile(loss='binary_crossentropy', optimizer=optimizer)
   model.summary()
   tf.keras.backend.set_learning_phase(True)
+
+  # Define a DistributionStrategy and convert the tf.keras Model to a
+  # tf.Estimator that utilizes the DistributionStrategy.
+  strategy = tf.contrib.distribute.MirroredStrategy(
+      ['/device:GPU:0', '/device:GPU:1'])
+  config = tf.estimator.RunConfig(train_distribute=strategy)
   keras_estimator = tf.keras.estimator.model_to_estimator(
-      keras_model=model, config=config, model_dir=args[1])
+      keras_model=model, config=config, model_dir=model_dir)
 
+  # Train and evaluate the tf.Estimator.
   keras_estimator.train(input_fn=input_fn, steps=10)
   eval_result = keras_estimator.evaluate(input_fn=input_fn)
   print('Eval result: {}'.format(eval_result))
 
+
 if __name__ == '__main__':
-  tf.app.run(argv=argv)
+  tf.app.run(argv=sys.argv)
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index e134fe34e10be402f028db986b8cbf14222db07f..5c056a7c73def2f1fb4bbe0df4d3f82fdabda3df 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -44,13 +44,16 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           combinations.distributions_and_v1_optimizers(),
           combinations.combine(mode=["graph"], use_callable_loss=[True, False])
           + combinations.combine(mode=["eager"], use_callable_loss=[True]),
-          combinations.combine(is_tpu=[False])) +
-      combinations.combine(
-          distribution=[combinations.tpu_strategy],
-          optimizer_fn=[combinations.adam_optimizer_v1_fn],
-          mode=["graph"],
-          use_callable_loss=[False],
-          is_tpu=[True]))
+          combinations.combine(is_tpu=[False])) + combinations.combine(
+              distribution=[combinations.tpu_strategy],
+              optimizer_fn=[
+                  combinations.adam_optimizer_v1_fn,
+                  # TODO(isaprykin):  Make Adam v2 work with while_loops
+                  # and TPUs.
+              ],
+              mode=["graph"],
+              use_callable_loss=[False],
+              is_tpu=[True]))
   def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
                        is_tpu):
     with distribution.scope():
@@ -101,7 +104,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           distribution=[combinations.tpu_strategy],
           optimizer_fn=[
               combinations.adam_optimizer_v1_fn,
-              combinations.gradient_descent_optimizer_v1_fn
+              combinations.gradient_descent_optimizer_v1_fn,
+              combinations.gradient_descent_optimizer_v2_fn,
           ],
           mode=["graph"],
           is_tpu=[True]))
@@ -171,13 +175,28 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           set(created_variables))
 
   @combinations.generate(
-      combinations.times(combinations.distributions_and_v1_optimizers(),
-                         combinations.combine(
-                             mode=["graph", "eager"],
-                             momentum=[0.8, 0.9, 0.99],
-                             renorm=[False, True])))
+      combinations.times(
+          combinations.combine(momentum=[0.8, 0.9, 0.99], renorm=[False, True]),
+          combinations.times(
+              combinations.distributions_and_v1_optimizers(),
+              combinations.combine(
+                  mode=["graph", "eager"],
+                  is_tpu=[False],
+                  # TODO(isaprykin):  Allow False here.  Currently subsequent
+                  # towers will re-execute UPDATE_OPS of previous towers.
+                  update_ops_in_cross_tower_mode=[True])) +
+          combinations.combine(
+              distribution=[combinations.tpu_strategy_single_iteration],
+              optimizer_fn=[
+                  combinations.gradient_descent_optimizer_v1_fn,
+                  combinations.gradient_descent_optimizer_v2_fn
+              ],
+              mode=["graph"],
+              is_tpu=[True],
+              update_ops_in_cross_tower_mode=[False])))
   def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum,
-                                    renorm):
+                                    renorm, is_tpu,
+                                    update_ops_in_cross_tower_mode):
     """Verifies that moving mean updates are reduced across towers."""
     with distribution.scope():
       num_towers = len(distribution.worker_devices)
@@ -185,27 +204,30 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           optimizer_fn,
           batch_per_epoch=num_towers,
           momentum=momentum,
-          renorm=renorm)
+          renorm=renorm,
+          update_ops_in_tower_mode=not update_ops_in_cross_tower_mode)
 
-      # Disable prefetching since that makes the specific input on each device
-      # to be non deterministic, and this test relies on specific input being
-      # on each device.
+      # Make sure prefetching is disabled since that makes the
+      # specific input on each device to be non deterministic, and
+      # this test relies on specific input being on each device.
       if isinstance(distribution, mirrored_strategy.MirroredStrategy):
-        distribution._prefetch_on_device = False
+        self.assertFalse(distribution._prefetch_on_device)
       iterator = distribution.distribute_dataset(
           dataset_fn).make_one_shot_iterator()
 
       def run_step():
-        return control_flow_ops.group(
-            distribution.unwrap(
-                distribution.call_for_each_tower(
-                    model_fn,
-                    iterator.get_next(),
-                    run_concurrently=batchnorm.built)) +
-            ops.get_collection(ops.GraphKeys.UPDATE_OPS))
+        fetches = distribution.unwrap(
+            distribution.call_for_each_tower(
+                model_fn, iterator.get_next(),
+                run_concurrently=batchnorm.built))
+        if update_ops_in_cross_tower_mode:
+          fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS)
+        return control_flow_ops.group(fetches)
 
       if not context.executing_eagerly():
         with self.test_session() as sess:
+          if is_tpu:
+            sess.run(tpu.initialize_system())
           run_step = sess.make_callable(run_step())
         self.evaluate(variables_lib.global_variables_initializer())
 
@@ -229,22 +251,40 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
               expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
           self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)
 
+      if is_tpu:
+        with self.test_session() as sess:
+          sess.run(tpu.shutdown_system())
+
   @combinations.generate(
       combinations.times(
           combinations.combine(
-              distribution=[combinations.one_device_strategy,
-                            combinations.mirrored_strategy_with_gpu_and_cpu,
-                            combinations.mirrored_strategy_with_two_gpus],
-              optimizer_fn=[combinations.gradient_descent_optimizer_v1_fn,
-                            combinations.gradient_descent_optimizer_v2_fn],
-              loss_reduction=[losses_impl.Reduction.SUM,
-                              losses_impl.Reduction.MEAN,
-                              losses_impl.Reduction.SUM_OVER_BATCH_SIZE,
-                              losses_impl.Reduction.SUM_OVER_NONZERO_WEIGHTS]),
-          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
-          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
+              optimizer_fn=[
+                  combinations.gradient_descent_optimizer_v1_fn,
+                  combinations.gradient_descent_optimizer_v2_fn
+              ],
+              loss_reduction=[
+                  losses_impl.Reduction.SUM, losses_impl.Reduction.MEAN,
+                  losses_impl.Reduction.SUM_OVER_BATCH_SIZE,
+                  losses_impl.Reduction.SUM_OVER_NONZERO_WEIGHTS
+              ]),
+          combinations.times(
+              combinations.combine(
+                  distribution=[
+                      combinations.one_device_strategy,
+                      combinations.mirrored_strategy_with_gpu_and_cpu,
+                      combinations.mirrored_strategy_with_two_gpus
+                  ],
+                  is_tpu=[False]),
+              combinations.combine(
+                  mode=["graph"], use_callable_loss=[True, False]) +
+              combinations.combine(mode=["eager"], use_callable_loss=[True])) +
+          combinations.combine(
+              distribution=[combinations.tpu_strategy_single_iteration],
+              is_tpu=[True],
+              mode=["graph"],
+              use_callable_loss=[True, False])))
   def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
-                    use_callable_loss):
+                    use_callable_loss, is_tpu):
     with distribution.scope():
       all_vars = []
 
@@ -280,12 +320,13 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       if not context.executing_eagerly():
         with self.test_session() as sess:
+          if is_tpu:
+            sess.run(tpu.initialize_system())
           run_step = sess.make_callable(run_step())
         self.evaluate(variables_lib.global_variables_initializer())
 
       run_step()
 
-      self.assertEqual(distribution.num_towers, len(all_vars))
       v = all_vars[0]
       self.assertTrue(all([v is vi for vi in all_vars[1:]]))
       weight = numpy.squeeze(self.evaluate(distribution.fetch(v)))
@@ -312,6 +353,10 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         # One of the mean loss reductions.
         self.assertNear(weight, 2 + 10.6, 0.0001)
 
+      if is_tpu:
+        with self.test_session() as sess:
+          sess.run(tpu.shutdown_system())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 8237b23dbbdb10c053de53880d6838113b99be2d..89f2c431fece63269928fec6aa6d23b5a79ba0b9 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -111,10 +111,13 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
             kwargs["name"] = "%s/replica_%d" % (var0name, i)
             # Initialize replicas with the same value:
             if context.executing_eagerly():
-              initial_value = index[devices[0]].value()
+              kwargs["initial_value"] = array_ops.identity(
+                  index[devices[0]].value())
             else:
-              initial_value = index[devices[0]].initial_value
-            kwargs["initial_value"] = array_ops.identity(initial_value)
+              def initial_value_fn(device=d):
+                with ops.device(device):
+                  return array_ops.identity(index[devices[0]].initial_value)
+              kwargs["initial_value"] = initial_value_fn
           with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
             v = next_creator(*args, **kwargs)
           assert not isinstance(v, values.DistributedVariable)
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 6c5c055070c0fc88ed8f3a459e3f346596f077a6..3f9a02b249dde9a66056ed8952b664bbc3f74ead 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -28,9 +28,12 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.layers import core
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import distribute as distribute_lib
@@ -116,7 +119,6 @@ class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase):
       self.assertEqual(expected, self.evaluate(unwrapped[0]))
 
 
-@test_util.with_c_api
 class MirroredStrategyVariableCreationTest(test.TestCase):
 
   config = config_pb2.ConfigProto()
@@ -370,22 +372,27 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       expected_sum = 0.0
       expected_mean = 0.0
       for i, d in enumerate(dist.worker_devices):
-        # Test access within a device scope, should see different values.
-        with ops.device(d):
-          v_sum_value = self.evaluate(ret_v_sum.read_value())
-          v_mean_value = self.evaluate(ret_v_mean.read_value())
-          expected = i + 3.0
-          self.assertEqual(expected, v_sum_value)
-          expected_sum += expected
-          expected = i * 6.0
-          self.assertEqual(expected, v_mean_value)
-          expected_mean += expected
-
-      # fetch() should return the value you get by applying the
-      # reduction across all towers.
-      self.assertEqual(expected_sum, self.evaluate(dist.fetch(ret_v_sum)))
+        # Should see different values on different devices.
+        v_sum_value = self.evaluate(ret_v_sum.get(d).read_value())
+        v_mean_value = self.evaluate(ret_v_mean.get(d).read_value())
+        expected = i + 3.0
+        self.assertEqual(expected, v_sum_value)
+        expected_sum += expected
+        expected = i * 6.0
+        self.assertEqual(expected, v_mean_value)
+        expected_mean += expected
       expected_mean /= len(dist.worker_devices)
+
+      # Without get(device), should return the value you get by
+      # applying the reduction across all towers (whether you use
+      # fetch(), get(), or nothing).
+      self.assertEqual(expected_sum, self.evaluate(dist.fetch(ret_v_sum)))
       self.assertEqual(expected_mean, self.evaluate(dist.fetch(ret_v_mean)))
+      self.assertEqual(expected_sum, self.evaluate(ret_v_sum.get()))
+      self.assertEqual(expected_mean, self.evaluate(ret_v_mean.get()))
+      if not context.executing_eagerly():
+        self.assertEqual(expected_sum, self.evaluate(ret_v_sum))
+        self.assertEqual(expected_mean, self.evaluate(ret_v_mean))
 
   # NOTE(priyag): Names and name scopes are ignored in eager, hence we are not
   # testing this in eager mode.
@@ -431,6 +438,30 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
         self.assertEquals("foo/" + name + ":0", v0.name)
         self.assertEquals("tower_1/foo/" + name + ":0", v1.name)
 
+  def testDynamicRnnVariables(self):
+    def model_fn():
+      inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]])
+      cell_fw = rnn_cell_impl.LSTMCell(300)
+      cell_bw = rnn_cell_impl.LSTMCell(300)
+      (outputs, _) = rnn.bidirectional_dynamic_rnn(
+          cell_fw,
+          cell_bw,
+          inputs,
+          dtype=dtypes.float32)
+      return outputs
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with context.graph_mode(), dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      # Two variables are created by the RNN layer.
+      self.assertEquals(2, len(result))
+      for v in result:
+        self.assertIsInstance(v, values.DistributedValues)
+        _, v1 = dist.unwrap(v)
+        self.assertStartsWith(v1.name, "tower_1/")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
index a1ef0ecc77a8e8432dfa4eb6da7c324b371dab70..61cbe6df813bb28bf8baa83d9e28ffafc4f0cbb8 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
@@ -27,7 +27,6 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import distribute as distribute_lib
 
 
-@test_util.with_c_api
 class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase):
 
   def _get_distribution_strategy(self):
@@ -53,7 +52,6 @@ class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase):
     self._test_call_and_merge_exceptions(self._get_distribution_strategy())
 
 
-@test_util.with_c_api
 class VariableCreatorStackTest(test.TestCase):
 
   def testCreatorStacksAreThreadLocal(self):
diff --git a/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py b/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py
index ee7588163e42ee3c31dd9fd25fc53e3483f0fbee..09c859b32a3150b95fbfcfa5b62b5eca426ddf18 100644
--- a/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py
@@ -25,11 +25,9 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.training import server_lib
 
 
-@test_util.with_c_api
 class MultiWorkerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
                               strategy_test_lib.DistributionTestBase):
 
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
index 7101ed0756f44b846f10ddc6d429afe005a2f196..7aad8a953cbedd30b48739416e74b3dc164dc4cd 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -24,7 +24,6 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 
 
-@test_util.with_c_api
 class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
 
   def _get_distribution_strategy(self):
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py b/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
index 713494d603b855be2863af9f24ab98d4cf048042..a0b452fc2d445d1cf7dbf5e8fe0e29edef516207 100644
--- a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
+++ b/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
@@ -44,7 +44,6 @@ class CanonicalizeVariableNameTest(test.TestCase):
     self.assertEquals("foo_a", self._canonicalize("foo_a"))
 
 
-@test_util.with_c_api
 class SharedVariableCreatorTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
index 0db0b59fcacee2785eb8191bb84ed5216a79b081..d1fdb3279cf2a7cba6e2282d58eedccf38bd38a3 100644
--- a/tensorflow/contrib/distribute/python/single_loss_example.py
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -22,6 +22,7 @@ from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.distribute.python import step_fn
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
 from tensorflow.python.layers import core
 from tensorflow.python.layers import normalization
 from tensorflow.python.ops import array_ops
@@ -59,7 +60,7 @@ def minimize_loss_example(optimizer_fn,
     # TODO(isaprykin): map_and_batch with drop_remainder causes shapes to be
     # fully defined for TPU.  Remove this when XLA supports dynamic shapes.
     return dataset.apply(
-        batching.map_and_batch(lambda x: x, batch_size=2, drop_remainder=True))
+        batching.map_and_batch(lambda x: x, batch_size=1, drop_remainder=True))
 
   # An Optimizer instance is created either outside or inside model_fn.
   outer_optimizer = None
@@ -68,11 +69,10 @@ def minimize_loss_example(optimizer_fn,
 
   layer = core.Dense(1, use_bias=use_bias)
 
-  def model_fn(xs):
+  def model_fn(x):
     """A very simple model written by the user."""
 
     def loss_fn():
-      x = math_ops.reduce_mean(xs, keepdims=True)
       y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
       return y * y
 
@@ -89,7 +89,8 @@ def minimize_loss_example(optimizer_fn,
 def batchnorm_example(optimizer_fn,
                       batch_per_epoch=1,
                       momentum=0.9,
-                      renorm=False):
+                      renorm=False,
+                      update_ops_in_tower_mode=False):
   """Example of non-distribution-aware legacy code with batch normalization."""
 
   def dataset_fn():
@@ -103,12 +104,19 @@ def batchnorm_example(optimizer_fn,
   optimizer = optimizer_fn()
   batchnorm = normalization.BatchNormalization(
       renorm=renorm, momentum=momentum, fused=False)
+  layer = core.Dense(1, use_bias=False)
 
   def model_fn(x):
+    """A model that uses batchnorm."""
 
     def loss_fn():
-      y = math_ops.reduce_sum(batchnorm(x, training=True), axis=1)
-      loss = math_ops.reduce_mean(y - constant_op.constant(1.))
+      y = batchnorm(x, training=True)
+      with ops.control_dependencies(
+          ops.get_collection(ops.GraphKeys.UPDATE_OPS)
+          if update_ops_in_tower_mode else []):
+        loss = math_ops.reduce_mean(
+            math_ops.reduce_sum(layer(y)) - constant_op.constant(1.))
+      # `x` and `y` will be fetched by the gradient computation, but not `loss`.
       return loss
 
     # Callable loss.
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index a7e4fe80f3e65907fa4b48c5fe0fcfd422ba033f..75441786a615fc0d87b4c4b0b45b9384d678c1d3 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -33,7 +33,6 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.util import nest
 
 
-# TODO(isaprykin):  Consider whether inheriting is really appropriate.
 class TPUStrategy(one_device_strategy.OneDeviceStrategy):
   """Experimental TPU distribution strategy implementation."""
 
@@ -73,7 +72,6 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     def infeed_input(i):
       """Get input, split it and then enqueue."""
       iteration_inputs = [f.get(i) for f in feeds()]
-
       infeed_inputs = [[inputs_per_core[core_id]
                         for inputs_per_core in iteration_inputs]
                        for core_id in range(self._num_cores_per_host)]
@@ -117,3 +115,14 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
           iterate_on_tpu, [], num_shards=self._num_cores_per_host)
 
     return control_flow_ops.group(tpu_result, enqueue_ops)
+
+  def _reduce(self, method_string, value, destinations):
+    del destinations  # TPU is graph mode only.  Rely on implicit Send/Recv.
+    if method_string == 'mean':
+      # TODO(jhseu):  Revisit once we support model-parallelism.
+      value *= (1. / self._num_cores_per_host)
+    return tpu_ops.cross_replica_sum(value)
+
+  @property
+  def num_towers(self):
+    return self._num_cores_per_host
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index aaf177d07ead6978db45277252540e3b329f2bc3..49b4e24daa4ffe417712bc854aa29995d5afc408 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -34,10 +34,11 @@ from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.training import checkpointable
+from tensorflow.python.ops import math_ops
 from tensorflow.python.training import device_util
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
 
 
@@ -60,7 +61,7 @@ class DistributedValues(object):
       else:
         device = distribute_lib.get_update_device()
         if device is None:
-          device = device_util.current()
+          return self._get_cross_tower()
     device = device_util.canonicalize(device)
     try:
       return self._index[device]
@@ -231,12 +232,6 @@ class DistributedVariable(DistributedDelegate):
                               self._primary_var.op.type)
     return self.get().op
 
-  def _as_graph_element(self):
-    # pylint: disable=protected-access
-    if distribute_lib.get_cross_tower_context():
-      return self._primary_var._as_graph_element()
-    return self.get()._as_graph_element()
-
   def _should_act_as_resource_variable(self):
     """Pass resource_variable_ops.is_resource_variable check."""
     pass
@@ -320,6 +315,18 @@ class MirroredVariable(DistributedVariable, Mirrored,
   def assign(self, *args, **kwargs):
     return self.get(device=_get_update_device()).assign(*args, **kwargs)
 
+  def _get_cross_tower(self):
+    device = device_util.canonicalize(device_util.current())
+    if device in self._index:
+      return array_ops.identity(self._index[device])
+    return array_ops.identity(self._primary_var)
+
+  def _as_graph_element(self):
+    # pylint: disable=protected-access
+    if distribute_lib.get_cross_tower_context():
+      return self._primary_var._as_graph_element()
+    return self.get()._as_graph_element()
+
   def _gather_saveables_for_checkpoint(self):
     """Overrides CheckpointableBase method.
 
@@ -364,6 +371,12 @@ class _TowerLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
         for d, v in six.iteritems(self._tower_local_variable._index)])  # pylint: disable=protected-access
 
 
+def _assert_tower_context():
+  if not distribute_lib.get_tower_context():
+    raise RuntimeError(
+        "Tower-local variables may only be assigned in a tower context.")
+
+
 class TowerLocalVariable(DistributedVariable, PerDevice,
                          checkpointable.CheckpointableBase):
   """Holds a map from device to variables whose values are reduced on save."""
@@ -374,18 +387,35 @@ class TowerLocalVariable(DistributedVariable, PerDevice,
     super(TowerLocalVariable, self).__init__(index)
 
   def assign_sub(self, *args, **kwargs):
+    _assert_tower_context()
     return self.get().assign_sub(*args, **kwargs)
 
   def assign_add(self, *args, **kwargs):
+    _assert_tower_context()
     return self.get().assign_add(*args, **kwargs)
 
   def assign(self, *args, **kwargs):
+    _assert_tower_context()
     return self.get().assign(*args, **kwargs)
 
   @property
   def reduce_method(self):
     return self._reduce_method
 
+  def _get_cross_tower(self):
+    all_components = tuple(self._index.values())
+    # TODO(josh11b): Use a strategy-specific method.
+    total = math_ops.add_n(all_components)
+    if self._reduce_method == "mean":
+      return total * (1./ len(all_components))
+    return total
+
+  def _as_graph_element(self):
+    # pylint: disable=protected-access
+    if distribute_lib.get_cross_tower_context():
+      return self._get_cross_tower()
+    return self.get()._as_graph_element()
+
   def _gather_saveables_for_checkpoint(self):
     """Overrides CheckpointableBase method.
 
@@ -672,11 +702,12 @@ class MultiWorkerDataset(object):
     return MultiWorkerDataIterator(iterators, self._worker_device_map)
 
 
-class PerIteration(object):
-  """Holds input for multiple iterations at once."""
+class _PerKey(object):
+  """Holds data associated by keys."""
 
-  def __init__(self, index):
-    self._index = index
+  def __init__(self, *index):
+    # pylint: disable=protected-access
+    self._index = list(index)
 
   def get(self, iteration):
     return array_ops.gather(self._index, iteration)
@@ -687,6 +718,24 @@ class PerIteration(object):
   def get_dtype(self):
     return self._index[-1][-1].dtype
 
+  def __str__(self):
+    return "%s:%s" % (self.__class__.__name__, self._index)
+
+  def __repr__(self):
+    return "%s(%r)" % (self.__class__.__name__, self._index)
+
+
+class PerIteration(_PerKey):
+  """Holds input for multiple iterations at once."""
+
+  def __init__(self, *index):
+    # pylint: disable=protected-access
+    super(PerIteration, self).__init__(*[batch._index for batch in index])
+
+
+class Batches(_PerKey):
+  pass
+
 
 class MultiIterator(object):
   """Iterator that returns results of multiple get_next()s."""
@@ -697,11 +746,31 @@ class MultiIterator(object):
     self._batches_per_iteration = batches_per_iteration
 
   def get_next(self, name=None):
-    return PerIteration([[
-        self._dataset_iterator.get_next(name=name)
-        for _ in range(self._batches_per_iteration)
-    ]
-                         for _ in range(self._iterations)])
+    """Return PerIteration with `iterations x batches_per_iteration` inputs."""
+    data = []
+    for _ in range(self._batches_per_iteration):
+      batch = []
+      for _ in range(self._iterations):
+        batch.append(self._dataset_iterator.get_next(name=name))
+      data.append(batch)
+
+    # Here is an example.  Suppose each get_next returns a tuple of two tensors.
+    # For 3 `iterations` and 2 `batches_per_iteration`, the `data` is:
+    # [[(a,z), (b,y), (c,x)], [(A,Z), (B,Y), (C,X)]]
+    #
+    # After the first `map_structure` it gets transformed to:
+    #  [(Batches(a, A), Batches(z, Z)),
+    #   (Batches(b, B), Batches(y, Y)),
+    #   (Batches(c, C), Batches(x, X))]
+    #
+    # After the second `map_structure` it gets transformed to a tuple of:
+    # (PerIteration([Batches(a, A), Batches(b, B), Batches(c, C)]),
+    #  PerIteration([Batches(z, Z), Batches(y, Y), Batches(x, X)]))
+
+    data = nest.map_structure(Batches, *data)
+    data = nest.map_structure(PerIteration, *data)
+
+    return data
 
   @property
   def initializer(self):
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 9aeef9fa3e86f25ba2544236fd802c7162f4e40e..1c95758d96aba47e9581dde6411763e98b99a968 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -42,7 +42,6 @@ from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util import nest
 
 
-@test_util.with_c_api
 class DistributedValuesTest(test.TestCase):
 
   def testGetEager(self):
@@ -81,7 +80,6 @@ class DistributedValuesTest(test.TestCase):
       v = values.DistributedValues({"/device:cpu:0": 42})
 
 
-@test_util.with_c_api
 class DistributedDelegateTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -164,7 +162,6 @@ def _make_mirrored():
   return v, devices, mirrored
 
 
-@test_util.with_c_api
 class RegroupAndSelectDeviceTest(test.TestCase):
 
   def _is_per_device(self, result, expected, klass=values.PerDevice):
@@ -317,7 +314,6 @@ class RegroupAndSelectDeviceTest(test.TestCase):
                                                merged_estimator_spec))
 
 
-@test_util.with_c_api
 class PerDeviceDatasetTest(test.TestCase):
 
   config = config_pb2.ConfigProto()
@@ -564,7 +560,6 @@ class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
         multi_worker_iterator.get_next()
 
 
-@test_util.with_c_api
 class MirroredVariableTest(test.TestCase):
 
   config = config_pb2.ConfigProto()
@@ -741,7 +736,6 @@ def _make_tower_local(method):
   return v, tower_local
 
 
-@test_util.with_c_api
 class TowerLocalVariableTest(test.TestCase):
 
   config = config_pb2.ConfigProto()
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index fad613155d8861a2508fb7aca752b10ff85d35eb..6192f04c8b695d124b498850ad430823b44fd472 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -94,7 +94,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "distribution_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/distribution_test.py"],
     additional_deps = [
         ":distributions_py",
@@ -337,7 +337,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "mvn_tril_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/mvn_tril_test.py"],
     additional_deps = [
         ":distributions_py",
@@ -372,6 +372,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:variables",
     ],
+    shard_count = 4,
 )
 
 cuda_py_test(
@@ -459,7 +460,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "batch_reshape_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/batch_reshape_test.py"],
     additional_deps = [
         ":distributions_py",
@@ -578,7 +579,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "wishart_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/wishart_test.py"],
     additional_deps = [
         ":distributions_py",
@@ -709,6 +710,7 @@ cuda_py_test(
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
     tags = ["noasan"],  # times out, http://b/78588814
 )
 
@@ -866,7 +868,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "batch_normalization_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/bijectors/batch_normalization_test.py"],
     additional_deps = [
         ":bijectors_py",
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
index 59d549b7b80a3d80d0b8409542eb6583f645bdaa..f2bb2d3325a7cc6ec5803860600149522752a4c0 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
@@ -448,8 +448,7 @@ class _BatchReshapeTest(object):
 
     else:
       with self.test_session():
-        with self.assertRaisesOpError(r"`batch_shape` size must match "
-                                      r"`distributions.batch_shape` size"):
+        with self.assertRaisesOpError(r"Shape sizes do not match."):
           batch_reshape_lib.BatchReshape(
               distribution=mvn,
               batch_shape=new_batch_shape_ph,
@@ -457,8 +456,13 @@ class _BatchReshapeTest(object):
 
   def test_non_positive_shape(self):
     dims = 2
-    new_batch_shape = [-1, -2]   # -1*-2=2 so will pass size check.
-    old_batch_shape = [2]
+    old_batch_shape = [4]
+    if self.is_static_shape:
+      # Unknown first dimension does not trigger size check. Note that
+      # any dimension < 0 is treated statically as unknown.
+      new_batch_shape = [-1, 0]
+    else:
+      new_batch_shape = [-2, -2]  # -2 * -2 = 4, same size as the old shape.
 
     new_batch_shape_ph = (
         constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
@@ -471,7 +475,7 @@ class _BatchReshapeTest(object):
     mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
 
     if self.is_static_shape:
-      with self.assertRaisesRegexp(ValueError, r".*must be positive.*"):
+      with self.assertRaisesRegexp(ValueError, r".*must be >=-1.*"):
         batch_reshape_lib.BatchReshape(
             distribution=mvn,
             batch_shape=new_batch_shape_ph,
@@ -479,7 +483,7 @@ class _BatchReshapeTest(object):
 
     else:
       with self.test_session():
-        with self.assertRaisesOpError(r".*must be positive.*"):
+        with self.assertRaisesOpError(r".*must be >=-1.*"):
           batch_reshape_lib.BatchReshape(
               distribution=mvn,
               batch_shape=new_batch_shape_ph,
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
index ca20442c3940664feab7526110229872a6cdc41f..dc45114b1c23b5edb78d68ad4f38f5201d265170 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.exp import Exp
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import Softplus
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions import bijector
 from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
@@ -188,6 +189,15 @@ class ChainBijectorTest(test.TestCase):
         -np.log(6, dtype=np.float32) - np.sum(x),
         self.evaluate(chain.inverse_log_det_jacobian(y, event_ndims=1)))
 
+  def testChainIldjWithPlaceholder(self):
+    chain = Chain((Exp(), Exp()))
+    samples = array_ops.placeholder(
+        dtype=np.float32, shape=[None, 10], name="samples")
+    ildj = chain.inverse_log_det_jacobian(samples, event_ndims=0)
+    self.assertTrue(ildj is not None)
+    with self.test_session():
+      ildj.eval({samples: np.zeros([2, 10], np.float32)})
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
index 46f2c63f9b0f78b25bb1948e6ea55ab20c5cfa6e..d44e49b4874a5b91f7633cd9c97dbb1a7da70f27 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
@@ -22,15 +22,12 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops.bijectors.reshape import Reshape
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
 from tensorflow.python.platform import test
 
 
-@test_util.with_c_api
 class _ReshapeBijectorTest(object):
   """Base class for testing the reshape transformation.
 
@@ -265,7 +262,6 @@ class _ReshapeBijectorTest(object):
     raise NotImplementedError("Subclass failed to implement `build_shapes`.")
 
 
-@test_util.with_c_api
 class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest):
 
   def build_shapes(self, shape_in, shape_out):
@@ -305,21 +301,13 @@ class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest):
           bijector, x, y, event_ndims=2, rtol=1e-6, atol=0)
 
   def testInvalidDimensionsOpError(self):
-    if ops._USE_C_API:
-      error_message = "Invalid value in tensor used for shape: -2"
-    else:
-      error_message = "elements must be either positive integers or `-1`."
-    self._testInvalidDimensionsOpError(error_message)
+    self._testInvalidDimensionsOpError(
+        "Invalid value in tensor used for shape: -2")
 
   def testInputOutputMismatchOpError(self):
-    if ops._USE_C_API:
-      error_message = "Cannot reshape a tensor with"
-    else:
-      error_message = "Input to reshape is a tensor with"
-    self._testInputOutputMismatchOpError(error_message)
+    self._testInputOutputMismatchOpError("Cannot reshape a tensor with")
 
 
-@test_util.with_c_api
 class ReshapeBijectorTestDynamic(test.TestCase, _ReshapeBijectorTest):
 
   def build_shapes(self, shape_in, shape_out):
@@ -341,7 +329,6 @@ class ReshapeBijectorTestDynamic(test.TestCase, _ReshapeBijectorTest):
     self._testInputOutputMismatchOpError("Input to reshape is a tensor with")
 
 
-@test_util.with_c_api
 class ReshapeBijectorTestDynamicNdims(test.TestCase, _ReshapeBijectorTest):
 
   def build_shapes(self, shape_in, shape_out):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
index 7435bcbc684c1660a648cef4ab30c888723853f8..b003526392709b61e9cc46e0ff8e5fa78edc0568 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
@@ -131,8 +131,8 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
     return mu, sigma
 
   def testKLBatch(self):
-    batch_shape = (2,)
-    event_shape = (3,)
+    batch_shape = [2]
+    event_shape = [3]
     with self.test_session():
       mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
       mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape)
@@ -156,6 +156,33 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
       self.assertAllClose(expected_kl_0, kl_v[0])
       self.assertAllClose(expected_kl_1, kl_v[1])
 
+  def testKLBatchBroadcast(self):
+    batch_shape = [2]
+    event_shape = [3]
+    with self.test_session():
+      mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
+      # No batch shape.
+      mu_b, sigma_b = self._random_mu_and_sigma([], event_shape)
+      mvn_a = ds.MultivariateNormalFullCovariance(
+          loc=mu_a,
+          covariance_matrix=sigma_a,
+          validate_args=True)
+      mvn_b = ds.MultivariateNormalFullCovariance(
+          loc=mu_b,
+          covariance_matrix=sigma_b,
+          validate_args=True)
+
+      kl = ds.kl_divergence(mvn_a, mvn_b)
+      self.assertEqual(batch_shape, kl.get_shape())
+
+      kl_v = kl.eval()
+      expected_kl_0 = _compute_non_batch_kl(mu_a[0, :], sigma_a[0, :, :],
+                                            mu_b, sigma_b)
+      expected_kl_1 = _compute_non_batch_kl(mu_a[1, :], sigma_a[1, :, :],
+                                            mu_b, sigma_b)
+      self.assertAllClose(expected_kl_0, kl_v[0])
+      self.assertAllClose(expected_kl_1, kl_v[1])
+
 
 def _compute_non_batch_kl(mu_a, sigma_a, mu_b, sigma_b):
   """Non-batch KL for N(mu_a, sigma_a), N(mu_b, sigma_b)."""
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
index 685f32883dae5b8513badeb05e1508cd611d6e93..b556d06123800f22f5d9a90dd18f3c745aec90a1 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
@@ -235,8 +235,8 @@ class MultivariateNormalTriLTest(test.TestCase):
     return mu, sigma
 
   def testKLNonBatch(self):
-    batch_shape = ()
-    event_shape = (2,)
+    batch_shape = []
+    event_shape = [2]
     with self.test_session():
       mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
       mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape)
@@ -257,8 +257,8 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(expected_kl, kl_v)
 
   def testKLBatch(self):
-    batch_shape = (2,)
-    event_shape = (3,)
+    batch_shape = [2]
+    event_shape = [3]
     with self.test_session():
       mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
       mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape)
@@ -282,9 +282,36 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(expected_kl_0, kl_v[0])
       self.assertAllClose(expected_kl_1, kl_v[1])
 
+  def testKLBatchBroadcast(self):
+    batch_shape = [2]
+    event_shape = [3]
+    with self.test_session():
+      mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
+      # No batch shape.
+      mu_b, sigma_b = self._random_mu_and_sigma([], event_shape)
+      mvn_a = ds.MultivariateNormalTriL(
+          loc=mu_a,
+          scale_tril=np.linalg.cholesky(sigma_a),
+          validate_args=True)
+      mvn_b = ds.MultivariateNormalTriL(
+          loc=mu_b,
+          scale_tril=np.linalg.cholesky(sigma_b),
+          validate_args=True)
+
+      kl = ds.kl_divergence(mvn_a, mvn_b)
+      self.assertEqual(batch_shape, kl.get_shape())
+
+      kl_v = kl.eval()
+      expected_kl_0 = _compute_non_batch_kl(mu_a[0, :], sigma_a[0, :, :],
+                                            mu_b, sigma_b)
+      expected_kl_1 = _compute_non_batch_kl(mu_a[1, :], sigma_a[1, :, :],
+                                            mu_b, sigma_b)
+      self.assertAllClose(expected_kl_0, kl_v[0])
+      self.assertAllClose(expected_kl_1, kl_v[1])
+
   def testKLTwoIdenticalDistributionsIsZero(self):
-    batch_shape = (2,)
-    event_shape = (3,)
+    batch_shape = [2]
+    event_shape = [3]
     with self.test_session():
       mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
       mvn_a = ds.MultivariateNormalTriL(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
index 968057331787059240110b90545f70c0ab128aa8..b91a610acf1a9094d612504d63030b3bffb873ac 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
@@ -65,6 +65,16 @@ class SeedStreamTest(test.TestCase):
     self.assertAllUnique(
         outputs + [strm2() for _ in range(50)] + [strm3() for _ in range(50)])
 
+  def testInitFromOtherSeedStream(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(strm1, salt="salt")
+    strm3 = seed_stream.SeedStream(strm1, salt="another salt")
+    out1 = [strm1() for _ in range(50)]
+    out2 = [strm2() for _ in range(50)]
+    out3 = [strm3() for _ in range(50)]
+    self.assertAllEqual(out1, out2)
+    self.assertAllUnique(out1 + out3)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/autoregressive.py b/tensorflow/contrib/distributions/python/ops/autoregressive.py
index 88ed0127841093cc1a1168d988f14e7bb0277b12..d813831bef803a22c095d9c98e7163aa4861a15d 100644
--- a/tensorflow/contrib/distributions/python/ops/autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/autoregressive.py
@@ -144,7 +144,7 @@ class Autoregressive(distribution_lib.Distribution):
         `distribution_fn(sample0).event_shape.num_elements()` are both `None`.
       ValueError: if `num_steps < 1`.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name) as name:
       self._distribution_fn = distribution_fn
       self._sample0 = sample0
diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
index bf5590cd552a915a3ecfc1912ee530baf79665a6..c709318f76552e1188f735f5bafff4be0537baed 100644
--- a/tensorflow/contrib/distributions/python/ops/batch_reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import distribution as distribution_lib
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
@@ -41,9 +42,6 @@ class BatchReshape(distribution_lib.Distribution):
   This "meta-distribution" reshapes the batch dimensions of another
   distribution.
 
-  Note: Unlike `tf.reshape`, the `BatchReshape` distribution does not support
-  `-1` for flattening.
-
   #### Examples
 
   ```python
@@ -51,7 +49,7 @@ class BatchReshape(distribution_lib.Distribution):
 
   dtype = np.float32
   dims = 2
-  new_batch_shape = [1, 2, 3]
+  new_batch_shape = [1, 2, -1]
   old_batch_shape = [6]
 
   scale = np.ones(old_batch_shape + [dims], dtype)
@@ -85,8 +83,9 @@ class BatchReshape(distribution_lib.Distribution):
     Args:
       distribution: The base distribution instance to reshape. Typically an
         instance of `Distribution`.
-      batch_shape: Positive `int`-like vector-shaped `Tensor` representing the
-        new shape of the batch dimensions.
+      batch_shape: Positive `int`-like vector-shaped `Tensor` representing
+        the new shape of the batch dimensions. Up to one dimension may contain
+        `-1`, meaning the remainder of the batch size.
       validate_args: Python `bool`, default `False`. When `True` distribution
         parameters are checked for validity despite possibly degrading runtime
         performance. When `False` invalid inputs may silently render incorrect
@@ -104,31 +103,28 @@ class BatchReshape(distribution_lib.Distribution):
       ValueError: if `batch_shape` size is not the same as a
         `distribution.batch_shape` size.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     name = name or "BatchReshape" + distribution.name
-    self._distribution = distribution
     with ops.name_scope(name, values=[batch_shape]) as name:
-      self._batch_shape_ = ops.convert_to_tensor(
-          batch_shape,
-          dtype=dtypes.int32,
-          name="batch_shape")
-      self._batch_shape_static = tensor_util.constant_value(self._batch_shape_)
-      if self._batch_shape_static is not None:
-        self._batch_shape_static = np.int32(self._batch_shape_static)
-      self._runtime_assertions = validate_init_args(
-          self._distribution,
-          self._batch_shape_,
-          validate_args,
-          self._batch_shape_static)
+      # The unexpanded batch shape may contain up to one dimension of -1.
+      self._batch_shape_unexpanded = ops.convert_to_tensor(
+          batch_shape, dtype=dtypes.int32, name="batch_shape")
+      validate_init_args_statically(distribution, self._batch_shape_unexpanded)
+      batch_shape, batch_shape_static, runtime_assertions = calculate_reshape(
+          distribution.batch_shape_tensor(), self._batch_shape_unexpanded,
+          validate_args)
+      self._distribution = distribution
+      self._batch_shape_ = batch_shape
+      self._batch_shape_static = batch_shape_static
+      self._runtime_assertions = runtime_assertions
       super(BatchReshape, self).__init__(
-          dtype=self._distribution.dtype,
-          reparameterization_type=self._distribution.reparameterization_type,
+          dtype=distribution.dtype,
+          reparameterization_type=distribution.reparameterization_type,
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats,
           parameters=parameters,
           graph_parents=(
-              [self._batch_shape_] +
-              self._distribution._graph_parents),  # pylint: disable=protected-access
+              [self._batch_shape_unexpanded] + distribution._graph_parents),  # pylint: disable=protected-access
           name=name)
 
   @property
@@ -140,7 +136,7 @@ class BatchReshape(distribution_lib.Distribution):
       return array_ops.identity(self._batch_shape_)
 
   def _batch_shape(self):
-    return tensor_shape.TensorShape(self._batch_shape_static)
+    return self._batch_shape_static
 
   def _event_shape_tensor(self):
     with ops.control_dependencies(self._runtime_assertions):
@@ -152,11 +148,13 @@ class BatchReshape(distribution_lib.Distribution):
   def _sample_n(self, n, seed=None):
     with ops.control_dependencies(self._runtime_assertions):
       x = self.distribution.sample(sample_shape=n, seed=seed)
-      new_shape = array_ops.concat([
-          [n],
-          self.batch_shape_tensor(),
-          self.event_shape_tensor(),
-      ], axis=0)
+      new_shape = array_ops.concat(
+          [
+              [n],
+              self._batch_shape_unexpanded,
+              self.event_shape_tensor(),
+          ],
+          axis=0)
       return array_ops.reshape(x, new_shape)
 
   def _log_prob(self, x):
@@ -213,9 +211,9 @@ class BatchReshape(distribution_lib.Distribution):
     event_ndims = (array_ops.size(self.event_shape_tensor())
                    if self.event_shape.ndims is None
                    else self.event_shape.ndims)
-    batch_ndims = (array_ops.size(self.batch_shape_tensor())
-                   if self.batch_shape.ndims is None
-                   else self.batch_shape.ndims)
+    batch_ndims = (
+        array_ops.size(self._batch_shape_unexpanded)
+        if self.batch_shape.ndims is None else self.batch_shape.ndims)
     sample_ndims = x_ndims - batch_ndims - event_ndims
     if isinstance(sample_ndims, int):
       static_sample_shape = x.shape[:sample_ndims]
@@ -238,10 +236,11 @@ class BatchReshape(distribution_lib.Distribution):
           self.event_shape_tensor(),
       ], axis=0)
       result = fn(array_ops.reshape(x, old_shape))
-      new_shape = array_ops.concat([
-          sample_shape,
-          self.batch_shape_tensor(),
-      ], axis=0)
+      new_shape = array_ops.concat(
+          [
+              sample_shape,
+              self._batch_shape_unexpanded,
+          ], axis=0)
       result = array_ops.reshape(result, new_shape)
       if (static_sample_shape.ndims is not None and
           self.batch_shape.ndims is not None):
@@ -261,8 +260,7 @@ class BatchReshape(distribution_lib.Distribution):
       if static_event_shape_list is None:
         static_event_shape_list = [self.event_shape]
       new_shape = array_ops.concat(
-          [self.batch_shape_tensor()] + event_shape_list,
-          axis=0)
+          [self._batch_shape_unexpanded] + event_shape_list, axis=0)
       result = array_ops.reshape(fn(), new_shape)
       if (self.batch_shape.ndims is not None and
           self.event_shape.ndims is not None):
@@ -281,9 +279,9 @@ class BatchReshape(distribution_lib.Distribution):
       event_ndims = (array_ops.size(self.event_shape_tensor())
                      if self.event_shape.ndims is None
                      else self.event_shape.ndims)
-      batch_ndims = (array_ops.size(self.batch_shape_tensor())
-                     if self.batch_shape.ndims is None
-                     else self.batch_shape.ndims)
+      batch_ndims = (
+          array_ops.size(self._batch_shape_unexpanded)
+          if self.batch_shape.ndims is None else self.batch_shape.ndims)
       expected_batch_event_ndims = batch_ndims + event_ndims
 
       if (isinstance(x_ndims, int) and
@@ -355,62 +353,56 @@ class BatchReshape(distribution_lib.Distribution):
       return runtime_assertions
 
 
-def validate_init_args(
-    distribution,
-    batch_shape,
-    validate_args,
-    batch_shape_static):
+def calculate_reshape(original_shape, new_shape, validate=False, name=None):
+  """Calculates the reshaped dimensions (replacing up to one -1 in reshape)."""
+  batch_shape_static = tensor_util.constant_value_as_shape(new_shape)
+  if batch_shape_static.is_fully_defined():
+    return np.int32(batch_shape_static.as_list()), batch_shape_static, []
+  with ops.name_scope(name, "calculate_reshape", [original_shape, new_shape]):
+    original_size = math_ops.reduce_prod(original_shape)
+    implicit_dim = math_ops.equal(new_shape, -1)
+    size_implicit_dim = (
+        original_size // math_ops.maximum(1, -math_ops.reduce_prod(new_shape)))
+    new_ndims = array_ops.shape(new_shape)
+    expanded_new_shape = array_ops.where(  # Assumes exactly one `-1`.
+        implicit_dim, array_ops.fill(new_ndims, size_implicit_dim), new_shape)
+    validations = [] if not validate else [
+        check_ops.assert_rank(
+            original_shape, 1, message="Original shape must be a vector."),
+        check_ops.assert_rank(
+            new_shape, 1, message="New shape must be a vector."),
+        check_ops.assert_less_equal(
+            math_ops.count_nonzero(implicit_dim, dtype=dtypes.int32),
+            1,
+            message="At most one dimension can be unknown."),
+        check_ops.assert_positive(
+            expanded_new_shape, message="Shape elements must be >=-1."),
+        check_ops.assert_equal(
+            math_ops.reduce_prod(expanded_new_shape),
+            original_size,
+            message="Shape sizes do not match."),
+    ]
+    return expanded_new_shape, batch_shape_static, validations
+
+
+def validate_init_args_statically(distribution, batch_shape):
   """Helper to __init__ which makes or raises assertions."""
-  with ops.name_scope(name="validate_init_args",
-                      values=[batch_shape] + distribution._graph_parents):  # pylint: disable=protected-access
-    runtime_assertions = []
-
-    if batch_shape.shape.ndims is not None:
-      if batch_shape.shape.ndims != 1:
-        raise ValueError("`batch_shape` must be a vector "
-                         "(saw rank: {}).".format(
-                             batch_shape.shape.ndims))
-    elif validate_args:
-      runtime_assertions += [
-          check_ops.assert_rank(
-              batch_shape,
-              1,
-              message="`batch_shape` must be a vector.",
-              name="assert_batch_shape_is_vector"),
-      ]
-
-    batch_size_static = np.prod(batch_shape_static)
-    dist_batch_size_static = (
-        None if not distribution.batch_shape.is_fully_defined()
-        else np.prod(distribution.batch_shape).value)
-
-    if batch_size_static is not None and dist_batch_size_static is not None:
-      if batch_size_static != dist_batch_size_static:
-        raise ValueError("`batch_shape` size ({}) must match "
-                         "`distribution.batch_shape` size ({}).".format(
-                             batch_size_static,
-                             dist_batch_size_static))
-    elif validate_args:
-      runtime_assertions += [
-          check_ops.assert_equal(
-              math_ops.reduce_prod(batch_shape),
-              math_ops.reduce_prod(distribution.batch_shape_tensor()),
-              message=("`batch_shape` size must match "
-                       "`distributions.batch_shape` size."),
-              name="assert_batch_size"),
-      ]
-
-    if batch_shape_static is not None:
-      if np.any(batch_shape_static < 1):
-        raise ValueError("`batch_shape` elements must be positive "
-                         "(i.e., larger than zero).")
-    elif validate_args:
-      runtime_assertions += [
-          check_ops.assert_positive(
-              batch_shape,
-              message=("`batch_shape` elements must be positive "
-                       "(i.e., larger than zero)."),
-              name="assert_batch_shape_positive")
-      ]
-
-    return runtime_assertions
+  if batch_shape.shape.ndims is not None:
+    if batch_shape.shape.ndims != 1:
+      raise ValueError("`batch_shape` must be a vector "
+                       "(saw rank: {}).".format(batch_shape.shape.ndims))
+
+  batch_shape_static = tensor_util.constant_value_as_shape(batch_shape)
+  batch_size_static = batch_shape_static.num_elements()
+  dist_batch_size_static = distribution.batch_shape.num_elements()
+
+  if batch_size_static is not None and dist_batch_size_static is not None:
+    if batch_size_static != dist_batch_size_static:
+      raise ValueError("`batch_shape` size ({}) must match "
+                       "`distribution.batch_shape` size ({}).".format(
+                           batch_size_static, dist_batch_size_static))
+
+  if batch_shape_static.dims is not None:
+    if any(
+        dim.value is not None and dim.value < 1 for dim in batch_shape_static):
+      raise ValueError("`batch_shape` elements must be >=-1.")
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
index 85ad23e4133ef09051cdc8b45e489caeea90fbb3..b158a51bb022b5e2ea3afda74e97b9dc131665a6 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
@@ -20,10 +20,9 @@ from __future__ import print_function
 
 import itertools
 
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import bijector
 
 
@@ -36,15 +35,6 @@ def _use_static_shape(input_tensor, ndims):
   return input_tensor.shape.is_fully_defined() and isinstance(ndims, int)
 
 
-def _maybe_get_event_ndims_statically(event_ndims):
-  static_event_ndims = (event_ndims if isinstance(event_ndims, int)
-                        else tensor_util.constant_value(event_ndims))
-  if static_event_ndims is not None:
-    return static_event_ndims
-
-  return event_ndims
-
-
 def _compute_min_event_ndims(bijector_list, compute_forward=True):
   """Computes the min_event_ndims associated with the give list of bijectors.
 
@@ -238,13 +228,13 @@ class Chain(bijector.Bijector):
     return y
 
   def _inverse_log_det_jacobian(self, y, **kwargs):
-    ildj = constant_op.constant(
-        0., dtype=y.dtype.base_dtype, name="inverse_log_det_jacobian")
+    y = ops.convert_to_tensor(y, name="y")
+    ildj = math_ops.cast(0., dtype=y.dtype.base_dtype)
 
     if not self.bijectors:
       return ildj
 
-    event_ndims = _maybe_get_event_ndims_statically(
+    event_ndims = self._maybe_get_event_ndims_statically(
         self.inverse_min_event_ndims)
 
     if _use_static_shape(y, event_ndims):
@@ -258,11 +248,12 @@ class Chain(bijector.Bijector):
 
       if _use_static_shape(y, event_ndims):
         event_shape = b.inverse_event_shape(event_shape)
-        event_ndims = _maybe_get_event_ndims_statically(event_shape.ndims)
+        event_ndims = self._maybe_get_event_ndims_statically(
+            event_shape.ndims)
       else:
         event_shape = b.inverse_event_shape_tensor(event_shape)
-        event_ndims = _maybe_get_event_ndims_statically(
-            array_ops.rank(event_shape))
+        event_ndims = self._maybe_get_event_ndims_statically(
+            array_ops.size(event_shape))
       y = b.inverse(y, **kwargs.get(b.name, {}))
     return ildj
 
@@ -274,13 +265,12 @@ class Chain(bijector.Bijector):
   def _forward_log_det_jacobian(self, x, **kwargs):
     x = ops.convert_to_tensor(x, name="x")
 
-    fldj = constant_op.constant(
-        0., dtype=x.dtype, name="inverse_log_det_jacobian")
+    fldj = math_ops.cast(0., dtype=x.dtype.base_dtype)
 
     if not self.bijectors:
       return fldj
 
-    event_ndims = _maybe_get_event_ndims_statically(
+    event_ndims = self._maybe_get_event_ndims_statically(
         self.forward_min_event_ndims)
 
     if _use_static_shape(x, event_ndims):
@@ -293,13 +283,21 @@ class Chain(bijector.Bijector):
           x, event_ndims=event_ndims, **kwargs.get(b.name, {}))
       if _use_static_shape(x, event_ndims):
         event_shape = b.forward_event_shape(event_shape)
-        event_ndims = _maybe_get_event_ndims_statically(event_shape.ndims)
+        event_ndims = self._maybe_get_event_ndims_statically(event_shape.ndims)
       else:
         event_shape = b.forward_event_shape_tensor(event_shape)
-        event_ndims = _maybe_get_event_ndims_statically(
-            array_ops.rank(event_shape))
+        event_ndims = self._maybe_get_event_ndims_statically(
+            array_ops.size(event_shape))
 
       x = b.forward(x, **kwargs.get(b.name, {}))
 
     return fldj
 
+  def _maybe_get_event_ndims_statically(self, event_ndims):
+    event_ndims_ = super(Chain, self)._maybe_get_event_ndims_statically(
+        event_ndims)
+    if event_ndims_ is None:
+      return event_ndims
+    return event_ndims_
+
+
diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py
index 12d16031783b78dc3ea6273af77c1eaeb77ca94e..24b26bf124c78c8320b9a6bc3b900e6c7a93f5e4 100644
--- a/tensorflow/contrib/distributions/python/ops/binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/binomial.py
@@ -163,7 +163,7 @@ class Binomial(distribution.Distribution):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._total_count = self._maybe_assert_valid_total_count(
           ops.convert_to_tensor(total_count, name="total_count"),
diff --git a/tensorflow/contrib/distributions/python/ops/cauchy.py b/tensorflow/contrib/distributions/python/ops/cauchy.py
index daacfe657fe154dce8d0db98894fe8b73546c476..f5ffdd873124d6626dca26f603592bd0b030d7b3 100644
--- a/tensorflow/contrib/distributions/python/ops/cauchy.py
+++ b/tensorflow/contrib/distributions/python/ops/cauchy.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 __all__ = [
     "Cauchy",
@@ -120,7 +121,7 @@ class Cauchy(distribution.Distribution):
     Raises:
       TypeError: if `loc` and `scale` have different `dtype`.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)]
                                     if validate_args else []):
diff --git a/tensorflow/contrib/distributions/python/ops/chi2.py b/tensorflow/contrib/distributions/python/ops/chi2.py
index c77c5fd20895a6220604d76a95a152a22cd3d914..08cdc1582892cc7d308bd60f082dde082704f57f 100644
--- a/tensorflow/contrib/distributions/python/ops/chi2.py
+++ b/tensorflow/contrib/distributions/python/ops/chi2.py
@@ -25,6 +25,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import gamma
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
@@ -83,7 +84,7 @@ class Chi2(gamma.Gamma):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     # Even though all stats of chi2 are defined for valid parameters, this is
     # not true in the parent class "gamma."  therefore, passing
     # allow_nan_stats=True
@@ -119,7 +120,7 @@ class Chi2WithAbsDf(Chi2):
                validate_args=False,
                allow_nan_stats=True,
                name="Chi2WithAbsDf"):
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[df]) as name:
       super(Chi2WithAbsDf, self).__init__(
           df=math_ops.floor(
diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py
index a42350430e98515e521ce357bf5a87ff2daefedc..6d7d6d307bd0f815344c8a0e347f45ae11ba6462 100644
--- a/tensorflow/contrib/distributions/python/ops/deterministic.py
+++ b/tensorflow/contrib/distributions/python/ops/deterministic.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 __all__ = [
     "Deterministic",
@@ -86,7 +87,7 @@ class _BaseDeterministic(distribution.Distribution):
     Raises:
       ValueError:  If `loc` is a scalar.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[loc, atol, rtol]) as name:
       loc = ops.convert_to_tensor(loc, name="loc")
       if is_vector and validate_args:
diff --git a/tensorflow/contrib/distributions/python/ops/geometric.py b/tensorflow/contrib/distributions/python/ops/geometric.py
index 53dd42f4c83fcea0ec5b1374c8e3109ebe1dd127..446cff6ec242f25178fed0c6a424791fa9f176ad 100644
--- a/tensorflow/contrib/distributions/python/ops/geometric.py
+++ b/tensorflow/contrib/distributions/python/ops/geometric.py
@@ -85,7 +85,7 @@ class Geometric(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
 
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits, probs, validate_args=validate_args, name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py
index 2c261073ee16462599740cb241108bfe08c773ec..ed9ea6f4f3ffe18fb6bf1e0a7d57728d010e0f01 100644
--- a/tensorflow/contrib/distributions/python/ops/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/gumbel.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class _Gumbel(distribution.Distribution):
@@ -124,7 +125,7 @@ class _Gumbel(distribution.Distribution):
     Raises:
       TypeError: if loc and scale are different dtypes.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
diff --git a/tensorflow/contrib/distributions/python/ops/half_normal.py b/tensorflow/contrib/distributions/python/ops/half_normal.py
index d0df2befd6e46ca93e5a0b5d1cb5407d6719c7f2..7e12767f6d8f6c61565ecf266d3b222de68c0e40 100644
--- a/tensorflow/contrib/distributions/python/ops/half_normal.py
+++ b/tensorflow/contrib/distributions/python/ops/half_normal.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import special_math
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
@@ -105,7 +106,7 @@ class HalfNormal(distribution.Distribution):
         if one or more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py
index fbde55ef310de1d926b8ddd503499fbed4809373..fa89fff3b7b2f8266a44c446a0c9807790b3aed8 100644
--- a/tensorflow/contrib/distributions/python/ops/independent.py
+++ b/tensorflow/contrib/distributions/python/ops/independent.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import distribution as distribution_lib
 from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class Independent(distribution_lib.Distribution):
@@ -116,7 +117,7 @@ class Independent(distribution_lib.Distribution):
       ValueError: if `reinterpreted_batch_ndims` exceeds
         `distribution.batch_ndims`
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     name = name or "Independent" + distribution.name
     self._distribution = distribution
     with ops.name_scope(name) as name:
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index 502bd4f493337bab180129cd0ddfaf5a76a0ca4e..85e8e10466038e5e55ef4b754f82c0c2c2543b6d 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -125,7 +125,7 @@ class InverseGamma(distribution.Distribution):
     Raises:
       TypeError: if `concentration` and `rate` are different dtypes.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[concentration, rate]) as name:
       with ops.control_dependencies([
           check_ops.assert_positive(concentration),
@@ -280,7 +280,7 @@ class InverseGammaWithSoftplusConcentrationRate(InverseGamma):
                validate_args=False,
                allow_nan_stats=True,
                name="InverseGammaWithSoftplusConcentrationRate"):
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[concentration, rate]) as name:
       super(InverseGammaWithSoftplusConcentrationRate, self).__init__(
           concentration=nn.softplus(concentration,
diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py
index c83b5bc2e3a8c56f5c52d063a7d0d399be1c1870..0103283259b0526b5a108ea1836f95709eedc067 100644
--- a/tensorflow/contrib/distributions/python/ops/logistic.py
+++ b/tensorflow/contrib/distributions/python/ops/logistic.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class Logistic(distribution.Distribution):
@@ -119,7 +120,7 @@ class Logistic(distribution.Distribution):
     Raises:
       TypeError: if loc and scale are different dtypes.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index 2ef294af2e8bc9beff735ec2e0fd6b619ce96176..d54f30dc634ab5c8aa82066056266747b63eec21 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -116,7 +116,7 @@ class Mixture(distribution.Distribution):
         matching static batch shapes, or all components do not
         have matching static event shapes.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     if not isinstance(cat, categorical.Categorical):
       raise TypeError("cat must be a Categorical distribution, but saw: %s" %
                       cat)
diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
index 0b1301e551728f74bb0048d2dcf3c356ae110c75..c7c90cf875484a1753577227bf22de878d00a502 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
@@ -130,7 +130,7 @@ class MixtureSameFamily(distribution.Distribution):
       ValueError: if `mixture_distribution` categories does not equal
         `components_distribution` rightmost batch shape.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name) as name:
       self._mixture_distribution = mixture_distribution
       self._components_distribution = components_distribution
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag.py b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
index e3236c2db93695a5e007bba9a1414773f3935f2e..cad398582b9c939e8e96cf498638869ccd3701bd 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
@@ -193,7 +193,7 @@ class MultivariateNormalDiag(
     Raises:
       ValueError: if at most `scale_identity_multiplier` is specified.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier]):
@@ -224,7 +224,7 @@ class MultivariateNormalDiagWithSoftplusScale(MultivariateNormalDiag):
                validate_args=False,
                allow_nan_stats=True,
                name="MultivariateNormalDiagWithSoftplusScale"):
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[scale_diag]) as name:
       super(MultivariateNormalDiagWithSoftplusScale, self).__init__(
           loc=loc,
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
index 2f6a6f198cbcfbdcbd0993d3074ddde1c389585f..1c11594df3ad2612dd8746bb8785d86390b69937 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
@@ -215,7 +215,7 @@ class MultivariateNormalDiagPlusLowRank(
     Raises:
       ValueError: if at most `scale_identity_multiplier` is specified.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     def _convert_to_tensor(x, name):
       return None if x is None else ops.convert_to_tensor(x, name=name)
     with ops.name_scope(name) as name:
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
index 5d06a396fe7a3b87cabb9c3081da45246854089f..47d7d13cf357f1ac657641420602c92eefdad197 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
@@ -24,6 +24,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
@@ -155,7 +156,7 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
     Raises:
       ValueError: if neither `loc` nor `covariance_matrix` are specified.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
 
     # Convert the covariance_matrix up to a scale_tril and call MVNTriL.
     with ops.name_scope(name) as name:
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
index 44c92312c7dc758500051f89923ec9fafe850c0e..79916fef8d7b752649dcc673a84ea45ccf460905 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
@@ -170,7 +170,7 @@ class MultivariateNormalLinearOperator(
       ValueError: if `scale` is unspecified.
       TypeError: if not `scale.dtype.is_floating`
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     if scale is None:
       raise ValueError("Missing required `scale` parameter.")
     if not scale.dtype.is_floating:
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_tril.py b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
index d6f8b731cbeed5fed3b43365e7c668d0434a267e..d6b0ed994ec0a62e9b7684e7478130052a1fd300 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_tril.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
@@ -179,7 +179,7 @@ class MultivariateNormalTriL(
     Raises:
       ValueError: if neither `loc` nor `scale_tril` are specified.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     def _convert_to_tensor(x, name):
       return None if x is None else ops.convert_to_tensor(x, name=name)
     if loc is None and scale_tril is None:
diff --git a/tensorflow/contrib/distributions/python/ops/negative_binomial.py b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
index eeaf9c0a5ebc1323e137ff73f82588f6907031c7..1085c56dc86c8d45bdab2e7cecedf44663e5c408 100644
--- a/tensorflow/contrib/distributions/python/ops/negative_binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
@@ -90,7 +90,7 @@ class NegativeBinomial(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
 
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits, probs, validate_args=validate_args, name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
index 305b138fdc2318523ee078195213caf865d96b4d..a4b9f3b78d4fdcc328bac84623114b921b9ded49 100644
--- a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
@@ -115,7 +115,7 @@ class OneHotCategorical(distribution.Distribution):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           name=name, logits=logits, probs=probs, validate_args=validate_args,
diff --git a/tensorflow/contrib/distributions/python/ops/poisson.py b/tensorflow/contrib/distributions/python/ops/poisson.py
index a84aad6fc9372395ac021fa3aa006ddf9272e6a9..b34539402102b8f289d4eb289fcb82f4030f4e8c 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson.py
@@ -93,7 +93,7 @@ class Poisson(distribution.Distribution):
       TypeError: if `rate` is not a float-type.
       TypeError: if `log_rate` is not a float-type.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[rate]) as name:
       if (rate is None) == (log_rate is None):
         raise ValueError("Must specify exactly one of `rate` and `log_rate`.")
diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
index 19c99dcee92978e938a73af9be445cd098e5fe90..fe72091d7d759e54c51eb666f2ceacc8371e55fd 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
@@ -255,7 +255,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
       TypeError: if `quadrature_grid` and `quadrature_probs` have different base
         `dtype`.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[loc, scale]) as name:
       if loc is not None:
         loc = ops.convert_to_tensor(loc, name="loc")
diff --git a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
index eb94760ad71f5babaedaafd3f7990b40aaad85c2..584d2c385fced95ec496bb8dae9556e5c376b66d 100644
--- a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
@@ -263,7 +263,7 @@ class QuantizedDistribution(distributions.Distribution):
           `Distribution` or continuous.
       NotImplementedError:  If the base distribution does not implement `cdf`.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     values = (
         list(distribution.parameters.values()) +
         [low, high])
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index 84c8d29072c2f1f3888329638c4695bccf70eab7..0362996e684fb34b15cd98a2fc40df58087fbe95 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -165,7 +165,7 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
     Raises:
       ValueError: If both `probs` and `logits` are passed, or if neither.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[logits, probs, temperature]) as name:
       with ops.control_dependencies([check_ops.assert_positive(temperature)]
                                     if validate_args else []):
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index 325f41e37c928ba8e81e45e63a7f7f8126bc80f8..910c430ae7f026a3ac9ce50d1d5936d4454cba41 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -162,7 +162,7 @@ class ExpRelaxedOneHotCategorical(distribution.Distribution):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[logits, probs, temperature]) as name:
 
       self._logits, self._probs = distribution_util.get_logits_and_probs(
diff --git a/tensorflow/contrib/distributions/python/ops/seed_stream.py b/tensorflow/contrib/distributions/python/ops/seed_stream.py
index 056d349688511e19a4fa3d58a5b3c1c8355671a3..cf505ac627b62ae0a3d1ec1ce2a237c3c2ff1b74 100644
--- a/tensorflow/contrib/distributions/python/ops/seed_stream.py
+++ b/tensorflow/contrib/distributions/python/ops/seed_stream.py
@@ -169,7 +169,7 @@ class SeedStream(object):
         and TensorFlow Probability code base.  See class docstring for
         rationale.
     """
-    self._seed = seed
+    self._seed = seed.original_seed if isinstance(seed, SeedStream) else seed
     self._salt = salt
     self._counter = 0
 
diff --git a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
index 03828fa61277eeaf7ce90de8023b4ed91f6cc4dc..f04dc8da39140240edbe4efb75de30e321436d55 100644
--- a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
@@ -132,7 +132,7 @@ class SinhArcsinh(transformed_distribution.TransformedDistribution):
         if one or more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
 
     with ops.name_scope(name,
                         values=[loc, scale, skewness, tailweight]) as name:
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index af6ff8162b173015dca2d568e13d63127af7853a..cd6d7499595d88d18de339371d4a07fe780662d9 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -395,7 +395,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       ValueError: if `not distribution.is_scalar_batch`.
       ValueError: if `not distribution.is_scalar_event`.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[mix_loc, temperature]) as name:
       if not scale or len(scale) < 2:
         raise ValueError("Must specify list (or list-like object) of scale "
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
index e265b5d0f7c10b2782a1a8924babdca9b986f622..3465d66b30501e7aebd9904d2ae2206d628c10b7 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
@@ -175,7 +175,7 @@ class VectorExponentialDiag(
     Raises:
       ValueError: if at most `scale_identity_multiplier` is specified.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier]):
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
index 89136d6760bb663b5ff86a77c5945ce900f072b9..2c31b019845d7e4558eb3047af84732a2ae03986 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
@@ -175,7 +175,7 @@ class VectorExponentialLinearOperator(
       ValueError: if `scale` is unspecified.
       TypeError: if not `scale.dtype.is_floating`
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     if scale is None:
       raise ValueError("Missing required `scale` parameter.")
     if not scale.dtype.is_floating:
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
index 8dd983b750d9b39775e570800006011f4968f7f3..6a36018d6f1b83955ef9080ec11c74c08a670075 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
@@ -210,7 +210,7 @@ class VectorLaplaceDiag(
     Raises:
       ValueError: if at most `scale_identity_multiplier` is specified.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name):
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier]):
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
index ec485c95c15da2794b67d2699d2bdd9db97bb6c4..97e5c76d800acd800e34a9e66a3c5fdd7ce4f660 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
@@ -191,7 +191,7 @@ class VectorLaplaceLinearOperator(
       ValueError: if `scale` is unspecified.
       TypeError: if not `scale.dtype.is_floating`
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     if scale is None:
       raise ValueError("Missing required `scale` parameter.")
     if not scale.dtype.is_floating:
diff --git a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
index 1438ede26500bca4541fa9b2020ff22d4c071098..ff5ca4525700aedc88d75e391bf0c2415c2afa13 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
@@ -163,7 +163,7 @@ class VectorSinhArcsinhDiag(transformed_distribution.TransformedDistribution):
     Raises:
       ValueError: if at most `scale_identity_multiplier` is specified.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
 
     with ops.name_scope(
         name,
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 7e78ded9df07564126b46b6beeeccf95bf1eef94..4742f7521816d4643354017495f3380c78ac7bc2 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -175,7 +175,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
         if one or more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     graph_parents = [df, loc, scale_identity_multiplier, scale_diag,
                      scale_tril, scale_perturb_factor, scale_perturb_diag]
     with ops.name_scope(name) as name:
diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py
index 91453fed5d279178a0e062b71dad3b0f957b11b4..f555867e7f3c2a6bc797e9b3d56da2fa434aba6f 100644
--- a/tensorflow/contrib/distributions/python/ops/wishart.py
+++ b/tensorflow/contrib/distributions/python/ops/wishart.py
@@ -107,7 +107,7 @@ class _WishartLinearOperator(distribution.Distribution):
       ValueError: if df < k, where scale operator event shape is
         `(k, k)`
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     self._cholesky_input_output_matrices = cholesky_input_output_matrices
     with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[df, scale_operator]):
@@ -530,7 +530,7 @@ class WishartCholesky(_WishartLinearOperator):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[scale]) as name:
       with ops.name_scope("init", values=[scale]):
         scale = ops.convert_to_tensor(scale)
@@ -646,7 +646,7 @@ class WishartFull(_WishartLinearOperator):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[scale]):
         scale = ops.convert_to_tensor(scale)
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 99abbae03fc14f241dae27f317902f7335819037..0cc764d2208c5b061b7b836bdf57a035f52c6fcf 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -120,7 +120,6 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:checkpointable",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
@@ -131,6 +130,7 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
+        "//tensorflow/python/training/checkpointable:base",
     ],
 )
 
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 0783d1b5d70e502e6edd80b59f37fdd93b413e12..d7909dd5a2691a015a6afed2caa475b39ca7ebc3 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -31,7 +31,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training import checkpointable
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saver import BaseSaverBuilder
 
 _uid_counter = 0
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index 7b123707cc3a26073088cf2c57c6211e831c19fd..68bec9aee894edd60a025ac1cf87ca3e010db842 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -37,7 +37,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
-from tensorflow.python.training import checkpointable_utils
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 
 class IteratorTest(test.TestCase):
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist.py b/tensorflow/contrib/eager/python/examples/gan/mnist.py
index b80c90902353709b7f739585291ec3b5890c27c7..cc9cf53410f641cc3303b4450e9eaa1301904a64 100644
--- a/tensorflow/contrib/eager/python/examples/gan/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist.py
@@ -227,7 +227,7 @@ def train_one_epoch(generator, discriminator, generator_optimizer,
           maxval=1.,
           seed=batch_index)
 
-      with tfe.GradientTape(persistent=True) as g:
+      with tf.GradientTape(persistent=True) as g:
         generated_images = generator(noise)
         tf.contrib.summary.image(
             'generated_images',
@@ -306,7 +306,7 @@ def main(_):
 
 
 if __name__ == '__main__':
-  tfe.enable_eager_execution()
+  tf.enable_eager_execution()
 
   parser = argparse.ArgumentParser()
   parser.add_argument(
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist_test.py b/tensorflow/contrib/eager/python/examples/gan/mnist_test.py
index bd35e50c1f434d167c5a8c5aa7d224912523ce28..81ac05e26d23c2fc53f63d64bb28bdea6072e396 100644
--- a/tensorflow/contrib/eager/python/examples/gan/mnist_test.py
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist_test.py
@@ -111,5 +111,5 @@ class MnistEagerGanBenchmark(tf.test.Benchmark):
 
 
 if __name__ == '__main__':
-  tfe.enable_eager_execution()
+  tf.enable_eager_execution()
   tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
index 4e1380afb2e6e722de65c691d4fbf44621072e87..2259c20741ab689dbe0d08d32ff05fc7f8a2100d 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
@@ -119,7 +119,7 @@ def synthetic_dataset_helper(w, b, num_features, noise_level, batch_size,
 
 
 def main(_):
-  tfe.enable_eager_execution()
+  tf.enable_eager_execution()
   # Ground-truth constants.
   true_w = [[-2.0], [4.0], [1.0]]
   true_b = [0.5]
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py
index e53234b51a7dccc11e548ac81a7ef070c628aa52..2bc2fc2aa9150a3181db612439d0c37c8e76d1e3 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py
@@ -117,5 +117,5 @@ class EagerLinearRegressionBenchmark(tf.test.Benchmark):
 
 
 if __name__ == "__main__":
-  tfe.enable_eager_execution()
+  tf.enable_eager_execution()
   tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index 8517a3bf7b6aebf4ecd2f148d2160cfea1b1b9c0..2d51cfdeee3f0b45514af0895366417158b01614 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -36,9 +36,7 @@ def device_and_data_format():
                                                               'channels_last')
 
 
-def random_batch(batch_size, device_and_format=None):
-  _, data_format = device_and_format or device_and_data_format()
-
+def random_batch(batch_size, data_format):
   shape = (3, 224, 224) if data_format == 'channels_first' else (224, 224, 3)
   shape = (batch_size,) + shape
 
@@ -53,7 +51,7 @@ def random_batch(batch_size, device_and_format=None):
 
 def train_one_step(model, images, labels, optimizer):
 
-  with tfe.GradientTape() as tape:
+  with tf.GradientTape() as tape:
     logits = model(images, training=True)
     loss = tf.losses.softmax_cross_entropy(
         logits=logits, onehot_labels=labels)
@@ -70,7 +68,7 @@ class ResNet50Test(tf.test.TestCase):
     if defun:
       model.call = tfe.defun(model.call)
     with tf.device(device), tfe.execution_mode(execution_mode):
-      images, _ = random_batch(2)
+      images, _ = random_batch(2, data_format)
       output = model(images, training=False)
       tfe.async_wait()
     self.assertEqual((2, 1000), output.shape)
@@ -91,7 +89,7 @@ class ResNet50Test(tf.test.TestCase):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format, include_top=False)
     with tf.device(device):
-      images, _ = random_batch(2)
+      images, _ = random_batch(2, data_format)
       output = model(images, training=False)
     output_shape = ((2, 2048, 1, 1)
                     if data_format == 'channels_first' else (2, 1, 1, 2048))
@@ -101,7 +99,7 @@ class ResNet50Test(tf.test.TestCase):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format, include_top=False, pooling='avg')
     with tf.device(device):
-      images, _ = random_batch(2)
+      images, _ = random_batch(2, data_format)
       output = model(images, training=False)
     self.assertEqual((2, 2048), output.shape)
 
@@ -115,7 +113,7 @@ class ResNet50Test(tf.test.TestCase):
         name='t0').as_default(), tf.contrib.summary.always_record_summaries():
       with tf.device(device), tfe.execution_mode(execution_mode):
         optimizer = tf.train.GradientDescentOptimizer(0.1)
-        images, labels = random_batch(2)
+        images, labels = random_batch(2, data_format)
         train_one_step(model, images, labels, optimizer)
         self.assertEqual(320, len(model.variables))
         tfe.async_wait()
@@ -134,7 +132,7 @@ class ResNet50Test(tf.test.TestCase):
     model = resnet50.ResNet50(data_format)
     optimizer = tf.train.GradientDescentOptimizer(0.1)
     with tf.device(device):
-      images, labels = random_batch(2)
+      images, labels = random_batch(2, data_format)
       gc.disable()
       # Warm up. Note that this first run does create significant amounts of
       # garbage to be collected. The hope is that this is a build-only effect,
@@ -202,18 +200,18 @@ class ResNet50Benchmarks(tf.test.Benchmark):
     # which forces a sync. This is a roundabout way, yes.
     tf.constant(1.).cpu()
 
-  def _benchmark_eager_apply(self, label, defun=False, execution_mode=None,
-                             device_and_format=None):
+  def _benchmark_eager_apply(self, label, device_and_format, defun=False,
+                             execution_mode=None, compiled=False):
     with tfe.execution_mode(execution_mode):
-      device, data_format = device_and_format or device_and_data_format()
+      device, data_format = device_and_format
       model = resnet50.ResNet50(data_format)
       if defun:
-        model.call = tfe.defun(model.call)
+        model.call = tfe.defun(model.call, compiled=compiled)
       batch_size = 64
       num_burn = 5
       num_iters = 30
       with tf.device(device):
-        images, _ = random_batch(batch_size, device_and_format)
+        images, _ = random_batch(batch_size, data_format)
         for _ in xrange(num_burn):
           model(images, training=False).cpu()
         if execution_mode:
@@ -227,30 +225,34 @@ class ResNet50Benchmarks(tf.test.Benchmark):
         self._report(label, start, num_iters, device, batch_size, data_format)
 
   def benchmark_eager_apply_sync(self):
-    self._benchmark_eager_apply('eager_apply', defun=False)
+    self._benchmark_eager_apply('eager_apply', device_and_data_format(),
+                                defun=False)
 
   def benchmark_eager_apply_async(self):
     self._benchmark_eager_apply(
-        'eager_apply_async', defun=False, execution_mode=tfe.ASYNC)
+        'eager_apply_async', device_and_data_format(), defun=False,
+        execution_mode=tfe.ASYNC)
 
   def benchmark_eager_apply_with_defun(self):
-    self._benchmark_eager_apply('eager_apply_with_defun', defun=True)
+    self._benchmark_eager_apply('eager_apply_with_defun',
+                                device_and_data_format(), defun=True)
 
   def _benchmark_eager_train(self,
                              label,
                              make_iterator,
+                             device_and_format,
                              defun=False,
                              execution_mode=None,
-                             device_and_format=None):
+                             compiled=False):
     with tfe.execution_mode(execution_mode):
-      device, data_format = device_and_format or device_and_data_format()
+      device, data_format = device_and_format
       for batch_size in self._train_batch_sizes():
-        (images, labels) = random_batch(batch_size, device_and_format)
+        (images, labels) = random_batch(batch_size, data_format)
         num_burn = 3
         num_iters = 10
         model = resnet50.ResNet50(data_format)
         if defun:
-          model.call = tfe.defun(model.call)
+          model.call = tfe.defun(model.call, compiled=compiled)
         optimizer = tf.train.GradientDescentOptimizer(0.1)
 
         with tf.device(device):
@@ -273,18 +275,21 @@ class ResNet50Benchmarks(tf.test.Benchmark):
           self._report(label, start, num_iters, device, batch_size, data_format)
 
   def benchmark_eager_train_sync(self):
-    self._benchmark_eager_train('eager_train', MockIterator, defun=False)
+    self._benchmark_eager_train('eager_train', MockIterator,
+                                device_and_data_format(), defun=False)
 
   def benchmark_eager_train_async(self):
     self._benchmark_eager_train(
         'eager_train_async',
         MockIterator,
+        device_and_data_format(),
         defun=False,
         execution_mode=tfe.ASYNC)
 
   def benchmark_eager_train_with_defun(self):
     self._benchmark_eager_train(
-        'eager_train_with_defun', MockIterator, defun=True)
+        'eager_train_with_defun', MockIterator,
+        device_and_data_format(), defun=True)
 
   def benchmark_eager_train_datasets(self):
 
@@ -294,7 +299,8 @@ class ResNet50Benchmarks(tf.test.Benchmark):
       return tfe.Iterator(ds)
 
     self._benchmark_eager_train(
-        'eager_train_dataset', make_iterator, defun=False)
+        'eager_train_dataset', make_iterator,
+        device_and_data_format(), defun=False)
 
   def benchmark_eager_train_datasets_with_defun(self):
 
@@ -304,7 +310,8 @@ class ResNet50Benchmarks(tf.test.Benchmark):
       return tfe.Iterator(ds)
 
     self._benchmark_eager_train(
-        'eager_train_dataset_with_defun', make_iterator, defun=True)
+        'eager_train_dataset_with_defun', make_iterator,
+        device_and_data_format(), defun=True)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot_test.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot_test.py
index 75b342ba78bd5de5c2827296f6fba01ffa86d560..b7d8395e277b526ba40ccafa323ba453a8667b62 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot_test.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot_test.py
@@ -67,5 +67,5 @@ class RNNColorbotTest(tf.test.TestCase):
 
 
 if __name__ == "__main__":
-  tfe.enable_eager_execution()
+  tf.enable_eager_execution()
   tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index be5d60449d7e08c99cc28e76befce56f468c77fd..74701b2f4f7448c5f6c2c1bd7c67a8ee112f4115 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -304,7 +304,7 @@ def test_model(use_cudnn_rnn):
 
 
 def main(_):
-  tfe.enable_eager_execution()
+  tf.enable_eager_execution()
 
   if not FLAGS.data_path:
     raise ValueError("Must specify --data-path")
diff --git a/tensorflow/contrib/eager/python/examples/scan/BUILD b/tensorflow/contrib/eager/python/examples/scan/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..638c57d1c92c1dce0ef9e73e9a6ac2369358080b
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/scan/BUILD
@@ -0,0 +1,25 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+cuda_py_test(
+    name = "scan_test",
+    size = "small",
+    srcs = ["scan_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+cuda_py_test(
+    name = "scan_graph_test",
+    size = "small",
+    srcs = ["scan_graph_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
+    ],
+)
diff --git a/tensorflow/contrib/eager/python/examples/scan/scan_graph_test.py b/tensorflow/contrib/eager/python/examples/scan/scan_graph_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4b8c8941ec411912f3089315d038fc4bcd049ae
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/scan/scan_graph_test.py
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit test for tf.scan under graph mode execution."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+import tensorflow as tf
+
+
+class ScanBenchmark(tf.test.Benchmark):
+
+  def runScan(self, n):
+    elems = np.arange(n)
+    start_time = time.time()
+    sum_op = tf.scan(lambda a, x: a + x, elems, parallel_iterations=1)
+    with tf.Session() as sess:
+      sess.run(sum_op)
+    wall_time = time.time() - start_time
+
+    self.report_benchmark(
+        name='scan',
+        iters=n,
+        wall_time=wall_time)
+
+  def benchmarkScan16000(self):
+    self.runScan(16000)
+
+  def benchmarkScan32000(self):
+    self.runScan(32000)
+
+  def benchmarkScan64000(self):
+    self.runScan(64000)
+
+  def benchmarkScan128000(self):
+    self.runScan(128000)
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/python/keras/applications/densenet/__init__.py b/tensorflow/contrib/eager/python/examples/scan/scan_test.py
similarity index 52%
rename from tensorflow/python/keras/applications/densenet/__init__.py
rename to tensorflow/contrib/eager/python/examples/scan/scan_test.py
index 6b8ea83920733a3a442171616ab460ffaf831521..a02fc24c79dae6c2565db8b138b1d7391d169ed8 100644
--- a/tensorflow/python/keras/applications/densenet/__init__.py
+++ b/tensorflow/contrib/eager/python/examples/scan/scan_test.py
@@ -12,18 +12,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""DenseNet Keras applications."""
-
+"""Unit test for tf.scan under eager execution."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.applications.densenet import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet121
-from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet169
-from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet201
-from tensorflow.python.keras._impl.keras.applications.densenet import preprocess_input
+import time
+
+import numpy as np
+import tensorflow as tf
+
+
+class ScanBenchmark(tf.test.Benchmark):
+
+  def runScan(self, n):
+    elems = np.arange(n)
+    start_time = time.time()
+    _ = tf.scan(lambda a, x: a + x, elems, parallel_iterations=1)
+    wall_time = time.time() - start_time
+
+    self.report_benchmark(
+        name='scan',
+        iters=n,
+        wall_time=wall_time)
+
+  def benchmarkScan16000(self):
+    self.runScan(16000)
+
+  def benchmarkScan32000(self):
+    self.runScan(32000)
+
+  def benchmarkScan64000(self):
+    self.runScan(64000)
+
+  def benchmarkScan128000(self):
+    self.runScan(128000)
+
 
-del absolute_import
-del division
-del print_function
+if __name__ == '__main__':
+  tf.enable_eager_execution()
+  tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index 1e4746d01ca1a8d13162844bc064c479c7184237..8ac553e0ae71382966d03d9ef4429adf5137b369 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -36,8 +36,8 @@ from third_party.examples.eager.spinn import spinn
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
-from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 # pylint: enable=g-bad-import-order
 
 
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 907f9204c2d31a652ca2a0539a23db4722b4e154..1ae6415d5ecb03ef97cdf734c808e3f728dafcb0 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -30,7 +30,7 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import checkpointable
+from tensorflow.python.training.checkpointable import base as checkpointable
 
 _to_replace = re.compile("[^A-Za-z0-9.]")
 
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index f0fe4ce8c53bb80c03a3f0de37078bcdb975a0b4..98a98a8d358d8a7d7a06505ed1a7d4c0ff1e18f4 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -30,8 +30,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import summary_ops_v2 as summary_ops
-from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import training_util
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 
 class MetricsTest(test.TestCase):
@@ -146,8 +146,6 @@ class MetricsTest(test.TestCase):
     self.assertAllEqual(2.0, m2.result())
 
   def testNamesWithSpaces(self):
-    # Verify two metrics with the same class and name don't
-    # accidentally share state.
     m1 = metrics.Mean("has space")
     m1(0)
     self.assertEqual(m1.name, "has space")
@@ -186,8 +184,8 @@ class MetricsTest(test.TestCase):
     self.assertEqual(self.evaluate(value), 2.5)
 
   def testTwoMeansGraph(self):
-    # Verify two metrics with the same class and name don't
-    # accidentally share state.
+    # Verify two metrics with the same name in the same graph raises a
+    # ValueError.
     with context.graph_mode():
       m1 = metrics.Mean()
       m1(0)
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index 44828bea50c660815e457f21a1990cd706c40876..f801d9a47b2f831a48d9b6335c69612c1356d800 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -23,9 +23,8 @@ import os
 import weakref
 
 from tensorflow.python.eager import context
-from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import ops
-from tensorflow.python.keras._impl.keras.engine import base_layer as keras_base_layer
+from tensorflow.python.keras.engine import base_layer as keras_base_layer
 from tensorflow.python.layers import base
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
@@ -33,6 +32,7 @@ from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import function_utils
 
 # pylint: disable=protected-access
 # Explanation for protected-access disable: Network has lots of same-class and
@@ -545,10 +545,10 @@ class Sequential(Network):
 
   def add(self, layer_func):
     if isinstance(layer_func, base.Layer):
-      args = estimator_util.fn_args(layer_func.call)
+      args = function_utils.fn_args(layer_func.call)
       self.track_layer(layer_func)
     elif callable(layer_func):
-      args = estimator_util.fn_args(layer_func)
+      args = function_utils.fn_args(layer_func)
     else:
       raise TypeError(
           "Sequential.add() takes only tf.layers.Layer objects or callables; "
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index 6a51d03de52914d2ad0ac3ad05d1ba01d856ad9a..c92bd15b253b67a3301cd562046a4467e1bf877d 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -30,8 +30,8 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import training_util
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 
 # pylint: disable=not-callable
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index 79dd117854e5fe9f066f671d8ce62e08579e0ed9..5826700c73e255198e9a6974ca240ba55e438a26 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -120,9 +120,9 @@ from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Vari
 from tensorflow.python.ops.variable_scope import EagerVariableStore
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import template
-from tensorflow.python.training.checkpointable import Checkpointable
-from tensorflow.python.training.checkpointable_utils import CheckpointableSaver
-from tensorflow.python.training.checkpointable_utils import Checkpoint
+from tensorflow.python.training.checkpointable.base import Checkpointable
+from tensorflow.python.training.checkpointable.util import CheckpointableSaver
+from tensorflow.python.training.checkpointable.util import Checkpoint
 from tensorflow.python.util.all_util import remove_undocumented
 
 py_func = script_ops.eager_py_func
diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py
index e80ccbb74d8623e977a98cb7fa5eb41f3c9bf250..db50b33af2e4f1cc6575d4b0d416d6d2669b5c35 100644
--- a/tensorflow/contrib/eager/python/tfe_test.py
+++ b/tensorflow/contrib/eager/python/tfe_test.py
@@ -57,7 +57,7 @@ class TFETest(test_util.TensorFlowTestCase):
       return math_ops.multiply(x, x)
 
     grad = tfe.gradients_function(square)
-    self.assertEquals([6], [x.numpy() for x in grad(3)])
+    self.assertEquals([6], [x.numpy() for x in grad(3.)])
 
   def testGradOfGrad(self):
 
@@ -66,7 +66,7 @@ class TFETest(test_util.TensorFlowTestCase):
 
     grad = tfe.gradients_function(square)
     gradgrad = tfe.gradients_function(lambda x: grad(x)[0])
-    self.assertEquals([2], [x.numpy() for x in gradgrad(3)])
+    self.assertEquals([2], [x.numpy() for x in gradgrad(3.)])
 
   def testCustomGrad(self):
 
@@ -80,7 +80,7 @@ class TFETest(test_util.TensorFlowTestCase):
       return y, grad_fn
 
     grad = tfe.gradients_function(f)
-    self.assertEquals([12], [x.numpy() for x in grad(3)])
+    self.assertEquals([12], [x.numpy() for x in grad(3.)])
 
   def testGPU(self):
     if tfe.num_gpus() <= 0:
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 571e2e3a5df08e09172ec7b6885c5c972b5abfb6..d5d2abf8c4c82374842ed2e10a849765a6dddd3b 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -14,11 +14,14 @@ py_library(
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":baseline",
         ":boosted_trees",
         ":dnn",
         ":dnn_linear_combined",
+        ":export",
         ":extenders",
         ":head",
+        ":hooks",
         ":linear",
         ":logit_fns",
         ":multi_head",
@@ -28,6 +31,49 @@ py_library(
     ],
 )
 
+py_library(
+    name = "baseline",
+    srcs = ["python/estimator/baseline.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:baseline",
+    ],
+)
+
+py_test(
+    name = "baseline_test",
+    size = "small",
+    srcs = ["python/estimator/baseline_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
+    deps = [
+        ":baseline",
+        ":head",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:session",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:export_export",
+        "//tensorflow/python/estimator:metric_keys",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "boosted_trees",
     srcs = ["python/estimator/boosted_trees.py"],
@@ -180,6 +226,43 @@ py_test(
     ],
 )
 
+py_library(
+    name = "export",
+    srcs = [
+        "python/estimator/export.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/estimator:model_fn",
+    ],
+)
+
+py_test(
+    name = "export_test",
+    size = "medium",
+    srcs = ["python/estimator/export_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],  # b/62863147
+    deps = [
+        ":export",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:export_export",
+        "//tensorflow/python/estimator:export_output",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:tag_constants",
+    ],
+)
+
 py_library(
     name = "head",
     srcs = [
@@ -239,6 +322,36 @@ py_test(
     ],
 )
 
+py_library(
+    name = "hooks",
+    srcs = [
+        "python/estimator/hooks.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_test(
+    name = "hooks_test",
+    size = "medium",
+    srcs = ["python/estimator/hooks_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":hooks",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/estimator:estimator_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "linear",
     srcs = ["python/estimator/linear.py"],
@@ -284,9 +397,9 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
         "//tensorflow/python/estimator:dnn",
         "//tensorflow/python/estimator:linear",
-        "//tensorflow/python/estimator:util",
     ],
 )
 
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index d43b3ea6bf2718ab7f5317b95fdd93ec9917dc69..788ac5ca7046d6dd30a3d5520b243944532622fa 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -19,11 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
+from tensorflow.contrib.estimator.python.estimator.baseline import *
 from tensorflow.contrib.estimator.python.estimator.boosted_trees import *
 from tensorflow.contrib.estimator.python.estimator.dnn import *
 from tensorflow.contrib.estimator.python.estimator.dnn_linear_combined import *
+from tensorflow.contrib.estimator.python.estimator.export import *
 from tensorflow.contrib.estimator.python.estimator.extenders import *
 from tensorflow.contrib.estimator.python.estimator.head import *
+from tensorflow.contrib.estimator.python.estimator.hooks import *
 from tensorflow.contrib.estimator.python.estimator.linear import *
 from tensorflow.contrib.estimator.python.estimator.logit_fns import *
 from tensorflow.contrib.estimator.python.estimator.multi_head import *
@@ -38,12 +41,14 @@ _allowed_symbols = [
     'binary_classification_head',
     'clip_gradients_by_norm',
     'forward_features',
+    'InMemoryEvaluatorHook',
     'logistic_regression_head',
     'multi_class_head',
     'multi_head',
     'multi_label_head',
     'poisson_regression_head',
     'regression_head',
+    'BaselineEstimator',
     'DNNEstimator',
     'DNNLinearCombinedEstimator',
     'LinearEstimator',
@@ -56,6 +61,8 @@ _allowed_symbols = [
     'TowerOptimizer',
     'RNNClassifier',
     'RNNEstimator',
+    'export_saved_model_for_mode',
+    'export_all_saved_models',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/estimator/python/estimator/baseline.py b/tensorflow/contrib/estimator/python/estimator/baseline.py
new file mode 100644
index 0000000000000000000000000000000000000000..beffbee73064b9ef425b115317c43e29477b19af
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/baseline.py
@@ -0,0 +1,98 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Baseline estimators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import baseline
+
+
+class BaselineEstimator(estimator.Estimator):
+  """An estimator that can establish a simple baseline.
+
+  The estimator uses a user-specified head.
+
+  This estimator ignores feature values and will learn to predict the average
+  value of each label. E.g. for single-label classification problems, this will
+  predict the probability distribution of the classes as seen in the labels.
+  For multi-label classification problems, it will predict the ratio of examples
+  that contain each class.
+
+  Example:
+
+  ```python
+
+  # Build baseline multi-label classifier.
+  estimator = BaselineEstimator(
+      head=tf.contrib.estimator.multi_label_head(n_classes=3))
+
+  # Input builders
+  def input_fn_train: # returns x, y (where y represents label's class index).
+    pass
+
+  def input_fn_eval: # returns x, y (where y represents label's class index).
+    pass
+
+  # Fit model.
+  estimator.train(input_fn=input_fn_train)
+
+  # Evaluates cross entropy between the test and train labels.
+  loss = classifier.evaluate(input_fn=input_fn_eval)["loss"]
+
+  # For each class, predicts the ratio of training examples that contain the
+  # class.
+  predictions = classifier.predict(new_samples)
+
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+    otherwise there will be a `KeyError`:
+
+  * if `weight_column` passed to the `head` constructor is not `None`, a feature
+    with `key=weight_column` whose value is a `Tensor`.
+  """
+
+  def __init__(self,
+               head,
+               model_dir=None,
+               optimizer='Ftrl',
+               config=None):
+    """Initializes a BaselineEstimator instance.
+
+    Args:
+      head: A `_Head` instance constructed with a method such as
+        `tf.contrib.estimator.multi_label_head`.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
+      optimizer: String, `tf.Optimizer` object, or callable that creates the
+        optimizer to use for training. If not specified, will use
+        `FtrlOptimizer` with a default learning rate of 0.3.
+      config: `RunConfig` object to configure the runtime settings.
+    """
+    def _model_fn(features, labels, mode, config):
+      return baseline._baseline_model_fn(  # pylint: disable=protected-access
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          optimizer=optimizer,
+          config=config)
+    super(BaselineEstimator, self).__init__(
+        model_fn=_model_fn,
+        model_dir=model_dir,
+        config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/baseline_test.py b/tensorflow/contrib/estimator/python/estimator/baseline_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0e3e670f7332811c1bfdaea65b0308ce59ade59
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/baseline_test.py
@@ -0,0 +1,430 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for baseline.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.contrib.estimator.python.estimator import baseline
+from tensorflow.contrib.estimator.python.estimator import head as head_lib
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import saver
+
+# Names of variables created by model.
+BIAS_NAME = 'baseline/bias'
+
+
+def assert_close(expected, actual, rtol=1e-04, name='assert_close'):
+  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
+    expected = ops.convert_to_tensor(expected, name='expected')
+    actual = ops.convert_to_tensor(actual, name='actual')
+    rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected)
+    rtol = ops.convert_to_tensor(rtol, name='rtol')
+    return check_ops.assert_less(
+        rdiff,
+        rtol,
+        data=('Condition expected =~ actual did not hold element-wise:'
+              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
+              'rtol = ', rtol,),
+        name=scope)
+
+
+def save_variables_to_ckpt(model_dir):
+  init_all_op = [variables.global_variables_initializer()]
+  with tf_session.Session() as sess:
+    sess.run(init_all_op)
+    saver.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
+
+
+def _baseline_estimator_fn(
+    weight_column=None, label_dimension=1, *args, **kwargs):
+  """Returns a BaselineEstimator that uses regression_head."""
+  return baseline.BaselineEstimator(
+      head=head_lib.regression_head(
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
+      *args, **kwargs)
+
+
+class BaselineEstimatorEvaluationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def test_evaluation_batch(self):
+    """Tests evaluation for batch_size==2."""
+    with ops.Graph().as_default():
+      variables.Variable([13.0], name=BIAS_NAME)
+      variables.Variable(
+          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    baseline_estimator = _baseline_estimator_fn(model_dir=self._model_dir)
+    eval_metrics = baseline_estimator.evaluate(
+        input_fn=lambda: ({'age': ((1,), (1,))}, ((10.,), (10.,))), steps=1)
+
+    # Logit is bias = 13, while label is 10.
+    # Loss per example is 3**2 = 9.
+    # Training loss is the sum over batch = 9 + 9 = 18
+    # Average loss is the average over batch = 9
+    self.assertDictEqual({
+        metric_keys.MetricKeys.LOSS: 18.,
+        metric_keys.MetricKeys.LOSS_MEAN: 9.,
+        ops.GraphKeys.GLOBAL_STEP: 100
+    }, eval_metrics)
+
+  def test_evaluation_weights(self):
+    """Tests evaluation with weights."""
+    with ops.Graph().as_default():
+      variables.Variable([13.0], name=BIAS_NAME)
+      variables.Variable(
+          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    def _input_fn():
+      features = {'age': ((1,), (1,)), 'weights': ((1.,), (2.,))}
+      labels = ((10.,), (10.,))
+      return features, labels
+
+    baseline_estimator = _baseline_estimator_fn(
+        weight_column='weights',
+        model_dir=self._model_dir)
+    eval_metrics = baseline_estimator.evaluate(input_fn=_input_fn, steps=1)
+
+    # Logit is bias = 13, while label is 10.
+    # Loss per example is 3**2 = 9.
+    # Training loss is the weighted sum over batch = 9 + 2*9 = 27
+    # average loss is the weighted average = 9 + 2*9 / (1 + 2) = 9
+    self.assertDictEqual({
+        metric_keys.MetricKeys.LOSS: 27.,
+        metric_keys.MetricKeys.LOSS_MEAN: 9.,
+        ops.GraphKeys.GLOBAL_STEP: 100
+    }, eval_metrics)
+
+  def test_evaluation_for_multi_dimensions(self):
+    label_dim = 2
+    with ops.Graph().as_default():
+      variables.Variable([46.0, 58.0], name=BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    baseline_estimator = _baseline_estimator_fn(
+        label_dimension=label_dim,
+        model_dir=self._model_dir)
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'age': np.array([[2., 4., 5.]]),
+        },
+        y=np.array([[46., 58.]]),
+        batch_size=1,
+        num_epochs=None,
+        shuffle=False)
+    eval_metrics = baseline_estimator.evaluate(input_fn=input_fn, steps=1)
+
+    self.assertItemsEqual(
+        (metric_keys.MetricKeys.LOSS, metric_keys.MetricKeys.LOSS_MEAN,
+         ops.GraphKeys.GLOBAL_STEP), eval_metrics.keys())
+
+    # Logit is bias which is [46, 58]
+    self.assertAlmostEqual(0, eval_metrics[metric_keys.MetricKeys.LOSS])
+
+
+class BaselineEstimatorPredictTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def test_1d(self):
+    """Tests predict when all variables are one-dimensional."""
+    with ops.Graph().as_default():
+      variables.Variable([.2], name=BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    baseline_estimator = _baseline_estimator_fn(model_dir=self._model_dir)
+
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': np.array([[2.]])},
+        y=None,
+        batch_size=1,
+        num_epochs=1,
+        shuffle=False)
+    predictions = baseline_estimator.predict(input_fn=predict_input_fn)
+    predicted_scores = list([x['predictions'] for x in predictions])
+    # x * weight + bias = 2. * 10. + .2 = 20.2
+    self.assertAllClose([[.2]], predicted_scores)
+
+  def testMultiDim(self):
+    """Tests predict when all variables are multi-dimenstional."""
+    batch_size = 2
+    label_dimension = 3
+    with ops.Graph().as_default():
+      variables.Variable(  # shape=[label_dimension]
+          [.2, .4, .6], name=BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    baseline_estimator = _baseline_estimator_fn(
+        label_dimension=label_dimension,
+        model_dir=self._model_dir)
+
+    predict_input_fn = numpy_io.numpy_input_fn(
+        # x shape=[batch_size, x_dim]
+        x={'x': np.array([[1., 2., 3., 4.], [5., 6., 7., 8.]])},
+        y=None,
+        batch_size=batch_size,
+        num_epochs=1,
+        shuffle=False)
+    predictions = baseline_estimator.predict(input_fn=predict_input_fn)
+    predicted_scores = list([x['predictions'] for x in predictions])
+    # score = bias, shape=[batch_size, label_dimension]
+    self.assertAllClose([[0.2, 0.4, 0.6], [0.2, 0.4, 0.6]],
+                        predicted_scores)
+
+
+class BaselineEstimatorIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
+                          input_dimension, label_dimension, prediction_length):
+    feature_columns = [
+        feature_column_lib.numeric_column('x', shape=(input_dimension,))
+    ]
+    est = _baseline_estimator_fn(
+        label_dimension=label_dimension,
+        model_dir=self._model_dir)
+
+    # TRAIN
+    # learn y = x
+    est.train(train_input_fn, steps=200)
+
+    # EVALUTE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
+
+    # PREDICT
+    predictions = np.array(
+        [x['predictions'] for x in est.predict(predict_input_fn)])
+    self.assertAllEqual((prediction_length, label_dimension), predictions.shape)
+
+    # EXPORT
+    feature_spec = feature_column_lib.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def test_numpy_input_fn(self):
+    """Tests complete flow with numpy_input_fn."""
+    label_dimension = 2
+    input_dimension = label_dimension
+    batch_size = 10
+    prediction_length = batch_size
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        num_epochs=1,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=None,
+        batch_size=batch_size,
+        num_epochs=1,
+        shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        input_dimension=input_dimension,
+        label_dimension=label_dimension,
+        prediction_length=prediction_length)
+
+
+class BaselineEstimatorTrainingTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _mock_optimizer(self, expected_loss=None):
+    expected_var_names = [
+        '%s:0' % BIAS_NAME
+    ]
+
+    def _minimize(loss, global_step=None, var_list=None):
+      trainable_vars = var_list or ops.get_collection(
+          ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertItemsEqual(expected_var_names,
+                            [var.name for var in trainable_vars])
+
+      # Verify loss. We can't check the value directly, so we add an assert op.
+      self.assertEquals(0, loss.shape.ndims)
+      if expected_loss is None:
+        if global_step is not None:
+          return distribute_lib.increment_var(global_step)
+        return control_flow_ops.no_op()
+      assert_loss = assert_close(
+          math_ops.to_float(expected_loss, name='expected'),
+          loss,
+          name='assert_loss')
+      with ops.control_dependencies((assert_loss,)):
+        if global_step is not None:
+          return distribute_lib.increment_var(global_step)
+        return control_flow_ops.no_op()
+
+    mock_optimizer = test.mock.NonCallableMock(
+        spec=optimizer.Optimizer,
+        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
+
+    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
+    # So, return mock_optimizer itself for deepcopy.
+    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
+    return mock_optimizer
+
+  def _assert_checkpoint(self,
+                         label_dimension,
+                         expected_global_step,
+                         expected_bias=None):
+    shapes = {
+        name: shape
+        for (name, shape) in checkpoint_utils.list_variables(self._model_dir)
+    }
+
+    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
+    self.assertEqual(expected_global_step,
+                     checkpoint_utils.load_variable(self._model_dir,
+                                                    ops.GraphKeys.GLOBAL_STEP))
+
+    self.assertEqual([label_dimension], shapes[BIAS_NAME])
+    if expected_bias is not None:
+      self.assertEqual(expected_bias,
+                       checkpoint_utils.load_variable(self._model_dir,
+                                                      BIAS_NAME))
+
+  def testFromScratch(self):
+    # Create BaselineRegressor.
+    label = 5.
+    age = 17
+    # loss = (logits - label)^2 = (0 - 5.)^2 = 25.
+    mock_optimizer = self._mock_optimizer(expected_loss=25.)
+    baseline_estimator = _baseline_estimator_fn(
+        model_dir=self._model_dir,
+        optimizer=mock_optimizer)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    baseline_estimator.train(
+        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assert_checkpoint(
+        label_dimension=1,
+        expected_global_step=num_steps,
+        expected_bias=[0.])
+
+  def testFromCheckpoint(self):
+    # Create initial checkpoint.
+    bias = 7.0
+    initial_global_step = 100
+    with ops.Graph().as_default():
+      variables.Variable([bias], name=BIAS_NAME)
+      variables.Variable(
+          initial_global_step,
+          name=ops.GraphKeys.GLOBAL_STEP,
+          dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    # logits = bias = 6.
+    # loss = (logits - label)^2 = (7 - 5)^2 = 4
+    mock_optimizer = self._mock_optimizer(expected_loss=4.)
+    baseline_estimator = _baseline_estimator_fn(
+        model_dir=self._model_dir,
+        optimizer=mock_optimizer)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    baseline_estimator.train(
+        input_fn=lambda: ({'age': ((17,),)}, ((5.,),)), steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assert_checkpoint(
+        label_dimension=1,
+        expected_global_step=initial_global_step + num_steps,
+        expected_bias=[bias])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn.py b/tensorflow/contrib/estimator/python/estimator/dnn.py
index cf6e3329d2e27735d8759cc2ab3726e8c624c6ae..7ff25b95c079c7e06d29e874bcaa0d2c13e7167e 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn.py
@@ -93,7 +93,7 @@ class DNNEstimator(estimator.Estimator):
                dropout=None,
                input_layer_partitioner=None,
                config=None):
-    """Initializes a `DNNClassifier` instance.
+    """Initializes a `DNNEstimator` instance.
 
     Args:
       head: A `_Head` instance constructed with a method such as
diff --git a/tensorflow/contrib/estimator/python/estimator/export.py b/tensorflow/contrib/estimator/python/estimator/export.py
new file mode 100644
index 0000000000000000000000000000000000000000..03cf6f107c1c5589522d7be4946562a466740b0e
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/export.py
@@ -0,0 +1,223 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Wrapper for methods to export train/eval graphs from Estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.estimator import model_fn as model_fn_lib
+
+
+def export_saved_model_for_mode(
+    estimator, export_dir_base, input_receiver_fn,
+    assets_extra=None,
+    as_text=False,
+    checkpoint_path=None,
+    strip_default_attrs=False,
+    mode=model_fn_lib.ModeKeys.PREDICT):
+  # pylint: disable=line-too-long
+  """Exports a single train/eval/predict graph as a SavedModel.
+
+  For a detailed guide, see
+  @{$saved_model#using_savedmodel_with_estimators$Using SavedModel with Estimators}.
+
+  Sample usage:
+  ```python
+  classifier = tf.estimator.LinearClassifier(
+      feature_columns=[age, language])
+  classifier.train(input_fn=input_fn, steps=1000)
+
+  feature_spec = {
+      'age': tf.placeholder(dtype=tf.int64),
+      'language': array_ops.placeholder(dtype=tf.string)
+  }
+  label_spec = tf.placeholder(dtype=dtypes.int64)
+
+  train_rcvr_fn = tf.contrib.estimator.build_raw_supervised_input_receiver_fn(
+      feature_spec, label_spec)
+
+  export_dir = tf.contrib.estimator.export_saved_model_for_mode(
+      classifier,
+      export_dir_base='my_model/',
+      input_receiver_fn=train_rcvr_fn,
+      mode=model_fn_lib.ModeKeys.TRAIN)
+
+  # export_dir is a timestamped directory with the SavedModel, which
+  # can be used for serving, analysis with TFMA, or directly loaded in.
+  with ops.Graph().as_default() as graph:
+    with session.Session(graph=graph) as sess:
+      loader.load(sess, [tag_constants.TRAINING], export_dir)
+      weights = graph.get_tensor_by_name(''linear/linear_model/age/weights')
+      ...
+  ```
+
+  This method is a wrapper for _export_all_saved_models, and wraps a raw
+  input_receiver_fn in a dictionary to pass in to that function.
+  See _export_all_saved_models for full docs.
+
+  See tf.contrib.estimator.export_saved_model_for_mode for the currently
+  exposed version of this function.
+
+  Args:
+    estimator: an instance of tf.estimator.Estimator
+    export_dir_base: A string containing a directory in which to create
+      timestamped subdirectories containing exported SavedModels.
+    input_receiver_fn: a function that takes no argument and
+      returns the appropriate subclass of `InputReceiver`.
+    assets_extra: A dict specifying how to populate the assets.extra directory
+      within the exported SavedModel, or `None` if no extra assets are needed.
+    as_text: whether to write the SavedModel proto in text format.
+    checkpoint_path: The checkpoint path to export.  If `None` (the default),
+      the most recent checkpoint found within the model directory is chosen.
+    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+      removed from the NodeDefs. For a detailed guide, see
+      [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+    mode: tf.estimator.ModeKeys value indicating with mode will be exported.
+
+  Returns:
+    The string path to the exported directory.
+
+  Raises:
+    ValueError: if input_receiver_fn is None, no export_outputs
+      are provided, or no checkpoint can be found.
+  """
+  # pylint: enable=line-too-long
+
+  # pylint: disable=protected-access
+  return estimator._export_saved_model_for_mode(
+      export_dir_base, input_receiver_fn,
+      assets_extra=assets_extra,
+      as_text=as_text,
+      checkpoint_path=checkpoint_path,
+      strip_default_attrs=strip_default_attrs,
+      mode=mode)
+  # pylint: enable=protected-access
+
+
+def export_all_saved_models(
+    estimator, export_dir_base, input_receiver_fn_map,
+    assets_extra=None,
+    as_text=False,
+    checkpoint_path=None,
+    strip_default_attrs=False):
+  # pylint: disable=line-too-long
+  """Exports requested train/eval/predict graphs as separate SavedModels.
+
+  See tf.contrib.estimator.export_all_saved_models for the currently
+  exposed version of this function.
+
+  For each mode passed in via the input_receiver_fn_map,
+  this method builds a new graph by calling the input_receiver_fn to obtain
+  feature and label `Tensor`s. Next, this method calls the `Estimator`'s
+  model_fn in the passed mode to generate the model graph based on
+  those features and labels, and restores the given checkpoint
+  (or, lacking that, the most recent checkpoint) into the graph.
+  Only one of the modes is used for saving variables to the SavedModel
+  (order of preference: TRAIN, EVAL, then PREDICT), such that up to three
+  MetaGraphDefs are saved with a single set of variables in a single
+  SavedModel directory.
+
+  For prediction, the exported `MetaGraphDef` will provide one `SignatureDef`
+  for each element of the export_outputs dict returned from the model_fn,
+  named using the same keys.  One of these keys is always
+  signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, indicating which
+  signature will be served when a serving request does not specify one.
+  For each signature, the outputs are provided by the corresponding
+  `ExportOutput`s, and the inputs are always the input receivers provided by
+  the serving_input_receiver_fn.
+
+  For training and evaluation, the train_op is stored in an extra collection,
+  and loss, metrics, and predictions are included in a SignatureDef for the
+  mode in question.
+
+  Extra assets may be written into the SavedModel via the assets_extra
+  argument.  This should be a dict, where each key gives a destination path
+  (including the filename) relative to the assets.extra directory.  The
+  corresponding value gives the full path of the source file to be copied.
+  For example, the simple case of copying a single file without renaming it
+  is specified as `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
+
+  Sample usage:
+  ```python
+  classifier = tf.estimator.LinearClassifier(
+      feature_columns=[age, language])
+  classifier.train(input_fn=input_fn)
+
+  feature_spec = {
+      'age': tf.placeholder(dtype=tf.int64),
+      'language': array_ops.placeholder(dtype=tf.string)
+  }
+  label_spec = tf.placeholder(dtype=dtypes.int64)
+
+  train_rcvr_fn = tf.contrib.estimator.build_raw_supervised_input_receiver_fn(
+      feature_spec, label_spec)
+
+  serve_rcvr_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
+      feature_spec)
+
+  rcvr_fn_map = {
+      model_fn_lib.ModeKeys.TRAIN: train_rcvr_fn,
+      model_fn_lib.ModeKeys.PREDICT: serve_rcvr_fn,
+  }
+
+  export_dir = tf.contrib.estimator.export_all_saved_models(
+      classifier,
+      export_dir_base='my_model/',
+      input_receiver_fn_map=rcvr_fn_map)
+
+  # export_dirs is a dict of directories with SavedModels, which
+  # can be used for serving, analysis with TFMA, or directly loaded in.
+  with ops.Graph().as_default() as graph:
+    with session.Session(graph=graph) as sess:
+      loader.load(sess, [tag_constants.TRAINING], export_dir)
+      weights = graph.get_tensor_by_name('linear/linear_model/age/weights')
+      ...
+  ```
+
+  Args:
+    estimator: an instance of tf.estimator.Estimator
+    export_dir_base: A string containing a directory in which to create
+      timestamped subdirectories containing exported SavedModels.
+    input_receiver_fn_map: dict of tf.estimator.ModeKeys to input_receiver_fn
+      mappings, where the input_receiver_fn is a function that takes no
+      argument and returns the appropriate subclass of `InputReceiver`.
+    assets_extra: A dict specifying how to populate the assets.extra directory
+      within the exported SavedModel, or `None` if no extra assets are needed.
+    as_text: whether to write the SavedModel proto in text format.
+    checkpoint_path: The checkpoint path to export.  If `None` (the default),
+      the most recent checkpoint found within the model directory is chosen.
+    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+      removed from the NodeDefs. For a detailed guide, see
+      [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+
+  Returns:
+    A dict of tf.estimator.ModeKeys value to string path for each exported
+    directory.
+
+  Raises:
+    ValueError: if any input_receiver_fn is None, no export_outputs
+      are provided, or no checkpoint can be found.
+  """
+  # pylint: enable=line-too-long
+
+  # pylint: disable=protected-access
+  return estimator._export_all_saved_models(
+      export_dir_base, input_receiver_fn_map,
+      assets_extra=assets_extra,
+      as_text=as_text,
+      checkpoint_path=checkpoint_path,
+      strip_default_attrs=strip_default_attrs)
+  # pylint: enable=protected-access
diff --git a/tensorflow/contrib/estimator/python/estimator/export_test.py b/tensorflow/contrib/estimator/python/estimator/export_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..050821ee672f30a6926c4a0a0e48915515d9afd7
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/export_test.py
@@ -0,0 +1,373 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib wrapping of export_saved_model_for_mode functionality.
+
+These are direct copies of the tests included in core, with import locations
+changed. These should be removed when the functionality in core is part of the
+public API.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+
+from tensorflow.contrib.estimator.python.estimator import export as contrib_export
+from tensorflow.python.client import session
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.export import export_output
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import training
+from tensorflow.python.util import compat
+
+
+def _model_fn_for_export_tests(features, labels, mode):
+  _, _ = features, labels
+  variables.Variable(1., name='weight')
+  scores = constant_op.constant([3.])
+  classes = constant_op.constant(['wumpus'])
+  update_global_step = state_ops.assign_add(training.get_global_step(), 1)
+  with ops.control_dependencies([update_global_step]):
+    train_op = constant_op.constant(2.)
+  return model_fn_lib.EstimatorSpec(
+      mode,
+      predictions=constant_op.constant(10.),
+      loss=constant_op.constant(1.),
+      train_op=train_op,
+      export_outputs={
+          'test': export_output.ClassificationOutput(scores, classes)})
+
+
+def _x_y_input_fn():
+  return ({'x': constant_op.constant([[1], [1]]),
+           'y': constant_op.constant([[2], [2]])},
+          constant_op.constant([[1], [1]]))
+
+
+def _model_fn_with_x_y(features, labels, mode):
+  _ = labels
+  variables.Variable(1., name='weight')
+  scores = constant_op.constant([3.])
+  classes = constant_op.constant(['wumpus'])
+  if mode == model_fn_lib.ModeKeys.PREDICT:
+    variables.Variable(36., name='name_collision')
+    return model_fn_lib.EstimatorSpec(
+        mode,
+        predictions=constant_op.constant(10.),
+        export_outputs={
+            'test': export_output.ClassificationOutput(scores, classes)})
+  else:
+    prefix = 'eval_' if mode == model_fn_lib.ModeKeys.EVAL else ''
+
+    multiplied = math_ops.multiply(
+        features['x'], features['y'], name='{}multiplied'.format(prefix))
+    metrics = {'mean': metrics_lib.mean(features['x'] - features['y'],
+                                        name='{}mean'.format(prefix))}
+    variables.Variable(1., name='later_var')
+    variables.Variable(3., name='name_collision')
+    return model_fn_lib.EstimatorSpec(
+        mode,
+        predictions=multiplied,
+        loss=constant_op.constant(1.),
+        train_op=state_ops.assign_add(training.get_global_step(), 1),
+        eval_metric_ops=metrics)
+
+
+def _get_serving_input_receiver_fn():
+  feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
+                  'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
+  return export.build_parsing_serving_input_receiver_fn(feature_spec)
+
+
+def _get_supervised_input_receiver_fn():
+  feature_spec = {
+      'x': array_ops.placeholder(
+          dtype=dtypes.int64, shape=(2, 1), name='feature_x'),
+      'y': array_ops.placeholder(
+          dtype=dtypes.int64, shape=(2, 1), name='feature_y')
+      }
+  label_spec = array_ops.placeholder(
+      dtype=dtypes.float32, shape=[1], name='truth')
+
+  return export.build_raw_supervised_input_receiver_fn(
+      feature_spec, label_spec)
+
+
+class EstimatorExportTest(test.TestCase):
+
+  def test_export_saved_model_train(self):
+    self._test_export_saved_model_for_mode(
+        _get_supervised_input_receiver_fn(), model_fn_lib.ModeKeys.TRAIN)
+
+  def test_export_saved_model_eval(self):
+    self._test_export_saved_model_for_mode(
+        _get_supervised_input_receiver_fn(), model_fn_lib.ModeKeys.EVAL)
+
+  def test_export_saved_model_predict(self):
+    self._test_export_saved_model_for_mode(
+        _get_serving_input_receiver_fn(), model_fn_lib.ModeKeys.PREDICT)
+
+  def _test_export_saved_model_for_mode(self, input_receiver_fn, mode):
+    tmpdir = tempfile.mkdtemp()
+    est = estimator.Estimator(model_fn=_model_fn_for_export_tests)
+    est.train(input_fn=_x_y_input_fn, steps=1)
+
+    # Perform the export.
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+    export_dir = contrib_export.export_saved_model_for_mode(
+        est, export_dir_base, input_receiver_fn, mode=mode)
+
+    # Check that all the files are in the right places.
+    self.assertTrue(gfile.Exists(export_dir_base))
+    self._validate_exported_files(export_dir)
+
+    # Restore, to validate that the export was well-formed.
+    tag_set = model_fn_lib.EXPORT_TAG_MAP[mode]
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, tag_set, export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertFalse('name_collision_1' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_receiver_map(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.SERVING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('input_example_tensor' in graph_ops)
+        self.assertTrue('ParseExample/ParseExample' in graph_ops)
+        self.assertFalse('feature_x' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_train_only(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.TRAINING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('multiplied' in graph_ops)
+        self.assertTrue('mean/update_op' in graph_ops)
+        self.assertFalse('eval_multiplied' in graph_ops)
+        self.assertTrue('feature_x' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_eval_only(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.EVAL], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('eval_multiplied' in graph_ops)
+        self.assertTrue('eval_mean/value' in graph_ops)
+        self.assertFalse('multiplied' in graph_ops)
+        self.assertTrue('feature_x' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_no_serving(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.TRAINING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('multiplied' in graph_ops)
+        self.assertFalse('eval_multiplied' in graph_ops)
+        self.assertTrue('feature_x' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.EVAL], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('eval_multiplied' in graph_ops)
+        self.assertFalse('multiplied' in graph_ops)
+        # TODO(karmel): is this the desired behavior when names are shared?
+        self.assertTrue('feature_x_1' in graph_ops)
+        self.assertTrue('feature_y_1' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_three_defs(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    # Restore, to validate that the export was well-formed.
+    for tag_set in model_fn_lib.EXPORT_TAG_MAP.values():
+      with ops.Graph().as_default() as graph:
+        with session.Session(graph=graph) as sess:
+          loader.load(sess, tag_set, export_dir)
+          graph_ops = [x.name for x in graph.get_operations()]
+          self.assertTrue('global_step/Assign' in graph_ops)
+          self.assertTrue('global_step/Initializer/zeros' in graph_ops)
+          self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_all_vars(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.TRAINING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('later_var' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.SERVING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertFalse('later_var' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_name_collision(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.TRAINING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('name_collision' in graph_ops)
+        self.assertFalse('name_collision_1' in graph_ops)
+        collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+        self.assertEqual(3, collection_vars[-1].eval())
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.SERVING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('name_collision' in graph_ops)
+        self.assertFalse('name_collision_1' in graph_ops)
+        collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+        # This is a non-obvious detail: when we load the estimator spec
+        # for predict, name_collision gets set to 36. However, we then restore
+        # from checkpoint, which should overwrite that var and make it the 3
+        # from training. In practice, this would not be a good way to write
+        # a model_fn, but leaving this check in for now to ensure consistency
+        # with what would happen given our current order of spec, then
+        # checkpoint.
+        self.assertEqual(3, collection_vars[-1].eval())
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def _test_export_all_saved_models(self, input_receiver_fn_map):
+    tmpdir = tempfile.mkdtemp()
+    est = estimator.Estimator(model_fn=_model_fn_with_x_y)
+    est.train(input_fn=_x_y_input_fn, steps=1)
+
+    # Perform the export.
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+    export_dir = contrib_export.export_all_saved_models(
+        est, export_dir_base, input_receiver_fn_map)
+
+    # Check that all the files are in the right places.
+    self.assertTrue(gfile.Exists(export_dir_base))
+
+    self._validate_exported_files(export_dir)
+
+    return export_dir, tmpdir
+
+  def _validate_exported_files(self, export_dir):
+    self.assertTrue(gfile.Exists(export_dir))
+    self.assertTrue(gfile.Exists(os.path.join(
+        compat.as_bytes(export_dir),
+        compat.as_bytes('saved_model.pb'))))
+    self.assertTrue(gfile.Exists(os.path.join(
+        compat.as_bytes(export_dir),
+        compat.as_bytes('variables'))))
+    self.assertTrue(gfile.Exists(os.path.join(
+        compat.as_bytes(export_dir),
+        compat.as_bytes('variables/variables.index'))))
+    self.assertTrue(gfile.Exists(os.path.join(
+        compat.as_bytes(export_dir),
+        compat.as_bytes('variables/variables.data-00000-of-00001'))))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index 201699ed775f701bc9f215fff11a688175d51645..bf08be09e7baf63e507a6a4db6a91e7b6bb20b74 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -22,12 +22,12 @@ import six
 
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.estimator.export.export_output import PredictOutput
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.util import function_utils
 
 
 _VALID_METRIC_FN_ARGS = set(['features', 'labels', 'predictions', 'config'])
@@ -330,7 +330,7 @@ class _TransformGradients(optimizer_lib.Optimizer):
 
 
 def _verify_metric_fn_args(metric_fn):
-  args = set(estimator_util.fn_args(metric_fn))
+  args = set(function_utils.fn_args(metric_fn))
   invalid_args = list(args - _VALID_METRIC_FN_ARGS)
   if invalid_args:
     raise ValueError('metric_fn (%s) has following not expected args: %s' %
@@ -339,7 +339,7 @@ def _verify_metric_fn_args(metric_fn):
 
 def _call_metric_fn(metric_fn, features, labels, predictions, config):
   """Calls metric fn with proper arguments."""
-  metric_fn_args = estimator_util.fn_args(metric_fn)
+  metric_fn_args = function_utils.fn_args(metric_fn)
   kwargs = {}
   if 'features' in metric_fn_args:
     kwargs['features'] = features
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index 5d19bf4714ff6fcdb77948e14839a37a2ec56b75..8b97f86db19a1bc2d9f17c9935e6678844daf177 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from tensorflow.python.estimator import model_fn
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import metric_keys
@@ -72,6 +74,33 @@ def multi_class_head(n_classes,
   shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
   the input labels before passing them to `loss_fn`.
 
+  The head can be used with a canned estimator. Example:
+
+  ```python
+  my_head = tf.contrib.estimator.multi_class_head(n_classes=3)
+  my_estimator = tf.contrib.estimator.DNNEstimator(
+      head=my_head,
+      hidden_units=...,
+      feature_columns=...)
+  ```
+
+  It can also be used with a custom `model_fn`. Example:
+
+  ```python
+  def _my_model_fn(features, labels, mode):
+    my_head = tf.contrib.estimator.multi_class_head(n_classes=3)
+    logits = tf.keras.Model(...)(features)
+
+    return my_head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
+        logits=logits)
+
+  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
+  ```
+
   Args:
     n_classes: Number of classes, must be greater than 2 (for 2 classes, use
       `binary_classification_head`).
@@ -139,6 +168,33 @@ def binary_classification_head(
   shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
   the input labels before passing them to `loss_fn`.
 
+  The head can be used with a canned estimator. Example:
+
+  ```python
+  my_head = tf.contrib.estimator.binary_classification_head()
+  my_estimator = tf.contrib.estimator.DNNEstimator(
+      head=my_head,
+      hidden_units=...,
+      feature_columns=...)
+  ```
+
+  It can also be used with a custom `model_fn`. Example:
+
+  ```python
+  def _my_model_fn(features, labels, mode):
+    my_head = tf.contrib.estimator.binary_classification_head()
+    logits = tf.keras.Model(...)(features)
+
+    return my_head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
+        logits=logits)
+
+  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
+  ```
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -211,6 +267,33 @@ def regression_head(weight_column=None,
   https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function
   Namely, for poisson regression, set `inverse_link_fn=tf.exp`.
 
+  The head can be used with a canned estimator. Example:
+
+  ```python
+  my_head = tf.contrib.estimator.regression_head()
+  my_estimator = tf.contrib.estimator.DNNEstimator(
+      head=my_head,
+      hidden_units=...,
+      feature_columns=...)
+  ```
+
+  It can also be used with a custom `model_fn`. Example:
+
+  ```python
+  def _my_model_fn(features, labels, mode):
+    my_head = tf.contrib.estimator.regression_head()
+    logits = tf.keras.Model(...)(features)
+
+    return my_head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
+        logits=logits)
+
+  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
+  ```
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -270,6 +353,33 @@ def poisson_regression_head(
   This is implemented as a generalized linear model, see
   https://en.wikipedia.org/wiki/Generalized_linear_model.
 
+  The head can be used with a canned estimator. Example:
+
+  ```python
+  my_head = tf.contrib.estimator.poisson_regression_head()
+  my_estimator = tf.contrib.estimator.DNNEstimator(
+      head=my_head,
+      hidden_units=...,
+      feature_columns=...)
+  ```
+
+  It can also be used with a custom `model_fn`. Example:
+
+  ```python
+  def _my_model_fn(features, labels, mode):
+    my_head = tf.contrib.estimator.poisson_regression_head()
+    logits = tf.keras.Model(...)(features)
+
+    return my_head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
+        logits=logits)
+
+  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
+  ```
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -337,6 +447,33 @@ def logistic_regression_head(
   This is implemented as a generalized linear model, see
   https://en.wikipedia.org/wiki/Generalized_linear_model.
 
+  The head can be used with a canned estimator. Example:
+
+  ```python
+  my_head = tf.contrib.estimator.logistic_regression_head()
+  my_estimator = tf.contrib.estimator.DNNEstimator(
+      head=my_head,
+      hidden_units=...,
+      feature_columns=...)
+  ```
+
+  It can also be used with a custom `model_fn`. Example:
+
+  ```python
+  def _my_model_fn(features, labels, mode):
+    my_head = tf.contrib.estimator.logistic_regression_head()
+    logits = tf.keras.Model(...)(features)
+
+    return my_head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
+        logits=logits)
+
+  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
+  ```
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -375,6 +512,7 @@ def multi_label_head(n_classes,
                      label_vocabulary=None,
                      loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                      loss_fn=None,
+                     classes_for_class_based_metrics=None,
                      name=None):
   """Creates a `_Head` for multi-label classification.
 
@@ -406,6 +544,33 @@ def multi_label_head(n_classes,
   shape `[D0, D1, ... DN, n_classes]`. Namely, the head applies
   `label_vocabulary` to the input labels before passing them to `loss_fn`.
 
+  The head can be used with a canned estimator. Example:
+
+  ```python
+  my_head = tf.contrib.estimator.multi_label_head(n_classes=3)
+  my_estimator = tf.contrib.estimator.DNNEstimator(
+      head=my_head,
+      hidden_units=...,
+      feature_columns=...)
+  ```
+
+  It can also be used with a custom `model_fn`. Example:
+
+  ```python
+  def _my_model_fn(features, labels, mode):
+    my_head = tf.contrib.estimator.multi_label_head(n_classes=3)
+    logits = tf.keras.Model(...)(features)
+
+    return my_head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
+        logits=logits)
+
+  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
+  ```
+
   Args:
     n_classes: Number of classes, must be greater than 1 (for 1 class, use
       `binary_classification_head`).
@@ -427,6 +592,10 @@ def multi_label_head(n_classes,
       reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
       weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
     loss_fn: Optional loss function.
+    classes_for_class_based_metrics: List of integer class IDs or string class
+      names for which per-class metrics are evaluated. If integers, all must be
+      in the range `[0, n_classes - 1]`. If strings, all must be in
+      `label_vocabulary`.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -434,8 +603,8 @@ def multi_label_head(n_classes,
     An instance of `_Head` for multi-label classification.
 
   Raises:
-    ValueError: if `n_classes`, `thresholds`, `loss_reduction` or `loss_fn` is
-    invalid.
+    ValueError: if `n_classes`, `thresholds`, `loss_reduction`, `loss_fn` or
+    `metric_class_ids` is invalid.
   """
   thresholds = tuple(thresholds) if thresholds else tuple()
   if n_classes is None or n_classes < 2:
@@ -460,10 +629,31 @@ def multi_label_head(n_classes,
   if (loss_reduction not in losses.Reduction.all() or
       loss_reduction == losses.Reduction.NONE):
     raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
+  classes_for_class_based_metrics = tuple(
+      [] if classes_for_class_based_metrics is None
+      else classes_for_class_based_metrics)
+  if classes_for_class_based_metrics:
+    if isinstance(classes_for_class_based_metrics[0], six.string_types):
+      if not label_vocabulary:
+        raise ValueError(
+            'label_vocabulary must be provided when '
+            'classes_for_class_based_metrics are sting.')
+      class_ids = []
+      for class_string in classes_for_class_based_metrics:
+        class_ids.append(label_vocabulary.index(class_string))
+      classes_for_class_based_metrics = tuple(class_ids)
+    else:
+      for class_id in classes_for_class_based_metrics:
+        if (class_id < 0) or (class_id >= n_classes):
+          raise ValueError(
+              'All classes_for_class_based_metrics must be in range [0, {}]. '
+              'Given: {}'.format(n_classes - 1, class_id))
   return _MultiLabelHead(
       n_classes=n_classes, weight_column=weight_column, thresholds=thresholds,
       label_vocabulary=label_vocabulary, loss_reduction=loss_reduction,
-      loss_fn=loss_fn, name=name)
+      loss_fn=loss_fn,
+      classes_for_class_based_metrics=classes_for_class_based_metrics,
+      name=name)
 
 
 class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
@@ -476,6 +666,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                label_vocabulary=None,
                loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                loss_fn=None,
+               classes_for_class_based_metrics=None,
                name=None):
     self._n_classes = n_classes
     self._weight_column = weight_column
@@ -483,6 +674,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
     self._label_vocabulary = label_vocabulary
     self._loss_reduction = loss_reduction
     self._loss_fn = loss_fn
+    self._classes_for_class_based_metrics = classes_for_class_based_metrics
     self._name = name
 
   @property
@@ -560,10 +752,10 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         weights=weights,
         processed_labels=processed_labels)
 
-  def create_estimator_spec(
+  def _create_tpu_estimator_spec(
       self, features, mode, logits, labels=None, optimizer=None,
       train_op_fn=None, regularization_losses=None):
-    """Returns an `EstimatorSpec`.
+    """Returns an `model_fn._TPUEstimatorSpec`.
 
     Args:
       features: Input `dict` of `Tensor` or `SparseTensor` objects.
@@ -586,7 +778,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
         avoid scaling errors.
     Returns:
-      `EstimatorSpec`.
+      `model_fn._TPUEstimatorSpec`.
     Raises:
       ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
         mode, or if both are set.
@@ -606,7 +798,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         classifier_output = head_lib._classification_output(  # pylint:disable=protected-access
             scores=probabilities, n_classes=self._n_classes,
             label_vocabulary=self._label_vocabulary)
-        return model_fn.EstimatorSpec(
+        return model_fn._TPUEstimatorSpec(  # pylint:disable=protected-access
             mode=model_fn.ModeKeys.PREDICT,
             predictions=predictions,
             export_outputs={
@@ -629,16 +821,18 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
 
       # Eval.
       if mode == model_fn.ModeKeys.EVAL:
-        return model_fn.EstimatorSpec(
+        return model_fn._TPUEstimatorSpec(  # pylint:disable=protected-access
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
             loss=regularized_training_loss,
-            eval_metric_ops=self._eval_metric_ops(
-                labels=processed_labels,
-                probabilities=probabilities,
-                weights=weights,
-                unreduced_loss=unreduced_loss,
-                regularization_loss=regularization_loss))
+            eval_metrics=head_lib._create_eval_metrics_tuple(  # pylint:disable=protected-access
+                self._eval_metric_ops, {
+                    'labels': processed_labels,
+                    'probabilities': probabilities,
+                    'weights': weights,
+                    'unreduced_loss': unreduced_loss,
+                    'regularization_loss': regularization_loss,
+                }))
 
       # Train.
       if optimizer is not None:
@@ -672,7 +866,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         summary.scalar(
             head_lib._summary_key(self._name, keys.LOSS_REGULARIZATION),  # pylint:disable=protected-access
             regularization_loss)
-    return model_fn.EstimatorSpec(
+    return model_fn._TPUEstimatorSpec(  # pylint:disable=protected-access
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
@@ -735,4 +929,36 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                 weights=weights,
                 threshold=threshold,
                 name=recall_key))
+      for class_id in self._classes_for_class_based_metrics:
+        batch_rank = array_ops.rank(probabilities) - 1
+        begin = array_ops.concat(
+            [array_ops.zeros([batch_rank], dtype=dtypes.int32), [class_id]],
+            axis=0)
+        size = array_ops.concat(
+            [-1 * array_ops.ones([batch_rank], dtype=dtypes.int32), [1]],
+            axis=0)
+        class_probabilities = array_ops.slice(
+            probabilities, begin=begin, size=size)
+        class_labels = array_ops.slice(labels, begin=begin, size=size)
+        prob_key = keys.PROBABILITY_MEAN_AT_CLASS % class_id
+        metric_ops[head_lib._summary_key(self._name, prob_key)] = (  # pylint:disable=protected-access
+            head_lib._predictions_mean(  # pylint:disable=protected-access
+                predictions=class_probabilities,
+                weights=weights,
+                name=prob_key))
+        auc_key = keys.AUC_AT_CLASS % class_id
+        metric_ops[head_lib._summary_key(self._name, auc_key)] = (  # pylint:disable=protected-access
+            head_lib._auc(  # pylint:disable=protected-access
+                labels=class_labels,
+                predictions=class_probabilities,
+                weights=weights,
+                name=auc_key))
+        auc_pr_key = keys.AUC_PR_AT_CLASS % class_id
+        metric_ops[head_lib._summary_key(self._name, auc_pr_key)] = (  # pylint:disable=protected-access
+            head_lib._auc(  # pylint:disable=protected-access
+                labels=class_labels,
+                predictions=class_probabilities,
+                weights=weights,
+                curve='PR',
+                name=auc_pr_key))
     return metric_ops
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index 19b86df5565a85168bdbc37076a0af69248a8010..d6c158608b5c564f24bc90583084306aa7084742 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -175,6 +175,21 @@ class MultiLabelHead(test.TestCase):
         r'loss_fn has unexpected args: \[\'name\'\]'):
       head_lib.multi_label_head(n_classes=3, loss_fn=_loss_fn)
 
+  def test_classes_for_class_based_metrics_invalid(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'All classes_for_class_based_metrics must be in range \[0, 2\]\. '
+        r'Given: -1'):
+      head_lib.multi_label_head(
+          n_classes=3, classes_for_class_based_metrics=[2, -1])
+
+  def test_classes_for_class_based_metrics_string_invalid(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'\'z\' is not in list'):
+      head_lib.multi_label_head(
+          n_classes=3, label_vocabulary=['a', 'b', 'c'],
+          classes_for_class_based_metrics=['c', 'z'])
+
   def test_name(self):
     head = head_lib.multi_label_head(n_classes=4, name='foo')
     self.assertEqual('foo', head.name)
@@ -591,6 +606,81 @@ class MultiLabelHead(test.TestCase):
         expected_loss=expected_loss,
         expected_metrics=expected_metrics)
 
+  def test_eval_with_classes_for_class_based_metrics(self):
+    head = head_lib.multi_label_head(
+        n_classes=2, classes_for_class_based_metrics=[0, 1])
+
+    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
+    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
+    # loss = labels * -log(sigmoid(logits)) +
+    #        (1 - labels) * -log(1 - sigmoid(logits))
+    # Sum over examples, divide by batch_size.
+    expected_loss = 0.5 * np.sum(
+        _sigmoid_cross_entropy(labels=labels, logits=logits))
+
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        # Average loss over examples.
+        keys.LOSS_MEAN: expected_loss,
+        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
+        # this assert tests that the algorithm remains consistent.
+        keys.AUC: 0.3333,
+        keys.AUC_PR: 0.7639,
+        keys.PROBABILITY_MEAN_AT_CLASS % 0: np.sum(_sigmoid(logits[:, 0])) / 2.,
+        keys.AUC_AT_CLASS % 0: 0.,
+        keys.AUC_PR_AT_CLASS % 0: 1.,
+        keys.PROBABILITY_MEAN_AT_CLASS % 1: np.sum(_sigmoid(logits[:, 1])) / 2.,
+        keys.AUC_AT_CLASS % 1: 1.,
+        keys.AUC_PR_AT_CLASS % 1: 1.,
+    }
+
+    self._test_eval(
+        head=head,
+        logits=logits,
+        labels=labels,
+        expected_loss=expected_loss,
+        expected_metrics=expected_metrics)
+
+  def test_eval_with_classes_for_class_based_metrics_string(self):
+    head = head_lib.multi_label_head(
+        n_classes=2, label_vocabulary=['a', 'b'],
+        classes_for_class_based_metrics=['a', 'b'])
+
+    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
+    labels = sparse_tensor.SparseTensor(
+        values=['a', 'a', 'b'],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    labels_onehot = np.array([[1, 0], [1, 1]], dtype=np.int64)
+    # loss = labels * -log(sigmoid(logits)) +
+    #        (1 - labels) * -log(1 - sigmoid(logits))
+    # Sum over examples, divide by batch_size.
+    expected_loss = 0.5 * np.sum(
+        _sigmoid_cross_entropy(labels=labels_onehot, logits=logits))
+
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        # Average loss over examples.
+        keys.LOSS_MEAN: expected_loss,
+        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
+        # this assert tests that the algorithm remains consistent.
+        keys.AUC: 0.3333,
+        keys.AUC_PR: 0.7639,
+        keys.PROBABILITY_MEAN_AT_CLASS % 0: np.sum(_sigmoid(logits[:, 0])) / 2.,
+        keys.AUC_AT_CLASS % 0: 0.,
+        keys.AUC_PR_AT_CLASS % 0: 1.,
+        keys.PROBABILITY_MEAN_AT_CLASS % 1: np.sum(_sigmoid(logits[:, 1])) / 2.,
+        keys.AUC_AT_CLASS % 1: 1.,
+        keys.AUC_PR_AT_CLASS % 1: 1.,
+    }
+
+    self._test_eval(
+        head=head,
+        logits=logits,
+        labels=labels,
+        expected_loss=expected_loss,
+        expected_metrics=expected_metrics)
+
   def test_eval_with_weights(self):
     n_classes = 2
     head = head_lib.multi_label_head(n_classes, weight_column='example_weights')
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks.py b/tensorflow/contrib/estimator/python/estimator/hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..4808b9ee30e10047aaf3d33f74457b2717c87a13
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/hooks.py
@@ -0,0 +1,213 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Some useful session run hooks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training
+
+
+# pylint: disable=protected-access
+class InMemoryEvaluatorHook(training.SessionRunHook):
+  """Hook to run evaluation in training without a checkpoint.
+
+  Example:
+
+  ```python
+  def train_input_fn():
+    ...
+    return train_dataset
+
+  def eval_input_fn():
+    ...
+    return eval_dataset
+
+  estimator = tf.estimator.DNNClassifier(...)
+
+  evaluator = tf.contrib.estimator.InMemoryEvaluatorHook(
+      estimator, eval_input_fn)
+  estimator.train(train_input_fn, hooks=[evaluator])
+  ```
+
+  Current limitations of this approach are:
+  * It doesn't support multi-node distributed mode.
+  * It doesn't support saveable objects other than variables (such as boosted
+    tree support)
+  * It doesn't support custom saver logic (such as ExponentialMovingAverage
+    support)
+
+  """
+
+  def __init__(self,
+               estimator,
+               input_fn,
+               steps=None,
+               hooks=None,
+               name=None,
+               every_n_iter=100):
+    """Initializes a `InMemoryEvaluatorHook`.
+
+    Args:
+      estimator: A `tf.estimator.Estimator` instance to call evaluate.
+      input_fn:  Equivalent to the `input_fn` arg to `estimator.evaluate`. A
+        function that constructs the input data for evaluation.
+        See @{$get_started/premade_estimators#create_input_functions} for more
+        information. The function should construct and return one of
+        the following:
+
+          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
+            tuple (features, labels) with same constraints as below.
+          * A tuple (features, labels): Where `features` is a `Tensor` or a
+            dictionary of string feature name to `Tensor` and `labels` is a
+            `Tensor` or a dictionary of string label name to `Tensor`. Both
+            `features` and `labels` are consumed by `model_fn`. They should
+            satisfy the expectation of `model_fn` from inputs.
+
+      steps: Equivalent to the `steps` arg to `estimator.evaluate`.  Number of
+        steps for which to evaluate model. If `None`, evaluates until `input_fn`
+        raises an end-of-input exception.
+      hooks: Equivalent to the `hooks` arg to `estimator.evaluate`. List of
+        `SessionRunHook` subclass instances. Used for callbacks inside the
+        evaluation call.
+      name:  Equivalent to the `name` arg to `estimator.evaluate`. Name of the
+        evaluation if user needs to run multiple evaluations on different data
+        sets, such as on training data vs test data. Metrics for different
+        evaluations are saved in separate folders, and appear separately in
+        tensorboard.
+      every_n_iter: `int`, runs the evaluator once every N training iteration.
+
+    Raises:
+      ValueError: if `every_n_iter` is non-positive or it's not a single machine
+        training
+    """
+    if every_n_iter is None or every_n_iter <= 0:
+      raise ValueError('invalid every_n_iter=%s.' % every_n_iter)
+    if (estimator.config.num_ps_replicas > 0 or
+        estimator.config.num_worker_replicas > 1):
+      raise ValueError(
+          'InMemoryEvaluator supports only single machine (aka Local) setting.')
+    self._estimator = estimator
+    self._input_fn = input_fn
+    self._steps = steps
+    self._name = name
+    self._every_n_iter = every_n_iter
+    self._eval_dir = os.path.join(self._estimator.model_dir, 'eval'
+                                  if not name else 'eval_' + name)
+
+    self._graph = None
+    self._hooks = estimator_lib._check_hooks_type(hooks)
+    self._hooks.extend(self._estimator._convert_eval_steps_to_hooks(steps))
+    self._timer = training.SecondOrStepTimer(every_steps=every_n_iter)
+
+  def begin(self):
+    """Build eval graph and restoring op."""
+    self._timer.reset()
+    self._iter_count = 0
+    self._graph = ops.Graph()
+    with self._graph.as_default():
+      (self._scaffold, self._update_op, self._eval_dict,
+       self._all_hooks) = self._estimator._evaluate_build_graph(
+           self._input_fn, self._hooks, checkpoint_path=None)
+
+      if self._scaffold.saver is not None:
+        raise ValueError('InMemoryEvaluator does not support custom saver')
+      if self._scaffold.init_fn is not None:
+        raise ValueError('InMemoryEvaluator does not support custom init_fn')
+
+      self._var_name_to_eval_var = {
+          v.name: v for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+      }
+      self._var_name_to_placeholder = {
+          v.name: array_ops.placeholder(v.dtype)
+          for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+      }
+
+  def after_create_session(self, session, coord):  # pylint: disable=unused-argument
+    """Does first run which shows the eval metrics before training."""
+    if ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS):
+      raise ValueError(
+          'InMemoryEvaluator does not support saveables other than global '
+          'variables.')
+    self._var_name_to_train_var = {
+        v.name: v for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    }
+    var_names_to_transfer = set(self._var_name_to_placeholder.keys()) & set(
+        self._var_name_to_train_var.keys())
+    # Filter training var names that are not exist in evaluation
+    self._var_name_to_train_var = {
+        v_name: self._var_name_to_train_var[v_name]
+        for v_name in var_names_to_transfer
+    }
+    # Filter eval var names that are not exist in training
+    self._var_name_to_eval_var = {
+        v_name: self._var_name_to_eval_var[v_name]
+        for v_name in var_names_to_transfer
+    }
+
+    with self._graph.as_default():
+      self._var_feed_op = control_flow_ops.group([
+          state_ops.assign(self._var_name_to_eval_var[v_name],
+                           self._var_name_to_placeholder[v_name])
+          for v_name in var_names_to_transfer
+      ])
+
+    self._evaluate(session)
+
+  def _evaluate(self, train_session):
+    var_name_to_value = train_session.run(self._var_name_to_train_var)
+    placeholder_to_value = {
+        self._var_name_to_placeholder[v_name]: var_name_to_value[v_name]
+        for v_name in var_name_to_value
+    }
+
+    def feed_variables(scaffold, session):
+      del scaffold
+      session.run(self._var_feed_op, feed_dict=placeholder_to_value)
+
+    scaffold = training.Scaffold(
+        init_fn=feed_variables, copy_from_scaffold=self._scaffold)
+
+    with self._graph.as_default():
+      return self._estimator._evaluate_run(
+          checkpoint_path=None,
+          scaffold=scaffold,
+          update_op=self._update_op,
+          eval_dict=self._eval_dict,
+          all_hooks=self._all_hooks,
+          output_dir=self._eval_dir)
+
+    self._timer.update_last_triggered_step(self._iter_count)
+
+  def after_run(self, run_context, run_values):  # pylint: disable=unused-argument
+    """Runs evaluator."""
+    self._iter_count += 1
+    if self._timer.should_trigger_for_step(self._iter_count):
+      self._evaluate(run_context.session)
+
+  def end(self, session):  # pylint: disable=unused-argument
+    """Runs evaluator for final model."""
+    self._evaluate(session)
+
+
+# pylint: enable=protected-access
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks_test.py b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..95ae971852ee6dffb6174fc243686721c30ef685
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
@@ -0,0 +1,318 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for hooks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import json
+import os
+
+from tensorflow.contrib.estimator.python.estimator import hooks as hooks_lib
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.summary import summary_iterator
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import training
+
+
+def summary_step_keyword_to_value_mapping(dir_):
+  writer_cache.FileWriterCache.clear()
+
+  # Get last Event written.
+  event_paths = glob.glob(os.path.join(dir_, 'events*'))
+  step_keyword_to_value = {}
+  for last_event in summary_iterator.summary_iterator(event_paths[-1]):
+    if last_event.step not in step_keyword_to_value:
+      step_keyword_to_value[last_event.step] = {}
+    if last_event.summary is not None:
+      for value in last_event.summary.value:
+        step_keyword_to_value[last_event.step][value.tag] = value.simple_value
+
+  return step_keyword_to_value
+
+
+def get_summary_value(dir_, step, keyword):
+  """Get summary value for given step and keyword."""
+
+  writer_cache.FileWriterCache.clear()
+  # Get last Event written.
+  event_paths = glob.glob(os.path.join(dir_, 'events*'))
+  print('XXX', event_paths)
+  for last_event in summary_iterator.summary_iterator(event_paths[-1]):
+    if last_event.step == step and last_event.summary is not None:
+      for value in last_event.summary.value:
+        if keyword in value.tag:
+          return value.simple_value
+  return None
+
+
+class InMemoryEvaluatorHookTest(test.TestCase):
+
+  def test_runs_eval_metrics(self):
+
+    def model_fn(features, labels, mode):
+      _ = labels
+      if estimator_lib.ModeKeys.TRAIN == mode:
+        with ops.control_dependencies([features]):
+          train_op = state_ops.assign_add(training.get_global_step(), 1)
+        return estimator_lib.EstimatorSpec(
+            mode, loss=constant_op.constant(3.), train_op=train_op)
+      if estimator_lib.ModeKeys.EVAL == mode:
+        return estimator_lib.EstimatorSpec(
+            mode,
+            loss=constant_op.constant(5.),
+            eval_metric_ops={'mean_of_features': metrics_lib.mean(features)})
+
+    estimator = estimator_lib.Estimator(model_fn=model_fn)
+
+    def input_fn():
+      return dataset_ops.Dataset.range(10)
+
+    evaluator = hooks_lib.InMemoryEvaluatorHook(
+        estimator, input_fn, every_n_iter=4)
+    estimator.train(input_fn, hooks=[evaluator])
+
+    self.assertTrue(os.path.isdir(estimator.eval_dir()))
+    step_keyword_to_value = summary_step_keyword_to_value_mapping(
+        estimator.eval_dir())
+    # 4.5 = sum(range(10))/10
+    # before training
+    self.assertEqual(4.5, step_keyword_to_value[0]['mean_of_features'])
+    # intervals (every_n_iter=4)
+    self.assertEqual(4.5, step_keyword_to_value[4]['mean_of_features'])
+    self.assertEqual(4.5, step_keyword_to_value[8]['mean_of_features'])
+    # end
+    self.assertEqual(4.5, step_keyword_to_value[10]['mean_of_features'])
+
+  def test_uses_latest_variable_value(self):
+
+    def model_fn(features, labels, mode):
+      _ = labels
+      step = training.get_global_step()
+      w = variable_scope.get_variable(
+          'w',
+          shape=[],
+          initializer=init_ops.zeros_initializer(),
+          dtype=dtypes.int64)
+      if estimator_lib.ModeKeys.TRAIN == mode:
+        # to consume features, we have control dependency
+        with ops.control_dependencies([features]):
+          step_inc = state_ops.assign_add(training.get_global_step(), 1)
+        with ops.control_dependencies([step_inc]):
+          assign_w_to_step_plus_2 = w.assign(step + 2)
+        return estimator_lib.EstimatorSpec(
+            mode,
+            loss=constant_op.constant(3.),
+            train_op=assign_w_to_step_plus_2)
+      if estimator_lib.ModeKeys.EVAL == mode:
+        # to consume features, we have control dependency
+        with ops.control_dependencies([features]):
+          loss = constant_op.constant(5.)
+        return estimator_lib.EstimatorSpec(
+            mode,
+            loss=loss,
+            # w is constant in each step, so the mean.
+            # w = 0 if step==0 else step+2
+            eval_metric_ops={'mean_of_const': metrics_lib.mean(w)})
+
+    estimator = estimator_lib.Estimator(model_fn=model_fn)
+
+    def input_fn():
+      return dataset_ops.Dataset.range(10)
+
+    evaluator = hooks_lib.InMemoryEvaluatorHook(
+        estimator, input_fn, every_n_iter=4)
+    estimator.train(input_fn, hooks=[evaluator])
+
+    self.assertTrue(os.path.isdir(estimator.eval_dir()))
+    step_keyword_to_value = summary_step_keyword_to_value_mapping(
+        estimator.eval_dir())
+    # w = 0 if step==0 else step+2
+    self.assertEqual(0, step_keyword_to_value[0]['mean_of_const'])
+    self.assertEqual(6, step_keyword_to_value[4]['mean_of_const'])
+    self.assertEqual(12, step_keyword_to_value[10]['mean_of_const'])
+
+  def test_dnn_classifier(self):
+    embedding = feature_column_lib.embedding_column(
+        feature_column_lib.categorical_column_with_vocabulary_list(
+            'wire_cast', ['kima', 'omar', 'stringer']), 8)
+    dnn = estimator_lib.DNNClassifier(
+        feature_columns=[embedding], hidden_units=[3, 1])
+
+    def train_input_fn():
+      return dataset_ops.Dataset.from_tensors(({
+          'wire_cast': [['omar'], ['kima']]
+      }, [[0], [1]])).repeat(3)
+
+    def eval_input_fn():
+      return dataset_ops.Dataset.from_tensors(({
+          'wire_cast': [['stringer'], ['kima']]
+      }, [[0], [1]])).repeat(2)
+
+    evaluator = hooks_lib.InMemoryEvaluatorHook(
+        dnn, eval_input_fn, name='in-memory')
+    dnn.train(train_input_fn, hooks=[evaluator])
+    self.assertTrue(os.path.isdir(dnn.eval_dir('in-memory')))
+    step_keyword_to_value = summary_step_keyword_to_value_mapping(
+        dnn.eval_dir('in-memory'))
+
+    final_metrics = dnn.evaluate(eval_input_fn)
+    step = final_metrics[ops.GraphKeys.GLOBAL_STEP]
+    for summary_tag in final_metrics:
+      if summary_tag == ops.GraphKeys.GLOBAL_STEP:
+        continue
+      self.assertEqual(final_metrics[summary_tag],
+                       step_keyword_to_value[step][summary_tag])
+
+  def test_raise_error_with_multi_worker(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
+        },
+        'task': {
+            'type': run_config_lib.TaskType.CHIEF,
+            'index': 0
+        }
+    }
+    with test.mock.patch.dict('os.environ',
+                              {'TF_CONFIG': json.dumps(tf_config)}):
+      dnn = estimator_lib.DNNClassifier(
+          feature_columns=[feature_column_lib.numeric_column('x')],
+          hidden_units=[3, 1])
+
+    def eval_input_fn():
+      pass
+
+    with self.assertRaisesRegexp(ValueError, 'supports only single machine'):
+      hooks_lib.InMemoryEvaluatorHook(dnn, eval_input_fn)
+
+  def test_raise_error_with_ps(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+            run_config_lib.TaskType.PS: ['host1:1'],
+        },
+        'task': {
+            'type': run_config_lib.TaskType.CHIEF,
+            'index': 0
+        }
+    }
+    with test.mock.patch.dict('os.environ',
+                              {'TF_CONFIG': json.dumps(tf_config)}):
+      dnn = estimator_lib.DNNClassifier(
+          feature_columns=[feature_column_lib.numeric_column('x')],
+          hidden_units=[3, 1])
+
+    def eval_input_fn():
+      pass
+
+    with self.assertRaisesRegexp(ValueError, 'supports only single machine'):
+      hooks_lib.InMemoryEvaluatorHook(dnn, eval_input_fn)
+
+  def test_raise_error_with_custom_saver_in_eval(self):
+
+    def model_fn(features, labels, mode):
+      _, _ = features, labels
+      return estimator_lib.EstimatorSpec(
+          mode,
+          loss=constant_op.constant(3.),
+          scaffold=training.Scaffold(saver=training.Saver()),
+          train_op=constant_op.constant(5.),
+          eval_metric_ops={
+              'mean_of_features': metrics_lib.mean(constant_op.constant(2.))
+          })
+
+    estimator = estimator_lib.Estimator(model_fn=model_fn)
+
+    def input_fn():
+      return dataset_ops.Dataset.range(10)
+
+    evaluator = hooks_lib.InMemoryEvaluatorHook(estimator, input_fn)
+    with self.assertRaisesRegexp(ValueError, 'does not support custom saver'):
+      evaluator.begin()
+
+  def test_raise_error_with_custom_init_fn_in_eval(self):
+
+    def model_fn(features, labels, mode):
+      _, _ = features, labels
+
+      def init_fn(scaffold, session):
+        _, _ = scaffold, session
+
+      return estimator_lib.EstimatorSpec(
+          mode,
+          loss=constant_op.constant(3.),
+          scaffold=training.Scaffold(init_fn=init_fn),
+          train_op=constant_op.constant(5.),
+          eval_metric_ops={
+              'mean_of_features': metrics_lib.mean(constant_op.constant(2.))
+          })
+
+    estimator = estimator_lib.Estimator(model_fn=model_fn)
+
+    def input_fn():
+      return dataset_ops.Dataset.range(10)
+
+    evaluator = hooks_lib.InMemoryEvaluatorHook(estimator, input_fn)
+    with self.assertRaisesRegexp(ValueError, 'does not support custom init_fn'):
+      evaluator.begin()
+
+  def test_raise_error_with_saveables_other_than_global_variables(self):
+
+    def model_fn(features, labels, mode):
+      _, _ = features, labels
+      w = variables.Variable(
+          initial_value=[0.],
+          trainable=False,
+          collections=[ops.GraphKeys.SAVEABLE_OBJECTS])
+      init_op = control_flow_ops.group(
+          [w.initializer, training.get_global_step().initializer])
+      return estimator_lib.EstimatorSpec(
+          mode,
+          loss=constant_op.constant(3.),
+          scaffold=training.Scaffold(init_op=init_op),
+          train_op=constant_op.constant(5.),
+          eval_metric_ops={
+              'mean_of_features': metrics_lib.mean(constant_op.constant(2.))
+          })
+
+    estimator = estimator_lib.Estimator(model_fn=model_fn)
+
+    def input_fn():
+      return dataset_ops.Dataset.range(10)
+
+    evaluator = hooks_lib.InMemoryEvaluatorHook(estimator, input_fn)
+    with self.assertRaisesRegexp(ValueError, 'does not support saveables'):
+      estimator.train(input_fn, hooks=[evaluator])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/logit_fns.py b/tensorflow/contrib/estimator/python/estimator/logit_fns.py
index 09c2862ccd3f90de4153a2095afc9c3d3f9476c1..c8b0dd62970e341a3c6b176278fe1c2adfcd8d20 100644
--- a/tensorflow/contrib/estimator/python/estimator/logit_fns.py
+++ b/tensorflow/contrib/estimator/python/estimator/logit_fns.py
@@ -41,10 +41,10 @@ from __future__ import print_function
 
 import six
 
-from tensorflow.python.estimator import util
 from tensorflow.python.estimator.canned import dnn as dnn_core
 from tensorflow.python.estimator.canned import linear as linear_core
 from tensorflow.python.framework import ops
+from tensorflow.python.util import function_utils
 
 # pylint: disable=protected-access
 dnn_logit_fn_builder = dnn_core._dnn_logit_fn_builder
@@ -72,7 +72,7 @@ def call_logit_fn(logit_fn, features, mode, params, config):
     ValueError: if logit_fn does not return a Tensor or a dictionary mapping
       strings to Tensors.
   """
-  logit_fn_args = util.fn_args(logit_fn)
+  logit_fn_args = function_utils.fn_args(logit_fn)
   kwargs = {}
   if 'mode' in logit_fn_args:
     kwargs['mode'] = mode
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index f8564446e5da3e785b85010998d18dca0424d16b..cda23aa437f954700b74dcb9294550eb9a8a8c5c 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -32,7 +32,6 @@ import six
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.client import device_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator import util
 from tensorflow.python.estimator.export import export_output as export_output_lib
 from tensorflow.python.framework import device as framework_device
 from tensorflow.python.framework import ops as ops_lib
@@ -48,6 +47,7 @@ from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import device_setter as device_setter_lib
 from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import function_utils
 
 
 @deprecation.deprecated(
@@ -521,7 +521,7 @@ def _get_loss_towers(model_fn,
   """Replicate the loss computation across devices."""
   tower_specs = []
 
-  model_fn_args = util.fn_args(model_fn)
+  model_fn_args = function_utils.fn_args(model_fn)
   optional_params = {}
   if 'params' in model_fn_args:
     optional_params['params'] = copy.deepcopy(params)
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn.py b/tensorflow/contrib/estimator/python/estimator/rnn.py
index 7f385fd76e88aba46f45d16198d707bf1d1e0d8a..7c49cd00d16777872ad1211dfa1d1a3ac9ac1cee 100644
--- a/tensorflow/contrib/estimator/python/estimator/rnn.py
+++ b/tensorflow/contrib/estimator/python/estimator/rnn.py
@@ -229,6 +229,7 @@ def _rnn_logit_fn_builder(output_units, rnn_cell_fn, sequence_feature_columns,
     rnn_outputs, _ = rnn.dynamic_rnn(
         cell=cell,
         inputs=sequence_input,
+        sequence_length=sequence_length,
         dtype=dtypes.float32,
         time_major=False)
     last_activations = _select_last_activations(rnn_outputs, sequence_length)
diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
index 8fc4f60492b0bfb22ea78cb7b5906e452bb6da58..af1b404cb51bf5d8f8350481f2301d9653895e85 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
@@ -78,7 +78,6 @@ class AssertScalarIntTest(test.TestCase):
               [3, 4], dtype=dtypes.int32))
 
 
-@test_util.with_c_api
 class WithShapeTest(test.TestCase):
 
   def _assert_with_shape(self, tensor, expected_value, expected_shape,
@@ -216,25 +215,18 @@ class WithShapeTest(test.TestCase):
       tensor_partial_shape.set_shape([None, 2])
 
       for incompatible_shape in [[0], [1]]:
-        if ops._USE_C_API:
-          error_message = "Shapes must be equal rank, but are 2 and 1"
-        else:
-          error_message = r"Shapes \(\?, 2\) and \([01],\) are not compatible"
         self.assertRaisesRegexp(
-            ValueError, error_message,
+            ValueError, "Shapes must be equal rank, but are 2 and 1",
             tensor_util.with_shape, incompatible_shape, tensor_partial_shape)
       for incompatible_shape in [[1, 2, 1]]:
         self.assertRaisesRegexp(ValueError, "Dimensions must be equal",
                                 tensor_util.with_shape, incompatible_shape,
                                 tensor_partial_shape)
       for incompatible_shape in [[2, 1]]:
-        if ops._USE_C_API:
-          error_message = (r"Dimension 1 in both shapes must be equal, but are "
-                           r"2 and 1. Shapes are \[\?,2\] and \[2,1\].")
-        else:
-          error_message = r"Shapes \(\?, 2\) and \(2, 1\) are not compatible"
         self.assertRaisesRegexp(
-            ValueError, error_message,
+            ValueError,
+            r"Dimension 1 in both shapes must be equal, but are 2 and 1. "
+            r"Shapes are \[\?,2\] and \[2,1\].",
             tensor_util.with_shape, incompatible_shape, tensor_partial_shape)
 
       compatible_shape = [2, 2]
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index e3fc6bf0f034051fc33ff5966e2f4ea85aa538db..4092b320042162e4eb4c5f4879c2c3ea5dc14fc9 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -112,6 +112,7 @@ class GANEstimator(estimator.Estimator):
                generator_optimizer=None,
                discriminator_optimizer=None,
                get_hooks_fn=None,
+               get_eval_metric_ops_fn=None,
                add_summaries=None,
                use_loss_summaries=True,
                config=None):
@@ -146,6 +147,9 @@ class GANEstimator(estimator.Estimator):
         list of hooks. These hooks are run on the generator and discriminator
         train ops, and can be used to implement the GAN training scheme.
         Defaults to `train.get_sequential_train_hooks()`.
+      get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a
+        dict of metric results keyed by name. The output of this function is
+        passed into `tf.estimator.EstimatorSpec` during evaluation.
       add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`.
       use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
         If `None`, uses defaults.
@@ -160,7 +164,8 @@ class GANEstimator(estimator.Estimator):
               else discriminator_optimizer)
       gan_head = head_lib.gan_head(
           generator_loss_fn, discriminator_loss_fn, gopt, dopt,
-          use_loss_summaries, get_hooks_fn=get_hooks_fn)
+          use_loss_summaries, get_hooks_fn=get_hooks_fn,
+          get_eval_metric_ops_fn=get_eval_metric_ops_fn)
       return _gan_model_fn(
           features, labels, mode, generator_fn, discriminator_fn, gan_head,
           add_summaries)
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index 387a62bd741bd42c03dc1bf70592060c29ccd7a8..955482599b372be3f0d0cbc81451c514958d0eb1 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -194,6 +195,12 @@ class GANEstimatorIntegrationTest(test.TestCase):
       lr = learning_rate_decay.exponential_decay(1.0, gstep, 10, 0.9)
       return training.GradientDescentOptimizer(lr)
 
+    def get_metrics(gan_model):
+      return {
+          'mse_custom_metric': metrics_lib.mean_squared_error(
+              gan_model.real_data, gan_model.generated_data)
+      }
+
     gopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0)
     dopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0)
     est = estimator.GANEstimator(
@@ -203,6 +210,7 @@ class GANEstimatorIntegrationTest(test.TestCase):
         discriminator_loss_fn=losses.wasserstein_discriminator_loss,
         generator_optimizer=gopt,
         discriminator_optimizer=dopt,
+        get_eval_metric_ops_fn=get_metrics,
         model_dir=self._model_dir)
 
     # TRAIN
@@ -213,6 +221,9 @@ class GANEstimatorIntegrationTest(test.TestCase):
     scores = est.evaluate(eval_input_fn)
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
     self.assertIn('loss', six.iterkeys(scores))
+    self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
+                     scores['loss'])
+    self.assertIn('mse_custom_metric', six.iterkeys(scores))
 
     # PREDICT
     predictions = np.array([x for x in est.predict(predict_input_fn)])
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
index a21358c50bbdb4a1a929b0c5bc322cec4c9923b5..ff903a78cc36c1965b7655aa902501b1943637a8 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
@@ -25,17 +25,21 @@ from tensorflow.contrib.gan.python import train as tfgan_train
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator.canned import head
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import metrics as metrics_lib
 
 __all__ = [
     'GANHead',
     'gan_head',
 ]
 
+def _summary_key(head_name, val):
+  return '%s/%s' % (val, head_name) if head_name else val
+
 
 def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer,
              discriminator_optimizer, use_loss_summaries=True,
              get_hooks_fn=tfgan_train.get_sequential_train_hooks(),
-             name=None):
+             get_eval_metric_ops_fn=None, name=None):
   """Creates a `GANHead`.
 
   Args:
@@ -47,9 +51,12 @@ def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer,
     discriminator_optimizer: Same as `generator_optimizer`, but for the
       discriminator updates.
     use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
-        If `None`, uses defaults.
-    get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list
-        of hooks.
+      If `None`, uses defaults.
+    get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a
+      list of hooks.
+    get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a
+      dict of metric results keyed by name. The output of this function is
+      passed into `tf.estimator.EstimatorSpec` during evaluation.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`.
 
@@ -62,6 +69,7 @@ def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer,
                  discriminator_optimizer=discriminator_optimizer,
                  use_loss_summaries=use_loss_summaries,
                  get_hooks_fn=get_hooks_fn,
+                 get_eval_metric_ops_fn=get_eval_metric_ops_fn,
                  name=name)
 
 
@@ -72,6 +80,7 @@ class GANHead(head._Head):  # pylint: disable=protected-access
                generator_optimizer, discriminator_optimizer,
                use_loss_summaries=True,
                get_hooks_fn=None,
+               get_eval_metric_ops_fn=None,
                name=None):
     """`Head` for GAN training.
 
@@ -85,8 +94,11 @@ class GANHead(head._Head):  # pylint: disable=protected-access
         discriminator updates.
       use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
         If `None`, uses defaults.
-      get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list
-        of hooks. Defaults to `train.get_sequential_train_hooks()`
+      get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a
+        list of hooks. Defaults to `train.get_sequential_train_hooks()`
+      get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a
+        dict of metric results keyed by name. The output of this function is
+        passed into `tf.estimator.EstimatorSpec` during evaluation.
       name: name of the head. If provided, summary and metrics keys will be
         suffixed by `"/" + name`.
     """
@@ -104,6 +116,8 @@ class GANHead(head._Head):  # pylint: disable=protected-access
     self._generator_optimizer = generator_optimizer
     self._discriminator_optimizer = discriminator_optimizer
     self._get_hooks_fn = get_hooks_fn
+    self._get_eval_metric_ops_fn = get_eval_metric_ops_fn
+    self._name = name
 
   @property
   def name(self):
@@ -173,13 +187,26 @@ class GANHead(head._Head):  # pylint: disable=protected-access
         gan_loss = self.create_loss(
             features=None, mode=mode, logits=gan_model, labels=None)
         scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss
+        with ops.name_scope(None, 'metrics',
+                            [gan_loss.generator_loss,
+                             gan_loss.discriminator_loss]):
+          eval_metric_ops = {
+              _summary_key(self._name, 'generator_loss'):
+                  metrics_lib.mean(gan_loss.generator_loss),
+              _summary_key(self._name, 'discriminator_loss'):
+                  metrics_lib.mean(gan_loss.discriminator_loss)
+          }
+          if self._get_eval_metric_ops_fn is not None:
+            custom_eval_metric_ops = self._get_eval_metric_ops_fn(gan_model)
+            if not isinstance(custom_eval_metric_ops, dict):
+              raise TypeError('get_eval_metric_ops_fn must return a dict, '
+                              'received: {}'.format(custom_eval_metric_ops))
+            eval_metric_ops.update(custom_eval_metric_ops)
         return model_fn_lib.EstimatorSpec(
             mode=model_fn_lib.ModeKeys.EVAL,
             predictions=gan_model.generated_data,
             loss=scalar_loss,
-            # TODO(joelshor): Add metrics. If head name provided, append it to
-            # metric keys.
-            eval_metric_ops={})
+            eval_metric_ops=eval_metric_ops)
       elif mode == model_fn_lib.ModeKeys.TRAIN:
         if train_op_fn is None:
           raise ValueError('train_op_fn can not be None.')
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_test.py b/tensorflow/contrib/gan/python/estimator/python/head_test.py
index 8168f005cd1105886390a2384a936663c83fa5f5..6587f1fc600b94d27f7c12b44ca2136d0be5a8c5 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_test.py
@@ -62,9 +62,14 @@ class GANHeadTest(test.TestCase):
         generator_loss_fn=dummy_loss,
         discriminator_loss_fn=dummy_loss,
         generator_optimizer=training.GradientDescentOptimizer(1.0),
-        discriminator_optimizer=training.GradientDescentOptimizer(1.0))
+        discriminator_optimizer=training.GradientDescentOptimizer(1.0),
+        get_eval_metric_ops_fn=self.get_metrics)
     self.assertTrue(isinstance(self.gan_head, head.GANHead))
 
+  def get_metrics(self, gan_model):
+    self.assertTrue(isinstance(gan_model, tfgan_tuples.GANModel))
+    return {}
+
   def _test_modes_helper(self, mode):
     self.gan_head.create_estimator_spec(
         features=None,
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index 2889e937436d2faa66b5693c19046e122cbaf652..9f5fee45422e0b9bcbc73674e55ae395ea8533d5 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -570,7 +570,7 @@ class MutualInformationPenaltyTest(test.TestCase, _PenaltyTest):
         'predicted_distributions': self._predicted_distributions,
     }
     self._expected_loss = 1.61610
-    self._expected_op_name = 'mutual_information_loss/mul'
+    self._expected_op_name = 'mutual_information_loss/mul_1'
     self._batch_size = 2
 
 
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c
index 6a5d982dc8514d69277b8f042ac1256e28715d9e..2e5c84704f8464ab46d740ea3c1eef0548826e8d 100644
--- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c
@@ -19,7 +19,7 @@ limitations under the License.
 
 #include "hexagon_controller.h"
 
-#include <malloc.h>
+#include <stdlib.h>
 #include <stdio.h>
 
 #include "adspmsgd.h"
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
index 60281951dda94008cad3a164be67d6fe8b59a916..66939fbb0f0d3bb5d2181e38428c038f661d3772 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
@@ -115,7 +115,7 @@ static void CheckOpsSupport(const GraphDef& graph_def,
       HexagonOpsDefinitions::getInstance();
   LOG(INFO) << "Checking " << graph_def.node_size() << " nodes";
   LOG(INFO) << "dump_all_nodes = " << dump_all_nodes
-            << ", dump_shape_and_tpye = " << dump_shape_and_type;
+            << ", dump_shape_and_type = " << dump_shape_and_type;
 
   std::unordered_set<string> unsupported_ops;
   bool all_supported = true;
diff --git a/tensorflow/contrib/keras/api/keras/activations/__init__.py b/tensorflow/contrib/keras/api/keras/activations/__init__.py
index d04838c218d6643a703723a1d163c88547c14da7..3f0184276f6b903be63f7b35459e4ad57044eb2c 100644
--- a/tensorflow/contrib/keras/api/keras/activations/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/activations/__init__.py
@@ -19,22 +19,22 @@ from __future__ import division
 from __future__ import print_function
 
 # Activation functions.
-from tensorflow.python.keras._impl.keras.activations import elu
-from tensorflow.python.keras._impl.keras.activations import hard_sigmoid
-from tensorflow.python.keras._impl.keras.activations import linear
-from tensorflow.python.keras._impl.keras.activations import relu
-from tensorflow.python.keras._impl.keras.activations import selu
-from tensorflow.python.keras._impl.keras.activations import sigmoid
-from tensorflow.python.keras._impl.keras.activations import softmax
-from tensorflow.python.keras._impl.keras.activations import softplus
-from tensorflow.python.keras._impl.keras.activations import softsign
-from tensorflow.python.keras._impl.keras.activations import tanh
+from tensorflow.python.keras.activations import elu
+from tensorflow.python.keras.activations import hard_sigmoid
+from tensorflow.python.keras.activations import linear
+from tensorflow.python.keras.activations import relu
+from tensorflow.python.keras.activations import selu
+from tensorflow.python.keras.activations import sigmoid
+from tensorflow.python.keras.activations import softmax
+from tensorflow.python.keras.activations import softplus
+from tensorflow.python.keras.activations import softsign
+from tensorflow.python.keras.activations import tanh
 
 # Auxiliary utils.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.activations import deserialize
-from tensorflow.python.keras._impl.keras.activations import serialize
-from tensorflow.python.keras._impl.keras.activations import get
+from tensorflow.python.keras.activations import deserialize
+from tensorflow.python.keras.activations import serialize
+from tensorflow.python.keras.activations import get
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/applications/inception_v3/__init__.py b/tensorflow/contrib/keras/api/keras/applications/inception_v3/__init__.py
index abf8393ae45d71dc0cb746706abb72f77b82d199..6dfb5cab17c088bfab8ed806adeabd793ced4d12 100644
--- a/tensorflow/contrib/keras/api/keras/applications/inception_v3/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/applications/inception_v3/__init__.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.applications.inception_v3 import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.inception_v3 import InceptionV3
-from tensorflow.python.keras._impl.keras.applications.inception_v3 import preprocess_input
+from tensorflow.python.keras.applications.inception_v3 import decode_predictions
+from tensorflow.python.keras.applications.inception_v3 import InceptionV3
+from tensorflow.python.keras.applications.inception_v3 import preprocess_input
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/applications/mobilenet/__init__.py b/tensorflow/contrib/keras/api/keras/applications/mobilenet/__init__.py
index b809e91193b459a46906443796344c092e1d2a6b..67306cc51e1927cfbc2db424b1f4165dabfa22f9 100644
--- a/tensorflow/contrib/keras/api/keras/applications/mobilenet/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/applications/mobilenet/__init__.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.applications.mobilenet import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.mobilenet import MobileNet
-from tensorflow.python.keras._impl.keras.applications.mobilenet import preprocess_input
+from tensorflow.python.keras.applications.mobilenet import decode_predictions
+from tensorflow.python.keras.applications.mobilenet import MobileNet
+from tensorflow.python.keras.applications.mobilenet import preprocess_input
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/applications/resnet50/__init__.py b/tensorflow/contrib/keras/api/keras/applications/resnet50/__init__.py
index 530805d150bfe32c5b81d7d7d3f92e203b83b602..a25ff48b593a9a9ea56fd427a932bb64c10f7b7b 100644
--- a/tensorflow/contrib/keras/api/keras/applications/resnet50/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/applications/resnet50/__init__.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.applications.resnet50 import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.resnet50 import preprocess_input
-from tensorflow.python.keras._impl.keras.applications.resnet50 import ResNet50
+from tensorflow.python.keras.applications.resnet50 import decode_predictions
+from tensorflow.python.keras.applications.resnet50 import preprocess_input
+from tensorflow.python.keras.applications.resnet50 import ResNet50
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/applications/vgg16/__init__.py b/tensorflow/contrib/keras/api/keras/applications/vgg16/__init__.py
index 118361604bbc7e0a88ed34243c0d5ea98856a301..4964b1b7deb56fe0025e9a8d8cb45d18e0209fea 100644
--- a/tensorflow/contrib/keras/api/keras/applications/vgg16/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/applications/vgg16/__init__.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.applications.vgg16 import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.vgg16 import preprocess_input
-from tensorflow.python.keras._impl.keras.applications.vgg16 import VGG16
+from tensorflow.python.keras.applications.vgg16 import decode_predictions
+from tensorflow.python.keras.applications.vgg16 import preprocess_input
+from tensorflow.python.keras.applications.vgg16 import VGG16
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/applications/vgg19/__init__.py b/tensorflow/contrib/keras/api/keras/applications/vgg19/__init__.py
index cda52628f3c10d65fdbe70b2f86cc12c771870a9..afb3abebdd6735e6f17bc94c1fcd15a31b74f983 100644
--- a/tensorflow/contrib/keras/api/keras/applications/vgg19/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/applications/vgg19/__init__.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.applications.vgg19 import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.vgg19 import preprocess_input
-from tensorflow.python.keras._impl.keras.applications.vgg19 import VGG19
+from tensorflow.python.keras.applications.vgg19 import decode_predictions
+from tensorflow.python.keras.applications.vgg19 import preprocess_input
+from tensorflow.python.keras.applications.vgg19 import VGG19
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/applications/xception/__init__.py b/tensorflow/contrib/keras/api/keras/applications/xception/__init__.py
index ae9cd9cd18c5ccc5ec37c8cd1bf36f8aabd9929c..2e3335d02aff0fff805fc2dac614b14e0593d40d 100644
--- a/tensorflow/contrib/keras/api/keras/applications/xception/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/applications/xception/__init__.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.applications.xception import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.xception import preprocess_input
-from tensorflow.python.keras._impl.keras.applications.xception import Xception
+from tensorflow.python.keras.applications.xception import decode_predictions
+from tensorflow.python.keras.applications.xception import preprocess_input
+from tensorflow.python.keras.applications.xception import Xception
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/backend/__init__.py b/tensorflow/contrib/keras/api/keras/backend/__init__.py
index 10ef5a75852deb6595bced2703d7c5f29b0efac3..a755364014206e92289eec0b9c8e510251862e0e 100644
--- a/tensorflow/contrib/keras/api/keras/backend/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/backend/__init__.py
@@ -19,144 +19,144 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=redefined-builtin
-from tensorflow.python.keras._impl.keras.backend import abs
-from tensorflow.python.keras._impl.keras.backend import all
-from tensorflow.python.keras._impl.keras.backend import any
-from tensorflow.python.keras._impl.keras.backend import arange
-from tensorflow.python.keras._impl.keras.backend import argmax
-from tensorflow.python.keras._impl.keras.backend import argmin
-from tensorflow.python.keras._impl.keras.backend import backend
-from tensorflow.python.keras._impl.keras.backend import batch_dot
-from tensorflow.python.keras._impl.keras.backend import batch_flatten
-from tensorflow.python.keras._impl.keras.backend import batch_get_value
-from tensorflow.python.keras._impl.keras.backend import batch_normalization
-from tensorflow.python.keras._impl.keras.backend import batch_set_value
-from tensorflow.python.keras._impl.keras.backend import bias_add
-from tensorflow.python.keras._impl.keras.backend import binary_crossentropy
-from tensorflow.python.keras._impl.keras.backend import cast
-from tensorflow.python.keras._impl.keras.backend import cast_to_floatx
-from tensorflow.python.keras._impl.keras.backend import categorical_crossentropy
-from tensorflow.python.keras._impl.keras.backend import clear_session
-from tensorflow.python.keras._impl.keras.backend import clip
-from tensorflow.python.keras._impl.keras.backend import concatenate
-from tensorflow.python.keras._impl.keras.backend import constant
-from tensorflow.python.keras._impl.keras.backend import conv1d
-from tensorflow.python.keras._impl.keras.backend import conv2d
-from tensorflow.python.keras._impl.keras.backend import conv2d_transpose
-from tensorflow.python.keras._impl.keras.backend import conv3d
-from tensorflow.python.keras._impl.keras.backend import cos
-from tensorflow.python.keras._impl.keras.backend import count_params
-from tensorflow.python.keras._impl.keras.backend import ctc_batch_cost
-from tensorflow.python.keras._impl.keras.backend import ctc_decode
-from tensorflow.python.keras._impl.keras.backend import ctc_label_dense_to_sparse
-from tensorflow.python.keras._impl.keras.backend import dot
-from tensorflow.python.keras._impl.keras.backend import dropout
-from tensorflow.python.keras._impl.keras.backend import dtype
-from tensorflow.python.keras._impl.keras.backend import elu
-from tensorflow.python.keras._impl.keras.backend import epsilon
-from tensorflow.python.keras._impl.keras.backend import equal
-from tensorflow.python.keras._impl.keras.backend import eval
-from tensorflow.python.keras._impl.keras.backend import exp
-from tensorflow.python.keras._impl.keras.backend import expand_dims
-from tensorflow.python.keras._impl.keras.backend import eye
-from tensorflow.python.keras._impl.keras.backend import flatten
-from tensorflow.python.keras._impl.keras.backend import floatx
-from tensorflow.python.keras._impl.keras.backend import foldl
-from tensorflow.python.keras._impl.keras.backend import foldr
-from tensorflow.python.keras._impl.keras.backend import function
-from tensorflow.python.keras._impl.keras.backend import gather
-from tensorflow.python.keras._impl.keras.backend import get_session
-from tensorflow.python.keras._impl.keras.backend import get_uid
-from tensorflow.python.keras._impl.keras.backend import get_value
-from tensorflow.python.keras._impl.keras.backend import gradients
-from tensorflow.python.keras._impl.keras.backend import greater
-from tensorflow.python.keras._impl.keras.backend import greater_equal
-from tensorflow.python.keras._impl.keras.backend import hard_sigmoid
-from tensorflow.python.keras._impl.keras.backend import image_data_format
-from tensorflow.python.keras._impl.keras.backend import in_test_phase
-from tensorflow.python.keras._impl.keras.backend import in_top_k
-from tensorflow.python.keras._impl.keras.backend import in_train_phase
-from tensorflow.python.keras._impl.keras.backend import int_shape
-from tensorflow.python.keras._impl.keras.backend import is_sparse
-from tensorflow.python.keras._impl.keras.backend import l2_normalize
-from tensorflow.python.keras._impl.keras.backend import learning_phase
-from tensorflow.python.keras._impl.keras.backend import less
-from tensorflow.python.keras._impl.keras.backend import less_equal
-from tensorflow.python.keras._impl.keras.backend import log
-from tensorflow.python.keras._impl.keras.backend import manual_variable_initialization
-from tensorflow.python.keras._impl.keras.backend import map_fn
-from tensorflow.python.keras._impl.keras.backend import max
-from tensorflow.python.keras._impl.keras.backend import maximum
-from tensorflow.python.keras._impl.keras.backend import mean
-from tensorflow.python.keras._impl.keras.backend import min
-from tensorflow.python.keras._impl.keras.backend import minimum
-from tensorflow.python.keras._impl.keras.backend import moving_average_update
-from tensorflow.python.keras._impl.keras.backend import name_scope
-from tensorflow.python.keras._impl.keras.backend import ndim
-from tensorflow.python.keras._impl.keras.backend import normalize_batch_in_training
-from tensorflow.python.keras._impl.keras.backend import not_equal
-from tensorflow.python.keras._impl.keras.backend import one_hot
-from tensorflow.python.keras._impl.keras.backend import ones
-from tensorflow.python.keras._impl.keras.backend import ones_like
-from tensorflow.python.keras._impl.keras.backend import permute_dimensions
-from tensorflow.python.keras._impl.keras.backend import placeholder
-from tensorflow.python.keras._impl.keras.backend import pool2d
-from tensorflow.python.keras._impl.keras.backend import pool3d
-from tensorflow.python.keras._impl.keras.backend import pow
-from tensorflow.python.keras._impl.keras.backend import print_tensor
-from tensorflow.python.keras._impl.keras.backend import prod
-from tensorflow.python.keras._impl.keras.backend import random_binomial
-from tensorflow.python.keras._impl.keras.backend import random_normal
-from tensorflow.python.keras._impl.keras.backend import random_normal_variable
-from tensorflow.python.keras._impl.keras.backend import random_uniform
-from tensorflow.python.keras._impl.keras.backend import random_uniform_variable
-from tensorflow.python.keras._impl.keras.backend import relu
-from tensorflow.python.keras._impl.keras.backend import repeat
-from tensorflow.python.keras._impl.keras.backend import repeat_elements
-from tensorflow.python.keras._impl.keras.backend import reset_uids
-from tensorflow.python.keras._impl.keras.backend import reshape
-from tensorflow.python.keras._impl.keras.backend import resize_images
-from tensorflow.python.keras._impl.keras.backend import resize_volumes
-from tensorflow.python.keras._impl.keras.backend import reverse
-from tensorflow.python.keras._impl.keras.backend import rnn
-from tensorflow.python.keras._impl.keras.backend import round
-from tensorflow.python.keras._impl.keras.backend import separable_conv2d
-from tensorflow.python.keras._impl.keras.backend import set_epsilon
-from tensorflow.python.keras._impl.keras.backend import set_floatx
-from tensorflow.python.keras._impl.keras.backend import set_image_data_format
-from tensorflow.python.keras._impl.keras.backend import set_learning_phase
-from tensorflow.python.keras._impl.keras.backend import set_session
-from tensorflow.python.keras._impl.keras.backend import set_value
-from tensorflow.python.keras._impl.keras.backend import shape
-from tensorflow.python.keras._impl.keras.backend import sigmoid
-from tensorflow.python.keras._impl.keras.backend import sign
-from tensorflow.python.keras._impl.keras.backend import sin
-from tensorflow.python.keras._impl.keras.backend import softmax
-from tensorflow.python.keras._impl.keras.backend import softplus
-from tensorflow.python.keras._impl.keras.backend import softsign
-from tensorflow.python.keras._impl.keras.backend import sparse_categorical_crossentropy
-from tensorflow.python.keras._impl.keras.backend import spatial_2d_padding
-from tensorflow.python.keras._impl.keras.backend import spatial_3d_padding
-from tensorflow.python.keras._impl.keras.backend import sqrt
-from tensorflow.python.keras._impl.keras.backend import square
-from tensorflow.python.keras._impl.keras.backend import squeeze
-from tensorflow.python.keras._impl.keras.backend import stack
-from tensorflow.python.keras._impl.keras.backend import std
-from tensorflow.python.keras._impl.keras.backend import stop_gradient
-from tensorflow.python.keras._impl.keras.backend import sum
-from tensorflow.python.keras._impl.keras.backend import switch
-from tensorflow.python.keras._impl.keras.backend import tanh
-from tensorflow.python.keras._impl.keras.backend import temporal_padding
-from tensorflow.python.keras._impl.keras.backend import to_dense
-from tensorflow.python.keras._impl.keras.backend import transpose
-from tensorflow.python.keras._impl.keras.backend import truncated_normal
-from tensorflow.python.keras._impl.keras.backend import update
-from tensorflow.python.keras._impl.keras.backend import update_add
-from tensorflow.python.keras._impl.keras.backend import update_sub
-from tensorflow.python.keras._impl.keras.backend import var
-from tensorflow.python.keras._impl.keras.backend import variable
-from tensorflow.python.keras._impl.keras.backend import zeros
-from tensorflow.python.keras._impl.keras.backend import zeros_like
+from tensorflow.python.keras.backend import abs
+from tensorflow.python.keras.backend import all
+from tensorflow.python.keras.backend import any
+from tensorflow.python.keras.backend import arange
+from tensorflow.python.keras.backend import argmax
+from tensorflow.python.keras.backend import argmin
+from tensorflow.python.keras.backend import backend
+from tensorflow.python.keras.backend import batch_dot
+from tensorflow.python.keras.backend import batch_flatten
+from tensorflow.python.keras.backend import batch_get_value
+from tensorflow.python.keras.backend import batch_normalization
+from tensorflow.python.keras.backend import batch_set_value
+from tensorflow.python.keras.backend import bias_add
+from tensorflow.python.keras.backend import binary_crossentropy
+from tensorflow.python.keras.backend import cast
+from tensorflow.python.keras.backend import cast_to_floatx
+from tensorflow.python.keras.backend import categorical_crossentropy
+from tensorflow.python.keras.backend import clear_session
+from tensorflow.python.keras.backend import clip
+from tensorflow.python.keras.backend import concatenate
+from tensorflow.python.keras.backend import constant
+from tensorflow.python.keras.backend import conv1d
+from tensorflow.python.keras.backend import conv2d
+from tensorflow.python.keras.backend import conv2d_transpose
+from tensorflow.python.keras.backend import conv3d
+from tensorflow.python.keras.backend import cos
+from tensorflow.python.keras.backend import count_params
+from tensorflow.python.keras.backend import ctc_batch_cost
+from tensorflow.python.keras.backend import ctc_decode
+from tensorflow.python.keras.backend import ctc_label_dense_to_sparse
+from tensorflow.python.keras.backend import dot
+from tensorflow.python.keras.backend import dropout
+from tensorflow.python.keras.backend import dtype
+from tensorflow.python.keras.backend import elu
+from tensorflow.python.keras.backend import epsilon
+from tensorflow.python.keras.backend import equal
+from tensorflow.python.keras.backend import eval
+from tensorflow.python.keras.backend import exp
+from tensorflow.python.keras.backend import expand_dims
+from tensorflow.python.keras.backend import eye
+from tensorflow.python.keras.backend import flatten
+from tensorflow.python.keras.backend import floatx
+from tensorflow.python.keras.backend import foldl
+from tensorflow.python.keras.backend import foldr
+from tensorflow.python.keras.backend import function
+from tensorflow.python.keras.backend import gather
+from tensorflow.python.keras.backend import get_session
+from tensorflow.python.keras.backend import get_uid
+from tensorflow.python.keras.backend import get_value
+from tensorflow.python.keras.backend import gradients
+from tensorflow.python.keras.backend import greater
+from tensorflow.python.keras.backend import greater_equal
+from tensorflow.python.keras.backend import hard_sigmoid
+from tensorflow.python.keras.backend import image_data_format
+from tensorflow.python.keras.backend import in_test_phase
+from tensorflow.python.keras.backend import in_top_k
+from tensorflow.python.keras.backend import in_train_phase
+from tensorflow.python.keras.backend import int_shape
+from tensorflow.python.keras.backend import is_sparse
+from tensorflow.python.keras.backend import l2_normalize
+from tensorflow.python.keras.backend import learning_phase
+from tensorflow.python.keras.backend import less
+from tensorflow.python.keras.backend import less_equal
+from tensorflow.python.keras.backend import log
+from tensorflow.python.keras.backend import manual_variable_initialization
+from tensorflow.python.keras.backend import map_fn
+from tensorflow.python.keras.backend import max
+from tensorflow.python.keras.backend import maximum
+from tensorflow.python.keras.backend import mean
+from tensorflow.python.keras.backend import min
+from tensorflow.python.keras.backend import minimum
+from tensorflow.python.keras.backend import moving_average_update
+from tensorflow.python.keras.backend import name_scope
+from tensorflow.python.keras.backend import ndim
+from tensorflow.python.keras.backend import normalize_batch_in_training
+from tensorflow.python.keras.backend import not_equal
+from tensorflow.python.keras.backend import one_hot
+from tensorflow.python.keras.backend import ones
+from tensorflow.python.keras.backend import ones_like
+from tensorflow.python.keras.backend import permute_dimensions
+from tensorflow.python.keras.backend import placeholder
+from tensorflow.python.keras.backend import pool2d
+from tensorflow.python.keras.backend import pool3d
+from tensorflow.python.keras.backend import pow
+from tensorflow.python.keras.backend import print_tensor
+from tensorflow.python.keras.backend import prod
+from tensorflow.python.keras.backend import random_binomial
+from tensorflow.python.keras.backend import random_normal
+from tensorflow.python.keras.backend import random_normal_variable
+from tensorflow.python.keras.backend import random_uniform
+from tensorflow.python.keras.backend import random_uniform_variable
+from tensorflow.python.keras.backend import relu
+from tensorflow.python.keras.backend import repeat
+from tensorflow.python.keras.backend import repeat_elements
+from tensorflow.python.keras.backend import reset_uids
+from tensorflow.python.keras.backend import reshape
+from tensorflow.python.keras.backend import resize_images
+from tensorflow.python.keras.backend import resize_volumes
+from tensorflow.python.keras.backend import reverse
+from tensorflow.python.keras.backend import rnn
+from tensorflow.python.keras.backend import round
+from tensorflow.python.keras.backend import separable_conv2d
+from tensorflow.python.keras.backend import set_epsilon
+from tensorflow.python.keras.backend import set_floatx
+from tensorflow.python.keras.backend import set_image_data_format
+from tensorflow.python.keras.backend import set_learning_phase
+from tensorflow.python.keras.backend import set_session
+from tensorflow.python.keras.backend import set_value
+from tensorflow.python.keras.backend import shape
+from tensorflow.python.keras.backend import sigmoid
+from tensorflow.python.keras.backend import sign
+from tensorflow.python.keras.backend import sin
+from tensorflow.python.keras.backend import softmax
+from tensorflow.python.keras.backend import softplus
+from tensorflow.python.keras.backend import softsign
+from tensorflow.python.keras.backend import sparse_categorical_crossentropy
+from tensorflow.python.keras.backend import spatial_2d_padding
+from tensorflow.python.keras.backend import spatial_3d_padding
+from tensorflow.python.keras.backend import sqrt
+from tensorflow.python.keras.backend import square
+from tensorflow.python.keras.backend import squeeze
+from tensorflow.python.keras.backend import stack
+from tensorflow.python.keras.backend import std
+from tensorflow.python.keras.backend import stop_gradient
+from tensorflow.python.keras.backend import sum
+from tensorflow.python.keras.backend import switch
+from tensorflow.python.keras.backend import tanh
+from tensorflow.python.keras.backend import temporal_padding
+from tensorflow.python.keras.backend import to_dense
+from tensorflow.python.keras.backend import transpose
+from tensorflow.python.keras.backend import truncated_normal
+from tensorflow.python.keras.backend import update
+from tensorflow.python.keras.backend import update_add
+from tensorflow.python.keras.backend import update_sub
+from tensorflow.python.keras.backend import var
+from tensorflow.python.keras.backend import variable
+from tensorflow.python.keras.backend import zeros
+from tensorflow.python.keras.backend import zeros_like
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/callbacks/__init__.py b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
index 2d884790ddb9ccf49649c6af4cfd40cddbc38cb3..10e05f2969bc404d4cf3a9b7a999510cd40e3c17 100644
--- a/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
@@ -18,19 +18,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.callbacks import BaseLogger
-from tensorflow.python.keras._impl.keras.callbacks import Callback
-from tensorflow.python.keras._impl.keras.callbacks import CSVLogger
-from tensorflow.python.keras._impl.keras.callbacks import EarlyStopping
-from tensorflow.python.keras._impl.keras.callbacks import History
-from tensorflow.python.keras._impl.keras.callbacks import LambdaCallback
-from tensorflow.python.keras._impl.keras.callbacks import LearningRateScheduler
-from tensorflow.python.keras._impl.keras.callbacks import ModelCheckpoint
-from tensorflow.python.keras._impl.keras.callbacks import ProgbarLogger
-from tensorflow.python.keras._impl.keras.callbacks import ReduceLROnPlateau
-from tensorflow.python.keras._impl.keras.callbacks import RemoteMonitor
-from tensorflow.python.keras._impl.keras.callbacks import TensorBoard
-from tensorflow.python.keras._impl.keras.callbacks import TerminateOnNaN
+from tensorflow.python.keras.callbacks import BaseLogger
+from tensorflow.python.keras.callbacks import Callback
+from tensorflow.python.keras.callbacks import CSVLogger
+from tensorflow.python.keras.callbacks import EarlyStopping
+from tensorflow.python.keras.callbacks import History
+from tensorflow.python.keras.callbacks import LambdaCallback
+from tensorflow.python.keras.callbacks import LearningRateScheduler
+from tensorflow.python.keras.callbacks import ModelCheckpoint
+from tensorflow.python.keras.callbacks import ProgbarLogger
+from tensorflow.python.keras.callbacks import ReduceLROnPlateau
+from tensorflow.python.keras.callbacks import RemoteMonitor
+from tensorflow.python.keras.callbacks import TensorBoard
+from tensorflow.python.keras.callbacks import TerminateOnNaN
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/constraints/__init__.py b/tensorflow/contrib/keras/api/keras/constraints/__init__.py
index 152606d8ebbcadf57d971d508e15283da65e4aa3..08debf974ec3a36174c353ecaf9e425a9afc3f36 100644
--- a/tensorflow/contrib/keras/api/keras/constraints/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/constraints/__init__.py
@@ -19,21 +19,21 @@ from __future__ import division
 from __future__ import print_function
 
 # Constraints functions / callable classes.
-from tensorflow.python.keras._impl.keras.constraints import Constraint
-from tensorflow.python.keras._impl.keras.constraints import max_norm
-from tensorflow.python.keras._impl.keras.constraints import MaxNorm
-from tensorflow.python.keras._impl.keras.constraints import min_max_norm
-from tensorflow.python.keras._impl.keras.constraints import MinMaxNorm
-from tensorflow.python.keras._impl.keras.constraints import non_neg
-from tensorflow.python.keras._impl.keras.constraints import NonNeg
-from tensorflow.python.keras._impl.keras.constraints import unit_norm
-from tensorflow.python.keras._impl.keras.constraints import UnitNorm
+from tensorflow.python.keras.constraints import Constraint
+from tensorflow.python.keras.constraints import max_norm
+from tensorflow.python.keras.constraints import MaxNorm
+from tensorflow.python.keras.constraints import min_max_norm
+from tensorflow.python.keras.constraints import MinMaxNorm
+from tensorflow.python.keras.constraints import non_neg
+from tensorflow.python.keras.constraints import NonNeg
+from tensorflow.python.keras.constraints import unit_norm
+from tensorflow.python.keras.constraints import UnitNorm
 
 # Auxiliary utils.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.constraints import deserialize
-from tensorflow.python.keras._impl.keras.constraints import serialize
-from tensorflow.python.keras._impl.keras.constraints import get
+from tensorflow.python.keras.constraints import deserialize
+from tensorflow.python.keras.constraints import serialize
+from tensorflow.python.keras.constraints import get
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/datasets/boston_housing/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/boston_housing/__init__.py
index b5371a03fd5f5755ba8844415276113c565f52db..a5a6fdab445d2d5328f203b6a704f89e9bb4ce67 100644
--- a/tensorflow/contrib/keras/api/keras/datasets/boston_housing/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/datasets/boston_housing/__init__.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.datasets.boston_housing import load_data
+from tensorflow.python.keras.datasets.boston_housing import load_data
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/datasets/cifar10/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/cifar10/__init__.py
index 68d3eb789ea2c410095c0c75e0b79a9b07d209a3..e74e5f347df2eeb626cd781c54c9a7b76561d4e9 100644
--- a/tensorflow/contrib/keras/api/keras/datasets/cifar10/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/datasets/cifar10/__init__.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.datasets.cifar10 import load_data
+from tensorflow.python.keras.datasets.cifar10 import load_data
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/datasets/cifar100/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/cifar100/__init__.py
index ca93742673341660ba69712feb59c5dd32ea3252..8f5753a6360dfbddb5678c4f2c02adff86b5f0cb 100644
--- a/tensorflow/contrib/keras/api/keras/datasets/cifar100/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/datasets/cifar100/__init__.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.datasets.cifar100 import load_data
+from tensorflow.python.keras.datasets.cifar100 import load_data
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/datasets/imdb/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/imdb/__init__.py
index 1c6396d2d32b88eaa900a5af4e62c7484fceab63..bd6ec4b8dfb0344ad0b89956939607ef51bb0889 100644
--- a/tensorflow/contrib/keras/api/keras/datasets/imdb/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/datasets/imdb/__init__.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.datasets.imdb import get_word_index
-from tensorflow.python.keras._impl.keras.datasets.imdb import load_data
+from tensorflow.python.keras.datasets.imdb import get_word_index
+from tensorflow.python.keras.datasets.imdb import load_data
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/datasets/mnist/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/mnist/__init__.py
index 364255f3387b59a419c010db9b93cdfbcba36186..f61145655bd5d98965e15fecd387d538e9bc642b 100644
--- a/tensorflow/contrib/keras/api/keras/datasets/mnist/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/datasets/mnist/__init__.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.datasets.mnist import load_data
+from tensorflow.python.keras.datasets.mnist import load_data
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/datasets/reuters/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/reuters/__init__.py
index bb6791a344ad0c372ac60cd4a332f5632841dd46..ade31f4ea9c33204a4350e6bc3a5a2469e54fd61 100644
--- a/tensorflow/contrib/keras/api/keras/datasets/reuters/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/datasets/reuters/__init__.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.datasets.reuters import get_word_index
-from tensorflow.python.keras._impl.keras.datasets.reuters import load_data
+from tensorflow.python.keras.datasets.reuters import get_word_index
+from tensorflow.python.keras.datasets.reuters import load_data
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/initializers/__init__.py b/tensorflow/contrib/keras/api/keras/initializers/__init__.py
index 6b1fcfd2d9585d19ae3fd9705e128b19b1ec40e7..c6bdc4f0dac3f446238dc4cbc72fe4be278a5ff6 100644
--- a/tensorflow/contrib/keras/api/keras/initializers/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/initializers/__init__.py
@@ -19,30 +19,30 @@ from __future__ import division
 from __future__ import print_function
 
 # Initializer functions / callable classes.
-from tensorflow.python.keras._impl.keras.initializers import Constant
-from tensorflow.python.keras._impl.keras.initializers import Identity
-from tensorflow.python.keras._impl.keras.initializers import Initializer
-from tensorflow.python.keras._impl.keras.initializers import Ones
-from tensorflow.python.keras._impl.keras.initializers import Orthogonal
-from tensorflow.python.keras._impl.keras.initializers import RandomNormal
-from tensorflow.python.keras._impl.keras.initializers import RandomUniform
-from tensorflow.python.keras._impl.keras.initializers import TruncatedNormal
-from tensorflow.python.keras._impl.keras.initializers import VarianceScaling
-from tensorflow.python.keras._impl.keras.initializers import Zeros
+from tensorflow.python.keras.initializers import Constant
+from tensorflow.python.keras.initializers import Identity
+from tensorflow.python.keras.initializers import Initializer
+from tensorflow.python.keras.initializers import Ones
+from tensorflow.python.keras.initializers import Orthogonal
+from tensorflow.python.keras.initializers import RandomNormal
+from tensorflow.python.keras.initializers import RandomUniform
+from tensorflow.python.keras.initializers import TruncatedNormal
+from tensorflow.python.keras.initializers import VarianceScaling
+from tensorflow.python.keras.initializers import Zeros
 
 # Functional interface.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.initializers import glorot_normal
-from tensorflow.python.keras._impl.keras.initializers import glorot_uniform
-from tensorflow.python.keras._impl.keras.initializers import he_normal
-from tensorflow.python.keras._impl.keras.initializers import he_uniform
-from tensorflow.python.keras._impl.keras.initializers import lecun_normal
-from tensorflow.python.keras._impl.keras.initializers import lecun_uniform
+from tensorflow.python.keras.initializers import glorot_normal
+from tensorflow.python.keras.initializers import glorot_uniform
+from tensorflow.python.keras.initializers import he_normal
+from tensorflow.python.keras.initializers import he_uniform
+from tensorflow.python.keras.initializers import lecun_normal
+from tensorflow.python.keras.initializers import lecun_uniform
 
 # Auxiliary utils.
-from tensorflow.python.keras._impl.keras.initializers import deserialize
-from tensorflow.python.keras._impl.keras.initializers import serialize
-from tensorflow.python.keras._impl.keras.initializers import get
+from tensorflow.python.keras.initializers import deserialize
+from tensorflow.python.keras.initializers import serialize
+from tensorflow.python.keras.initializers import get
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/layers/__init__.py b/tensorflow/contrib/keras/api/keras/layers/__init__.py
index acf0a5e1799b7c57dfd82861c9ccc1f132c34375..938c881fcbe18623fa18c21c112375f9914f887b 100644
--- a/tensorflow/contrib/keras/api/keras/layers/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/layers/__init__.py
@@ -20,128 +20,128 @@ from __future__ import print_function
 
 # Generic layers.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.engine import Input
-from tensorflow.python.keras._impl.keras.engine import InputLayer
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras.engine import Input
+from tensorflow.python.keras.engine import InputLayer
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine import Layer
 
 # Advanced activations.
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import LeakyReLU
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import PReLU
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import ELU
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import ThresholdedReLU
+from tensorflow.python.keras.layers.advanced_activations import LeakyReLU
+from tensorflow.python.keras.layers.advanced_activations import PReLU
+from tensorflow.python.keras.layers.advanced_activations import ELU
+from tensorflow.python.keras.layers.advanced_activations import ThresholdedReLU
 
 # Convolution layers.
-from tensorflow.python.keras._impl.keras.layers.convolutional import Conv1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2DTranspose
-from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3DTranspose
-from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConv2D
+from tensorflow.python.keras.layers.convolutional import Conv1D
+from tensorflow.python.keras.layers.convolutional import Conv2D
+from tensorflow.python.keras.layers.convolutional import Conv3D
+from tensorflow.python.keras.layers.convolutional import Conv2DTranspose
+from tensorflow.python.keras.layers.convolutional import Conv3DTranspose
+from tensorflow.python.keras.layers.convolutional import SeparableConv2D
 
 # Convolution layer aliases.
-from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution2D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution2DTranspose
-from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3DTranspose
-from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution2D
+from tensorflow.python.keras.layers.convolutional import Convolution1D
+from tensorflow.python.keras.layers.convolutional import Convolution2D
+from tensorflow.python.keras.layers.convolutional import Convolution3D
+from tensorflow.python.keras.layers.convolutional import Convolution2DTranspose
+from tensorflow.python.keras.layers.convolutional import Convolution3DTranspose
+from tensorflow.python.keras.layers.convolutional import SeparableConvolution2D
 
 # Image processing layers.
-from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling2D
-from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling3D
-from tensorflow.python.keras._impl.keras.layers.convolutional import ZeroPadding1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import ZeroPadding2D
-from tensorflow.python.keras._impl.keras.layers.convolutional import ZeroPadding3D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping2D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping3D
+from tensorflow.python.keras.layers.convolutional import UpSampling1D
+from tensorflow.python.keras.layers.convolutional import UpSampling2D
+from tensorflow.python.keras.layers.convolutional import UpSampling3D
+from tensorflow.python.keras.layers.convolutional import ZeroPadding1D
+from tensorflow.python.keras.layers.convolutional import ZeroPadding2D
+from tensorflow.python.keras.layers.convolutional import ZeroPadding3D
+from tensorflow.python.keras.layers.convolutional import Cropping1D
+from tensorflow.python.keras.layers.convolutional import Cropping2D
+from tensorflow.python.keras.layers.convolutional import Cropping3D
 
 # Convolutional-recurrent layers.
-from tensorflow.python.keras._impl.keras.layers.convolutional_recurrent import ConvLSTM2D
+from tensorflow.python.keras.layers.convolutional_recurrent import ConvLSTM2D
 
 # Core layers.
-from tensorflow.python.keras._impl.keras.layers.core import Masking
-from tensorflow.python.keras._impl.keras.layers.core import Dropout
-from tensorflow.python.keras._impl.keras.layers.core import SpatialDropout1D
-from tensorflow.python.keras._impl.keras.layers.core import SpatialDropout2D
-from tensorflow.python.keras._impl.keras.layers.core import SpatialDropout3D
-from tensorflow.python.keras._impl.keras.layers.core import Activation
-from tensorflow.python.keras._impl.keras.layers.core import Reshape
-from tensorflow.python.keras._impl.keras.layers.core import Permute
-from tensorflow.python.keras._impl.keras.layers.core import Flatten
-from tensorflow.python.keras._impl.keras.layers.core import RepeatVector
-from tensorflow.python.keras._impl.keras.layers.core import Lambda
-from tensorflow.python.keras._impl.keras.layers.core import Dense
-from tensorflow.python.keras._impl.keras.layers.core import ActivityRegularization
+from tensorflow.python.keras.layers.core import Masking
+from tensorflow.python.keras.layers.core import Dropout
+from tensorflow.python.keras.layers.core import SpatialDropout1D
+from tensorflow.python.keras.layers.core import SpatialDropout2D
+from tensorflow.python.keras.layers.core import SpatialDropout3D
+from tensorflow.python.keras.layers.core import Activation
+from tensorflow.python.keras.layers.core import Reshape
+from tensorflow.python.keras.layers.core import Permute
+from tensorflow.python.keras.layers.core import Flatten
+from tensorflow.python.keras.layers.core import RepeatVector
+from tensorflow.python.keras.layers.core import Lambda
+from tensorflow.python.keras.layers.core import Dense
+from tensorflow.python.keras.layers.core import ActivityRegularization
 
 # Embedding layers.
-from tensorflow.python.keras._impl.keras.layers.embeddings import Embedding
+from tensorflow.python.keras.layers.embeddings import Embedding
 
 # Locally-connected layers.
-from tensorflow.python.keras._impl.keras.layers.local import LocallyConnected1D
-from tensorflow.python.keras._impl.keras.layers.local import LocallyConnected2D
+from tensorflow.python.keras.layers.local import LocallyConnected1D
+from tensorflow.python.keras.layers.local import LocallyConnected2D
 
 # Merge layers.
-from tensorflow.python.keras._impl.keras.layers.merge import Add
-from tensorflow.python.keras._impl.keras.layers.merge import Multiply
-from tensorflow.python.keras._impl.keras.layers.merge import Average
-from tensorflow.python.keras._impl.keras.layers.merge import Maximum
-from tensorflow.python.keras._impl.keras.layers.merge import Concatenate
-from tensorflow.python.keras._impl.keras.layers.merge import Dot
-from tensorflow.python.keras._impl.keras.layers.merge import add
-from tensorflow.python.keras._impl.keras.layers.merge import multiply
-from tensorflow.python.keras._impl.keras.layers.merge import average
-from tensorflow.python.keras._impl.keras.layers.merge import maximum
-from tensorflow.python.keras._impl.keras.layers.merge import concatenate
-from tensorflow.python.keras._impl.keras.layers.merge import dot
+from tensorflow.python.keras.layers.merge import Add
+from tensorflow.python.keras.layers.merge import Multiply
+from tensorflow.python.keras.layers.merge import Average
+from tensorflow.python.keras.layers.merge import Maximum
+from tensorflow.python.keras.layers.merge import Concatenate
+from tensorflow.python.keras.layers.merge import Dot
+from tensorflow.python.keras.layers.merge import add
+from tensorflow.python.keras.layers.merge import multiply
+from tensorflow.python.keras.layers.merge import average
+from tensorflow.python.keras.layers.merge import maximum
+from tensorflow.python.keras.layers.merge import concatenate
+from tensorflow.python.keras.layers.merge import dot
 
 # Noise layers.
-from tensorflow.python.keras._impl.keras.layers.noise import AlphaDropout
-from tensorflow.python.keras._impl.keras.layers.noise import GaussianNoise
-from tensorflow.python.keras._impl.keras.layers.noise import GaussianDropout
+from tensorflow.python.keras.layers.noise import AlphaDropout
+from tensorflow.python.keras.layers.noise import GaussianNoise
+from tensorflow.python.keras.layers.noise import GaussianDropout
 
 # Normalization layers.
-from tensorflow.python.keras._impl.keras.layers.normalization import BatchNormalization
+from tensorflow.python.keras.layers.normalization import BatchNormalization
 
 # Pooling layers.
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling1D
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling2D
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D
-from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling1D
-from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling2D
-from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling3D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAveragePooling1D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAveragePooling2D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAveragePooling3D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPooling1D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPooling2D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPooling3D
+from tensorflow.python.keras.layers.pooling import MaxPooling1D
+from tensorflow.python.keras.layers.pooling import MaxPooling2D
+from tensorflow.python.keras.layers.pooling import MaxPooling3D
+from tensorflow.python.keras.layers.pooling import AveragePooling1D
+from tensorflow.python.keras.layers.pooling import AveragePooling2D
+from tensorflow.python.keras.layers.pooling import AveragePooling3D
+from tensorflow.python.keras.layers.pooling import GlobalAveragePooling1D
+from tensorflow.python.keras.layers.pooling import GlobalAveragePooling2D
+from tensorflow.python.keras.layers.pooling import GlobalAveragePooling3D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPooling1D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPooling2D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPooling3D
 
 # Pooling layer aliases.
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPool1D
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPool2D
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPool3D
-from tensorflow.python.keras._impl.keras.layers.pooling import AvgPool1D
-from tensorflow.python.keras._impl.keras.layers.pooling import AvgPool2D
-from tensorflow.python.keras._impl.keras.layers.pooling import AvgPool3D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAvgPool1D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAvgPool2D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAvgPool3D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool1D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool2D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool3D
+from tensorflow.python.keras.layers.pooling import MaxPool1D
+from tensorflow.python.keras.layers.pooling import MaxPool2D
+from tensorflow.python.keras.layers.pooling import MaxPool3D
+from tensorflow.python.keras.layers.pooling import AvgPool1D
+from tensorflow.python.keras.layers.pooling import AvgPool2D
+from tensorflow.python.keras.layers.pooling import AvgPool3D
+from tensorflow.python.keras.layers.pooling import GlobalAvgPool1D
+from tensorflow.python.keras.layers.pooling import GlobalAvgPool2D
+from tensorflow.python.keras.layers.pooling import GlobalAvgPool3D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPool1D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPool2D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPool3D
 
 # Recurrent layers.
-from tensorflow.python.keras._impl.keras.layers.recurrent import SimpleRNN
-from tensorflow.python.keras._impl.keras.layers.recurrent import GRU
-from tensorflow.python.keras._impl.keras.layers.recurrent import LSTM
+from tensorflow.python.keras.layers.recurrent import SimpleRNN
+from tensorflow.python.keras.layers.recurrent import GRU
+from tensorflow.python.keras.layers.recurrent import LSTM
 
 # Wrapper functions
-from tensorflow.python.keras._impl.keras.layers.wrappers import Wrapper
-from tensorflow.python.keras._impl.keras.layers.wrappers import Bidirectional
-from tensorflow.python.keras._impl.keras.layers.wrappers import TimeDistributed
+from tensorflow.python.keras.layers.wrappers import Wrapper
+from tensorflow.python.keras.layers.wrappers import Bidirectional
+from tensorflow.python.keras.layers.wrappers import TimeDistributed
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/losses/__init__.py b/tensorflow/contrib/keras/api/keras/losses/__init__.py
index 66721b694f5fd5fae7ca521ff56d4c6c6bce79b5..c4476a7bbd5056fa898468a46031bf3d8b1e44cf 100644
--- a/tensorflow/contrib/keras/api/keras/losses/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/losses/__init__.py
@@ -19,26 +19,26 @@ from __future__ import division
 from __future__ import print_function
 
 # Loss functions.
-from tensorflow.python.keras._impl.keras.losses import binary_crossentropy
-from tensorflow.python.keras._impl.keras.losses import categorical_crossentropy
-from tensorflow.python.keras._impl.keras.losses import categorical_hinge
-from tensorflow.python.keras._impl.keras.losses import cosine_proximity
-from tensorflow.python.keras._impl.keras.losses import hinge
-from tensorflow.python.keras._impl.keras.losses import kullback_leibler_divergence
-from tensorflow.python.keras._impl.keras.losses import logcosh
-from tensorflow.python.keras._impl.keras.losses import mean_absolute_error
-from tensorflow.python.keras._impl.keras.losses import mean_absolute_percentage_error
-from tensorflow.python.keras._impl.keras.losses import mean_squared_error
-from tensorflow.python.keras._impl.keras.losses import mean_squared_logarithmic_error
-from tensorflow.python.keras._impl.keras.losses import poisson
-from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crossentropy
-from tensorflow.python.keras._impl.keras.losses import squared_hinge
+from tensorflow.python.keras.losses import binary_crossentropy
+from tensorflow.python.keras.losses import categorical_crossentropy
+from tensorflow.python.keras.losses import categorical_hinge
+from tensorflow.python.keras.losses import cosine_proximity
+from tensorflow.python.keras.losses import hinge
+from tensorflow.python.keras.losses import kullback_leibler_divergence
+from tensorflow.python.keras.losses import logcosh
+from tensorflow.python.keras.losses import mean_absolute_error
+from tensorflow.python.keras.losses import mean_absolute_percentage_error
+from tensorflow.python.keras.losses import mean_squared_error
+from tensorflow.python.keras.losses import mean_squared_logarithmic_error
+from tensorflow.python.keras.losses import poisson
+from tensorflow.python.keras.losses import sparse_categorical_crossentropy
+from tensorflow.python.keras.losses import squared_hinge
 
 # Auxiliary utils.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.losses import deserialize
-from tensorflow.python.keras._impl.keras.losses import serialize
-from tensorflow.python.keras._impl.keras.losses import get
+from tensorflow.python.keras.losses import deserialize
+from tensorflow.python.keras.losses import serialize
+from tensorflow.python.keras.losses import get
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/metrics/__init__.py b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
index 59faf037bce0f087d244a2faaeb52713bdc3b772..7317fdb52c5b79e787a49d71be49f5261d6b1fff 100644
--- a/tensorflow/contrib/keras/api/keras/metrics/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
@@ -19,28 +19,28 @@ from __future__ import division
 from __future__ import print_function
 
 # Metrics functions.
-from tensorflow.python.keras._impl.keras.metrics import binary_accuracy
-from tensorflow.python.keras._impl.keras.metrics import binary_crossentropy
-from tensorflow.python.keras._impl.keras.metrics import categorical_accuracy
-from tensorflow.python.keras._impl.keras.metrics import categorical_crossentropy
-from tensorflow.python.keras._impl.keras.metrics import cosine_proximity
-from tensorflow.python.keras._impl.keras.metrics import hinge
-from tensorflow.python.keras._impl.keras.metrics import kullback_leibler_divergence
-from tensorflow.python.keras._impl.keras.metrics import mean_absolute_error
-from tensorflow.python.keras._impl.keras.metrics import mean_absolute_percentage_error
-from tensorflow.python.keras._impl.keras.metrics import mean_squared_error
-from tensorflow.python.keras._impl.keras.metrics import mean_squared_logarithmic_error
-from tensorflow.python.keras._impl.keras.metrics import poisson
-from tensorflow.python.keras._impl.keras.metrics import sparse_categorical_crossentropy
-from tensorflow.python.keras._impl.keras.metrics import sparse_top_k_categorical_accuracy
-from tensorflow.python.keras._impl.keras.metrics import squared_hinge
-from tensorflow.python.keras._impl.keras.metrics import top_k_categorical_accuracy
+from tensorflow.python.keras.metrics import binary_accuracy
+from tensorflow.python.keras.metrics import binary_crossentropy
+from tensorflow.python.keras.metrics import categorical_accuracy
+from tensorflow.python.keras.metrics import categorical_crossentropy
+from tensorflow.python.keras.metrics import cosine_proximity
+from tensorflow.python.keras.metrics import hinge
+from tensorflow.python.keras.metrics import kullback_leibler_divergence
+from tensorflow.python.keras.metrics import mean_absolute_error
+from tensorflow.python.keras.metrics import mean_absolute_percentage_error
+from tensorflow.python.keras.metrics import mean_squared_error
+from tensorflow.python.keras.metrics import mean_squared_logarithmic_error
+from tensorflow.python.keras.metrics import poisson
+from tensorflow.python.keras.metrics import sparse_categorical_crossentropy
+from tensorflow.python.keras.metrics import sparse_top_k_categorical_accuracy
+from tensorflow.python.keras.metrics import squared_hinge
+from tensorflow.python.keras.metrics import top_k_categorical_accuracy
 
 # Auxiliary utils.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.metrics import deserialize
-from tensorflow.python.keras._impl.keras.metrics import serialize
-from tensorflow.python.keras._impl.keras.metrics import get
+from tensorflow.python.keras.metrics import deserialize
+from tensorflow.python.keras.metrics import serialize
+from tensorflow.python.keras.metrics import get
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/models/__init__.py b/tensorflow/contrib/keras/api/keras/models/__init__.py
index 2fb4ac0960d38f28a1c9c897a0f1aedf57e048ac..3a196984cd88cb60fbc2a9db306ce8fecf0febc0 100644
--- a/tensorflow/contrib/keras/api/keras/models/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/models/__init__.py
@@ -18,13 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.models import load_model
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.models import model_from_config
-from tensorflow.python.keras._impl.keras.models import model_from_json
-from tensorflow.python.keras._impl.keras.models import model_from_yaml
-from tensorflow.python.keras._impl.keras.models import save_model
-from tensorflow.python.keras._impl.keras.models import Sequential
+from tensorflow.python.keras.models import load_model
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.models import model_from_config
+from tensorflow.python.keras.models import model_from_json
+from tensorflow.python.keras.models import model_from_yaml
+from tensorflow.python.keras.models import save_model
+from tensorflow.python.keras.models import Sequential
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/optimizers/__init__.py b/tensorflow/contrib/keras/api/keras/optimizers/__init__.py
index 44f47bc47f4a0e31aaf2ac8f67cfdbef410d8c44..4849a06747958ab41b8b6309fa848aff3da3f633 100644
--- a/tensorflow/contrib/keras/api/keras/optimizers/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/optimizers/__init__.py
@@ -19,20 +19,20 @@ from __future__ import division
 from __future__ import print_function
 
 # Optimizer classes.
-from tensorflow.python.keras._impl.keras.optimizers import Adadelta
-from tensorflow.python.keras._impl.keras.optimizers import Adagrad
-from tensorflow.python.keras._impl.keras.optimizers import Adam
-from tensorflow.python.keras._impl.keras.optimizers import Adamax
-from tensorflow.python.keras._impl.keras.optimizers import Nadam
-from tensorflow.python.keras._impl.keras.optimizers import Optimizer
-from tensorflow.python.keras._impl.keras.optimizers import RMSprop
-from tensorflow.python.keras._impl.keras.optimizers import SGD
+from tensorflow.python.keras.optimizers import Adadelta
+from tensorflow.python.keras.optimizers import Adagrad
+from tensorflow.python.keras.optimizers import Adam
+from tensorflow.python.keras.optimizers import Adamax
+from tensorflow.python.keras.optimizers import Nadam
+from tensorflow.python.keras.optimizers import Optimizer
+from tensorflow.python.keras.optimizers import RMSprop
+from tensorflow.python.keras.optimizers import SGD
 
 # Auxiliary utils.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.optimizers import deserialize
-from tensorflow.python.keras._impl.keras.optimizers import serialize
-from tensorflow.python.keras._impl.keras.optimizers import get
+from tensorflow.python.keras.optimizers import deserialize
+from tensorflow.python.keras.optimizers import serialize
+from tensorflow.python.keras.optimizers import get
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py
index b96e7675527041d3952b049f5f431d3df36eea4c..1f9e82b41bf09b235e93fa512a50ea4c3047c01b 100644
--- a/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py
@@ -18,20 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.preprocessing.image import apply_transform
-from tensorflow.python.keras._impl.keras.preprocessing.image import array_to_img
-from tensorflow.python.keras._impl.keras.preprocessing.image import DirectoryIterator
-from tensorflow.python.keras._impl.keras.preprocessing.image import flip_axis
-from tensorflow.python.keras._impl.keras.preprocessing.image import ImageDataGenerator
-from tensorflow.python.keras._impl.keras.preprocessing.image import img_to_array
-from tensorflow.python.keras._impl.keras.preprocessing.image import Iterator
-from tensorflow.python.keras._impl.keras.preprocessing.image import load_img
-from tensorflow.python.keras._impl.keras.preprocessing.image import NumpyArrayIterator
-from tensorflow.python.keras._impl.keras.preprocessing.image import random_channel_shift
-from tensorflow.python.keras._impl.keras.preprocessing.image import random_rotation
-from tensorflow.python.keras._impl.keras.preprocessing.image import random_shear
-from tensorflow.python.keras._impl.keras.preprocessing.image import random_shift
-from tensorflow.python.keras._impl.keras.preprocessing.image import random_zoom
+from tensorflow.python.keras.preprocessing.image import apply_transform
+from tensorflow.python.keras.preprocessing.image import array_to_img
+from tensorflow.python.keras.preprocessing.image import DirectoryIterator
+from tensorflow.python.keras.preprocessing.image import flip_axis
+from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
+from tensorflow.python.keras.preprocessing.image import img_to_array
+from tensorflow.python.keras.preprocessing.image import Iterator
+from tensorflow.python.keras.preprocessing.image import load_img
+from tensorflow.python.keras.preprocessing.image import NumpyArrayIterator
+from tensorflow.python.keras.preprocessing.image import random_channel_shift
+from tensorflow.python.keras.preprocessing.image import random_rotation
+from tensorflow.python.keras.preprocessing.image import random_shear
+from tensorflow.python.keras.preprocessing.image import random_shift
+from tensorflow.python.keras.preprocessing.image import random_zoom
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/sequence/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/sequence/__init__.py
index 112f6af5e588bcb2e85fdbecea86f402742d44e7..9a93b6fb57ff5aaab25f2b606249a6022814b5e4 100644
--- a/tensorflow/contrib/keras/api/keras/preprocessing/sequence/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/preprocessing/sequence/__init__.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.preprocessing.sequence import make_sampling_table
-from tensorflow.python.keras._impl.keras.preprocessing.sequence import pad_sequences
-from tensorflow.python.keras._impl.keras.preprocessing.sequence import skipgrams
+from tensorflow.python.keras.preprocessing.sequence import make_sampling_table
+from tensorflow.python.keras.preprocessing.sequence import pad_sequences
+from tensorflow.python.keras.preprocessing.sequence import skipgrams
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/text/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/text/__init__.py
index 5bf1a2fb21dc27f7aa10cd08b1496e3991c61d2f..86386a9b6762d1c5cb3915ace64686cc25367e0f 100644
--- a/tensorflow/contrib/keras/api/keras/preprocessing/text/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/preprocessing/text/__init__.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.preprocessing.text import one_hot
-from tensorflow.python.keras._impl.keras.preprocessing.text import text_to_word_sequence
-from tensorflow.python.keras._impl.keras.preprocessing.text import Tokenizer
+from tensorflow.python.keras.preprocessing.text import one_hot
+from tensorflow.python.keras.preprocessing.text import text_to_word_sequence
+from tensorflow.python.keras.preprocessing.text import Tokenizer
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/regularizers/__init__.py b/tensorflow/contrib/keras/api/keras/regularizers/__init__.py
index 3e707ccab577b5e28febd83d91f84d7b1c0d5d82..d668e39c09ca28239e56763f111fb01939bedc69 100644
--- a/tensorflow/contrib/keras/api/keras/regularizers/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/regularizers/__init__.py
@@ -19,19 +19,19 @@ from __future__ import division
 from __future__ import print_function
 
 # Regularizer functions / callable classes.
-from tensorflow.python.keras._impl.keras.regularizers import L1L2
-from tensorflow.python.keras._impl.keras.regularizers import Regularizer
+from tensorflow.python.keras.regularizers import L1L2
+from tensorflow.python.keras.regularizers import Regularizer
 
 # Functional interface.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.regularizers import l1
-from tensorflow.python.keras._impl.keras.regularizers import l2
-from tensorflow.python.keras._impl.keras.regularizers import l1_l2
+from tensorflow.python.keras.regularizers import l1
+from tensorflow.python.keras.regularizers import l2
+from tensorflow.python.keras.regularizers import l1_l2
 
 # Auxiliary utils.
-from tensorflow.python.keras._impl.keras.regularizers import deserialize
-from tensorflow.python.keras._impl.keras.regularizers import serialize
-from tensorflow.python.keras._impl.keras.regularizers import get
+from tensorflow.python.keras.regularizers import deserialize
+from tensorflow.python.keras.regularizers import serialize
+from tensorflow.python.keras.regularizers import get
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/utils/__init__.py b/tensorflow/contrib/keras/api/keras/utils/__init__.py
index a7c2179fe7ad434356921a5fb8709aa5b1f33498..47cd01b924fb43e8a83836c58f8ced61e9e88268 100644
--- a/tensorflow/contrib/keras/api/keras/utils/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/utils/__init__.py
@@ -18,21 +18,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
-from tensorflow.python.keras._impl.keras.utils.data_utils import SequenceEnqueuer
-from tensorflow.python.keras._impl.keras.utils.generic_utils import custom_object_scope
-from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import get_custom_objects
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
-from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.io_utils import HDF5Matrix
-from tensorflow.python.keras._impl.keras.utils.layer_utils import convert_all_kernels_in_model
-from tensorflow.python.keras._impl.keras.utils.np_utils import normalize
-from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
-from tensorflow.python.keras._impl.keras.utils.vis_utils import plot_model
+from tensorflow.python.keras.utils.data_utils import GeneratorEnqueuer
+from tensorflow.python.keras.utils.data_utils import get_file
+from tensorflow.python.keras.utils.data_utils import Sequence
+from tensorflow.python.keras.utils.data_utils import SequenceEnqueuer
+from tensorflow.python.keras.utils.generic_utils import custom_object_scope
+from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.keras.utils.generic_utils import get_custom_objects
+from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras.utils.io_utils import HDF5Matrix
+from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model
+from tensorflow.python.keras.utils.np_utils import normalize
+from tensorflow.python.keras.utils.np_utils import to_categorical
+from tensorflow.python.keras.utils.vis_utils import plot_model
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/wrappers/scikit_learn/__init__.py b/tensorflow/contrib/keras/api/keras/wrappers/scikit_learn/__init__.py
index a46f859273ea0117e29a403057f9f81bc758dd52..c4b7aa765c26bafbfcfe45df02e58d1cf1064b4b 100644
--- a/tensorflow/contrib/keras/api/keras/wrappers/scikit_learn/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/wrappers/scikit_learn/__init__.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.wrappers.scikit_learn import KerasClassifier
-from tensorflow.python.keras._impl.keras.wrappers.scikit_learn import KerasRegressor
+from tensorflow.python.keras.wrappers.scikit_learn import KerasClassifier
+from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index d5b3b279a1b7327602790c0260349cb0c758aa86..7355a403aeef78cc7e76d58adfe114e4729f6595 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -381,7 +381,7 @@ py_test(
 
 py_test(
     name = "rev_block_lib_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/layers/rev_block_lib_test.py"],
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index 49c3faf3b7f5eaa3b1542a1fdddcfaff99737a24..60e1d85ea9c08a51763fdaf08853f8d9b67347e5 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -458,7 +458,7 @@ def scattered_embedding_lookup_sparse(params,
     return embeddings
 
 
-def embedding_lookup_unique(params, ids, name=None):
+def embedding_lookup_unique(params, ids, partition_strategy="mod", name=None):
   """Version of embedding_lookup that avoids duplicate lookups.
 
   This can save communication in the case of repeated ids.
@@ -470,6 +470,9 @@ def embedding_lookup_unique(params, ids, name=None):
       `PartitionedVariable`. Shape `[index, d1, d2, ...]`.
     ids: A one-dimensional `Tensor` with type `int32` or `int64` containing
       the ids to be looked up in `params`. Shape `[ids1, ids2, ...]`.
+    partition_strategy: A string specifying the partitioning strategy, relevant
+      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
+      is `"mod"`.
     name: A name for this operation (optional).
 
   Returns:
@@ -485,7 +488,8 @@ def embedding_lookup_unique(params, ids, name=None):
     ids_flat = array_ops.reshape(
         ids, math_ops.reduce_prod(shape, keepdims=True))
     unique_ids, idx = array_ops.unique(ids_flat)
-    unique_embeddings = embedding_ops.embedding_lookup(params, unique_ids)
+    unique_embeddings = embedding_ops.embedding_lookup(params, unique_ids,
+                                                       partition_strategy)
     embeds_flat = array_ops.gather(unique_embeddings, idx)
     embed_shape = array_ops.concat(
         [shape, array_ops.shape(unique_embeddings)[1:]], 0)
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index b01fd5d5c95ac15c76f9dbe7c77f7e76f12149a9..56e9194cebbe46907707f7ac0996f9a56fb53c0f 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1333,7 +1333,7 @@ class DropoutTest(test.TestCase):
     with self.test_session():
       images = np.random.uniform(size=(5, height, width, 3))
       output = _layers.dropout(images)
-      self.assertEqual(output.op.name, 'Dropout/dropout/mul')
+      self.assertEqual(output.op.name, 'Dropout/dropout_1/mul')
       output.get_shape().assert_is_compatible_with(
           ops.convert_to_tensor(images).get_shape())
 
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 3b053cd4c66952cf6c494186b16c17f38801bcaf..0fdbe8f6308e30db2043c400f37d7dcb6058d1f2 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -434,6 +434,7 @@ py_test(
     name = "kmeans_test",
     size = "medium",
     srcs = ["python/learn/estimators/kmeans_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
         "noasan",  # b/73741358
@@ -485,6 +486,7 @@ py_test(
     name = "state_saving_rnn_estimator_test",
     size = "medium",
     srcs = ["python/learn/estimators/state_saving_rnn_estimator_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["noasan"],
     deps = [
@@ -744,7 +746,7 @@ py_test(
 
 tf_py_test(
     name = "graph_io_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/learn/learn_io/graph_io_test.py"],
     additional_deps = [
         ":learn",
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index e28e6854a5097d66cb486be3e82f3726f5cc70fd..339c4e0e360ed9ef9906f0e51b64a0dc13826259 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -1862,12 +1862,12 @@ def _get_arguments(func):
   if hasattr(func, "__code__"):
     # Regular function.
     return tf_inspect.getargspec(func)
-  elif hasattr(func, "__call__"):
-    # Callable object.
-    return _get_arguments(func.__call__)
   elif hasattr(func, "func"):
     # Partial function.
     return _get_arguments(func.func)
+  elif hasattr(func, "__call__"):
+    # Callable object.
+    return _get_arguments(func.__call__)
 
 
 def _verify_loss_fn_args(loss_fn):
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 3744abd860e7f460133873eb534fd75887182f78..541da9061732ad271f6d5456446a9c30b81e58dd 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -38,19 +38,19 @@ from tensorflow.contrib.learn.python.learn import trainable
 from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.tpu.python.tpu import tpu_estimator
 from tensorflow.python.estimator import estimator as core_estimator
-from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import basic_session_run_hooks
 from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
+from tensorflow.python.util import function_utils
 
 __all__ = ["Experiment"]
 
 
 def _get_standardized_predicate_fn(predicate_fn):
-  pred_fn_args = estimator_util.fn_args(predicate_fn)
+  pred_fn_args = function_utils.fn_args(predicate_fn)
   if "checkpoint_path" not in pred_fn_args:
     # pylint: disable=unused-argument
     def _pred_fn_wrapper(eval_results, checkpoint_path):
@@ -468,10 +468,15 @@ class Experiment(object):
         on which that evaluation was based.
         At the beginning of evaluation, the passed `eval_results` will be None
         so it's expected that the predicate function handles that gracefully.
-        When `predicate_fn` is not specified, continuous eval will run in an
-        infinite loop (if `train_steps` is None). or exit once global step
-        reaches `train_steps`.
-
+        Continuous eval behavior under different conditions:
+          * When `predicate_fn` is specified:
+            + if `train_steps` is None, run until `predicate_fn` returns False.
+            + if `train_steps` is specified, run until either global step
+              reaches `train_steps` or `predicate_fn` returns False.
+          * When `predicate_fn` is not specified:
+            + if `train_steps` is None, run in an infinite loop.
+            + if `train_steps` is specified, run until global step reaches
+              `train_steps`.
       export: Whether to export from this step. Default is 'True'.
 
     Raises:
diff --git a/tensorflow/contrib/lite/Android.bp b/tensorflow/contrib/lite/Android.bp
index 8301f9263693eb8254ae8351d3177f9d6165bb0b..bd470696c5879822f9a75b77b3ec132500cd0d34 100644
--- a/tensorflow/contrib/lite/Android.bp
+++ b/tensorflow/contrib/lite/Android.bp
@@ -45,6 +45,7 @@ cc_library_static {
         "graph_info.cc",
         "interpreter.cc",
         "model.cc",
+        "op_resolver.cc",	
         "nnapi_delegate.cc",
         "optional_debug_tools.cc",
         "simple_memory_arena.cc",
diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 10065e894c48d48b8b7136895c55599c8854e03b..55b984f260ec49ab9b52be6402885a46226cba70 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -6,8 +6,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
 
-exports_files(["LICENSE"])
-
 exports_files(glob([
     "testdata/*.bin",
     "testdata/*.pb",
@@ -114,6 +112,7 @@ cc_library(
         "interpreter.cc",
         "model.cc",
         "nnapi_delegate.cc",
+        "op_resolver.cc",
         "optional_debug_tools.cc",
     ],
     hdrs = [
@@ -124,6 +123,7 @@ cc_library(
         "interpreter.h",
         "model.h",
         "nnapi_delegate.h",
+        "op_resolver.h",
         "optional_debug_tools.h",
     ],
     copts = tflite_copts(),
@@ -226,6 +226,18 @@ cc_test(
     ],
 )
 
+# Test OpResolver.
+cc_test(
+    name = "op_resolver_test",
+    size = "small",
+    srcs = ["op_resolver_test.cc"],
+    deps = [
+        ":framework",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 # Test the C extension API code.
 cc_test(
     name = "context_test",
diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/Makefile
index e4f86e258afe3df9ba149c82066b6d145f332488..cc8a8035d1dadeec98886ba1dae4cdf403f26de4 100644
--- a/tensorflow/contrib/lite/Makefile
+++ b/tensorflow/contrib/lite/Makefile
@@ -29,7 +29,7 @@ GENDIR := $(MAKEFILE_DIR)/gen/obj/
 CXX := $(CC_PREFIX)gcc
 CXXFLAGS := --std=c++11 -O3 -DNDEBUG
 CC := $(CC_PREFIX)gcc
-CFLAGS := -O3 -DNDEBUG
+CCFLAGS := -O3 -DNDEBUG
 LDOPTS :=
 LDOPTS += -L/usr/local/lib
 ARFLAGS := -r
diff --git a/tensorflow/contrib/lite/RELEASE.md b/tensorflow/contrib/lite/RELEASE.md
new file mode 100644
index 0000000000000000000000000000000000000000..8fd63d5cee7db38fadf63ab8530bef7a3d99dd0d
--- /dev/null
+++ b/tensorflow/contrib/lite/RELEASE.md
@@ -0,0 +1,8 @@
+# Release 0.1.7
+
+* TensorFlow Lite 0.1.7 is based on tag `tflite-v0.1.7` (git commit
+  fa1db5eb0da85b5baccc2a46d534fdeb3bb473d0).
+* To reproduce the iOS library, it's required to cherry pick git commit
+  f1f1d5172fe5bfeaeb2cf657ffc43ba744187bee to fix a dependency issue.
+* The code is based on TensorFlow 1.8.0 release candidate and it's very close
+  to TensorFlow 1.8.0 release.
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 85216776823eab2ab3ac2a3bc666f21e312acc6c..9bfc0a0fbeff38fb77b6d67c1a2df37a6807528c 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -1,4 +1,8 @@
 """Generate Flatbuffer binary from json."""
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
 
 def tflite_copts():
   """Defines compile time flags."""
@@ -185,32 +189,102 @@ def json_to_tflite(name, src, out):
       tools = [flatc],
   )
 
-def gen_zipped_test_files(name, files):
+# This is the master list of generated examples that will be made into tests. A
+# function called make_XXX_tests() must also appear in generate_examples.py.
+# Disable a test by commenting it out. If you do, add a link to a bug or issue.
+def generated_test_models():
+    return [
+        "add",
+        "arg_max",
+        "avg_pool",
+        "batch_to_space_nd",
+        "concat",
+        "constant",
+        "control_dep",
+        "conv",
+        "depthwiseconv",
+        "div",
+        "exp",
+        "floor",
+        "fully_connected",
+        "fused_batch_norm",
+        "gather",
+        "global_batch_norm",
+        "greater",
+        "greater_equal",
+        "l2_pool",
+        "l2norm",
+        "less",
+        "less_equal",
+        "local_response_norm",
+        "log_softmax",
+        "max_pool",
+        "maximum",
+        "mean",
+        "minimum",
+        "mul",
+        "neg",
+        "pad",
+        "padv2",
+        # "prelu",
+        "relu",
+        "relu1",
+        "relu6",
+        "reshape",
+        "resize_bilinear",
+        "sigmoid",
+        "sin",
+        "slice",
+        "softmax",
+        "space_to_batch_nd",
+        "space_to_depth",
+        "split",
+        "squeeze",
+        "strided_slice",
+        "strided_slice_1d_exhaustive",
+        "sub",
+        "topk",
+        "transpose",
+        "transpose_conv",
+        "where",
+    ]
+
+def gen_zip_test(name, test_name, **kwargs):
+  """Generate a zipped-example test and its dependent zip files.
+
+  Args:
+    name: Resulting cc_test target name
+    test_name: Test targets this model. Comes from the list above.
+    **kwargs: tf_cc_test kwargs.
+  """
+  gen_zipped_test_file(
+      name = "zip_%s" % test_name,
+      file = "%s.zip" % test_name,
+  )
+  tf_cc_test(name, **kwargs)
+
+def gen_zipped_test_file(name, file):
   """Generate a zip file of tests by using :generate_examples.
 
   Args:
-    name: Name of output. We will produce "`name`_files" as a target.
-    files: A list of zip file basenames.
+    name: Name of output. We will produce "`file`.files" as a target.
+    file: The name of one of the generated_examples targets, e.g. "transpose"
   """
   toco = "//tensorflow/contrib/lite/toco:toco"
-  out_files = []
-  for f in files:
-    out_file = name + "/" + f
-    out_files.append(out_file)
-    native.genrule(
-        name = name + "_" + f + ".files",
-        cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco
-               + " --zip_to_output " + f + " $(@D)"),
-        outs = [out_file],
-        tools = [
-            ":generate_examples",
-            toco,
-        ],
-    )
+  native.genrule(
+      name = file + ".files",
+      cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco
+             + " --zip_to_output " + file + " $(@D)"),
+      outs = [file],
+      tools = [
+          ":generate_examples",
+          toco,
+      ],
+  )
 
   native.filegroup(
       name = name,
-      srcs = out_files,
+      srcs = [file],
   )
 
 def gen_selected_ops(name, model):
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 4910c89eaebabb7bd9a4e003b75fa6de4d5af69d..8660c653ae4c0c69e4f5ad8fae739c8c1db7414c 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -161,6 +161,9 @@ typedef struct {
 typedef struct {
 } TfLitePadParams;
 
+typedef struct {
+} TfLitePadV2Params;
+
 typedef struct {
   // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
   // For now we will fix the maximum possible number of dimensions.
@@ -227,6 +230,12 @@ typedef struct {
   TfLiteType output_type;
 } TfLiteArgMaxParams;
 
+typedef struct {
+  TfLitePadding padding;
+  int stride_width;
+  int stride_height;
+} TfLiteTransposeConvParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index 962a7a8970703268b1860875b449a0ff85f449e0..7e285186f45a61a451fd7328b061e16059049ea5 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -85,6 +85,14 @@ typedef enum {
   kTfLiteBuiltinMinimum = 57,
   kTfLiteBuiltinLess = 58,
   kTfLiteBuiltinNeg = 59,
+  kTfLiteBuiltinPadv2 = 60,
+  kTfLiteBuiltinGreater = 61,
+  kTfLiteBuiltinGreaterEqual = 62,
+  kTfLiteBuiltinLessEqual = 63,
+  kTfLiteBuiltinSelect = 64,
+  kTfLiteBuiltinSlice = 65,
+  kTfLiteBuiltinSin = 66,
+  kTfLiteBuiltinTransposeConv = 67,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index 12841d233cc1d3c5e1219fc505b1975d2a7fa3e3..4eb66cc225eb04923be9aaa445a335ad822c8a6f 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -370,13 +370,21 @@ typedef struct _TfLiteRegistration {
 
   // Builtin codes. If this kernel refers to a builtin this is the code
   // of the builtin. This is so we can do marshaling to other frameworks like
-  // NN API. Note, it is the responsibility of the registration binder to
-  // set this properly.
+  // NN API.
+  // Note: It is the responsibility of the registration binder to set this
+  // properly.
   int32_t builtin_code;
 
   // Custom op name. If the op is a builtin, this will be null.
+  // Note: It is the responsibility of the registration binder to set this
+  // properly.
   // WARNING: This is an experimental interface that is subject to change.
   const char* custom_name;
+
+  // The version of the op.
+  // Note: It is the responsibility of the registration binder to set this
+  // properly.
+  int version;
 } TfLiteRegistration;
 
 // WARNING: This is an experimental interface that is subject to change.
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
index 2a64c1de725b601e9b6e9325d9faacb37df0e626..e36218e4f12057a362af47c48454f7930fc495f2 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
@@ -62,8 +62,8 @@ void resize(T* out, uint8_t* in, int image_height, int image_width,
       {1, wanted_height, wanted_width, wanted_channels}, quant);
 
   ops::builtin::BuiltinOpResolver resolver;
-  TfLiteRegistration* resize_op =
-      resolver.FindOp(BuiltinOperator_RESIZE_BILINEAR);
+  const TfLiteRegistration* resize_op =
+      resolver.FindOp(BuiltinOperator_RESIZE_BILINEAR, 1);
   auto* params = reinterpret_cast<TfLiteResizeBilinearParams*>(
       malloc(sizeof(TfLiteResizeBilinearParams)));
   params->align_corners = false;
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc
index 456c5c6dc782f4e21a5062e353635117a39cacb9..966fcd2a31fd4d4ff2c3e91633550a8effa81ee8 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.cc
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc
@@ -77,14 +77,13 @@ void PrintProfilingInfo(const profiling::ProfileEvent* e, uint32_t op_index,
   // time (ms) , Node xxx, OpCode xxx, symblic name
   //      5.352, Node   5, OpCode   4, DEPTHWISE_CONV_2D
 
-
   LOG(INFO) << std::fixed << std::setw(10) << std::setprecision(3)
             << (e->end_timestamp_us - e->begin_timestamp_us) / 1000.0
             << ", Node " << std::setw(3) << std::setprecision(3) << op_index
             << ", OpCode " << std::setw(3) << std::setprecision(3)
             << registration.builtin_code << ", "
             << EnumNameBuiltinOperator(
-                   (BuiltinOperator)registration.builtin_code)
+                   static_cast<BuiltinOperator>(registration.builtin_code))
             << "\n";
 }
 
@@ -190,13 +189,13 @@ void RunInference(Settings* s) {
   if (s->profiling) profiler->StartProfiling();
 
   struct timeval start_time, stop_time;
-  gettimeofday(&start_time, NULL);
+  gettimeofday(&start_time, nullptr);
   for (int i = 0; i < s->loop_count; i++) {
     if (interpreter->Invoke() != kTfLiteOk) {
       LOG(FATAL) << "Failed to invoke tflite!\n";
     }
   }
-  gettimeofday(&stop_time, NULL);
+  gettimeofday(&stop_time, nullptr);
   LOG(INFO) << "invoked \n";
   LOG(INFO) << "average time: "
             << (get_us(stop_time) - get_us(start_time)) / (s->loop_count * 1000)
@@ -271,17 +270,17 @@ int Main(int argc, char** argv) {
   int c;
   while (1) {
     static struct option long_options[] = {
-        {"accelerated", required_argument, 0, 'a'},
-        {"count", required_argument, 0, 'c'},
-        {"verbose", required_argument, 0, 'v'},
-        {"image", required_argument, 0, 'i'},
-        {"labels", required_argument, 0, 'l'},
-        {"tflite_model", required_argument, 0, 'm'},
-        {"profiling", required_argument, 0, 'p'},
-        {"threads", required_argument, 0, 't'},
-        {"input_mean", required_argument, 0, 'b'},
-        {"input_std", required_argument, 0, 's'},
-        {0, 0, 0, 0}};
+        {"accelerated", required_argument, nullptr, 'a'},
+        {"count", required_argument, nullptr, 'c'},
+        {"verbose", required_argument, nullptr, 'v'},
+        {"image", required_argument, nullptr, 'i'},
+        {"labels", required_argument, nullptr, 'l'},
+        {"tflite_model", required_argument, nullptr, 'm'},
+        {"profiling", required_argument, nullptr, 'p'},
+        {"threads", required_argument, nullptr, 't'},
+        {"input_mean", required_argument, nullptr, 'b'},
+        {"input_std", required_argument, nullptr, 's'},
+        {nullptr, 0, nullptr, 0}};
 
     /* getopt_long stores the option index here. */
     int option_index = 0;
@@ -294,15 +293,14 @@ int Main(int argc, char** argv) {
 
     switch (c) {
       case 'a':
-        s.accel = strtol(  // NOLINT(runtime/deprecated_fn)
-            optarg, (char**)NULL, 10);
+        s.accel = strtol(optarg, nullptr, 10);  // NOLINT(runtime/deprecated_fn)
         break;
       case 'b':
-        s.input_mean = strtod(optarg, NULL);
+        s.input_mean = strtod(optarg, nullptr);
         break;
       case 'c':
-        s.loop_count = strtol(  // NOLINT(runtime/deprecated_fn)
-            optarg, (char**)NULL, 10);
+        s.loop_count =
+            strtol(optarg, nullptr, 10);  // NOLINT(runtime/deprecated_fn)
         break;
       case 'i':
         s.input_bmp_name = optarg;
@@ -314,19 +312,19 @@ int Main(int argc, char** argv) {
         s.model_name = optarg;
         break;
       case 'p':
-        s.profiling = strtol(  // NOLINT(runtime/deprecated_fn)
-            optarg, (char**)NULL, 10);
+        s.profiling =
+            strtol(optarg, nullptr, 10);  // NOLINT(runtime/deprecated_fn)
         break;
       case 's':
-        s.input_std = strtod(optarg, NULL);
+        s.input_std = strtod(optarg, nullptr);
         break;
       case 't':
         s.number_of_threads = strtol(  // NOLINT(runtime/deprecated_fn)
-            optarg, (char**)NULL, 10);
+            optarg, nullptr, 10);
         break;
       case 'v':
-        s.verbose = strtol(  // NOLINT(runtime/deprecated_fn)
-            optarg, (char**)NULL, 10);
+        s.verbose =
+            strtol(optarg, nullptr, 10);  // NOLINT(runtime/deprecated_fn)
         break;
       case 'h':
       case '?':
diff --git a/tensorflow/contrib/lite/g3doc/custom_operators.md b/tensorflow/contrib/lite/g3doc/custom_operators.md
index d7cc854ebac08e79d346df0aca6e1fa56b490156..972e57f73e82961ebc5e341dd7a41bc00acc5d21 100644
--- a/tensorflow/contrib/lite/g3doc/custom_operators.md
+++ b/tensorflow/contrib/lite/g3doc/custom_operators.md
@@ -39,7 +39,7 @@ TfLiteStatus SinPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
 
   int num_dims = NumDimensions(input);
@@ -54,7 +54,7 @@ TfLiteStatus SinPrepare(TfLiteContext* context, TfLiteNode* node) {
 
 TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
   using namespace tflite;
-  TfLiteTensor* input = GetInput(context, node,0);
+  const TfLiteTensor* input = GetInput(context, node,0);
   TfLiteTensor* output = GetOutput(context, node,0);
 
   float* input_data = input->data.f;
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index 0051ee84ec38f8acd77804c3d5a005001a44d258..d8c46e633151cba94ff3d2a3c8b0ab5c230f245e 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -132,9 +132,7 @@ TensorFlow operation not listed above are likely unsupported. Notably, the
 following common ops are not supported at the moment:
 
 *   [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space)
-*   [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather)
 *   [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear)
-*   [tf.slice](https://www.tensorflow.org/api_docs/python/tf/slice)
 *   [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh)
 
 ## TensorFlow Lite Operations
@@ -222,6 +220,23 @@ Options {
 }
 ```
 
+**CONV_2D_TRANSPOSE**
+
+```
+Inputs {
+  0: output_shape
+  1: filter
+  2: 4D tensor
+}
+Outputs {
+  0: the transpose (gradient) of conv2d
+}
+Options {
+  padding: SAME|VALID
+  stride_w,stride_h: stride of the filter window
+}
+```
+
 **DEPTHWISE_CONV_2D**
 
 ```
@@ -281,6 +296,45 @@ Options {
 }
 ```
 
+**GATHER**
+
+```
+Inputs {
+  0: params tensor
+  1: indices tensor
+  2: axis tensor (optional)
+}
+Outputs {
+  0: a tensor with same type as the params tensor.
+}
+```
+
+**GREATER**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: a tensor of type bool, true whenever an element of the first tensor is
+  greater than the corresponding element of the second tensor.
+}
+```
+
+**GREATER_EQUAL**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: a tensor of type bool, true whenever an element of the first tensor is
+  greater than or equal to the corresponding element of the second tensor.
+}
+```
+
 **L2_NORMALIZATION**
 
 ```
@@ -325,6 +379,19 @@ Outputs {
 }
 ```
 
+**LESS_EQUAL**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: a tensor of type bool, true whenever an element of the first tensor is less
+  than or equal to the corresponding element of the second tensor.
+}
+```
+
 **LOCAL_RESPONSE_NORMALIZATION**
 
 ```
@@ -484,6 +551,19 @@ Options {
 }
 ```
 
+**SLICE**
+
+```
+Inputs {
+  0: tensor
+  1: 1D tensor
+  2: 1D tensor
+}
+Outputs {
+  0: slice of the input tensor of the given size from the given begin index.
+}
+```
+
 **SOFTMAX**
 
 ```
@@ -569,7 +649,7 @@ Outputs {
   0: slice of the input tensor of the given size
 }
 Options {
-  begin_mask: mask for begin indicies
+  begin_mask: mask for begin indices
   end_mask: mask for end indices
   shrink_axis_mask: mask that indicates which dimensions to remove
 }
@@ -584,7 +664,7 @@ Inputs {
 }
 Outputs {
   0: k largest element along each last dimensional slice
-  1: indicies of values within the last dimension of the input ensor
+  1: indices of values within the last dimension of the input ensor
 }
 ```
 
@@ -600,6 +680,20 @@ Outputs {
 }
 ```
 
+**SELECT**
+
+```
+Inputs {
+  0: tensor
+  1: tensor
+  2: tensor
+}
+Outputs {
+  0: tensor that contains the elementwise values of 'tensor 1' if the
+  corresponding value of 'tensor 0' is true or the value of 'tensor 2' if false.
+}
+```
+
 And these are TensorFlow Lite operations that are present but not ready for
 custom models yet:
 
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 1074f64263b5d7d64cac57d011c5df1779abffc1..7315d8360680ca0d3c405dc80b593762275815ee 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -201,7 +201,7 @@ class Interpreter {
   // Overrides execution plan. This bounds checks indices sent in.
   TfLiteStatus SetExecutionPlan(const std::vector<int>& new_plan);
 
-  // Get a tensor data structure.
+  // Get a mutable tensor data structure.
   // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
   // read/write access to structure
   TfLiteTensor* tensor(int tensor_index) {
@@ -210,9 +210,14 @@ class Interpreter {
     return &context_.tensors[tensor_index];
   }
 
+  // Get an immutable tensor data structure.
+  const TfLiteTensor* tensor(int tensor_index) const {
+    if (tensor_index >= context_.tensors_size || tensor_index < 0)
+      return nullptr;
+    return &context_.tensors[tensor_index];
+  }
+
   // Get a pointer to an operation and registration data structure if in bounds.
-  // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
-  // read/write access to structure
   const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
       int node_index) const {
     if (node_index >= nodes_and_registration_.size() || node_index < 0)
@@ -220,7 +225,8 @@ class Interpreter {
     return &nodes_and_registration_[node_index];
   }
 
-  // Perform a checked cast to the appropriate tensor type.
+  // Perform a checked cast to the appropriate tensor type (mutable pointer
+  // version).
   template <class T>
   T* typed_tensor(int tensor_index) {
     if (TfLiteTensor* tensor_ptr = tensor(tensor_index)) {
@@ -231,20 +237,46 @@ class Interpreter {
     return nullptr;
   }
 
-  // Return a pointer into the data of a given input tensor. The given index
-  // must be between 0 and inputs().size().
+  // Perform a checked cast to the appropriate tensor type (immutable pointer
+  // version).
+  template <class T>
+  const T* typed_tensor(int tensor_index) const {
+    if (const TfLiteTensor* tensor_ptr = tensor(tensor_index)) {
+      if (tensor_ptr->type == typeToTfLiteType<T>()) {
+        return reinterpret_cast<const T*>(tensor_ptr->data.raw);
+      }
+    }
+    return nullptr;
+  }
+
+  // Return a mutable pointer into the data of a given input tensor. The given
+  // index must be between 0 and inputs().size().
   template <class T>
   T* typed_input_tensor(int index) {
     return typed_tensor<T>(inputs_[index]);
   }
 
-  // Return a pointer into the data of a given output tensor. The given index
-  // must be between 0 and outputs().size().
+  // Return an immutable pointer into the data of a given input tensor. The
+  // given index must be between 0 and inputs().size().
+  template <class T>
+  const T* typed_input_tensor(int index) const {
+    return typed_tensor<T>(inputs_[index]);
+  }
+
+  // Return a mutable pointer into the data of a given output tensor. The given
+  // index must be between 0 and outputs().size().
   template <class T>
   T* typed_output_tensor(int index) {
     return typed_tensor<T>(outputs_[index]);
   }
 
+  // Return an immutable pointer into the data of a given output tensor. The
+  // given index must be between 0 and outputs().size().
+  template <class T>
+  const T* typed_output_tensor(int index) const {
+    return typed_tensor<T>(outputs_[index]);
+  }
+
   // Change the dimensionality of a given tensor. Note, this is only acceptable
   // for tensor indices that are inputs.
   // Returns status of failure or success.
diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
index 1dda55b8edf8f85293c473b51b8a19066bac5f73..593af81a18a1e20a41dcc8d9bb3a1d815876e294 100644
--- a/tensorflow/contrib/lite/java/BUILD
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -1,7 +1,9 @@
 # Description:
 # TensorFlow Lite Java API.
 
-package(default_visibility = ["//visibility:private"])
+package(default_visibility = [
+    "//tensorflow/contrib/lite/java/ovic:__pkg__",
+])
 
 licenses(["notice"])  # Apache 2.0
 
@@ -46,23 +48,6 @@ android_library(
     ],
 )
 
-java_library(
-    name = "ovicbenchmarkerlib",
-    srcs = [
-        "ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java",
-        "ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java",
-    ],
-    javacopts = JAVACOPTS,
-    visibility = ["//visibility:public"],
-    deps = [
-        ":libtensorflowlite_jni.so",
-        ":tensorflowlite_java",
-        "//tensorflow/contrib/lite/java/src/main/native",
-        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
-        "@org_checkerframework_qual",
-    ],
-)
-
 java_library(
     name = "tensorflowlitelib",
     srcs = glob(
@@ -165,28 +150,6 @@ java_test(
     ],
 )
 
-java_test(
-    name = "OvicClassifierTest",
-    size = "medium",
-    srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
-    data = [
-        "ovic/src/testdata/float_model.lite",
-        "ovic/src/testdata/labels.txt",
-        "ovic/src/testdata/low_res_model.lite",
-        "ovic/src/testdata/quantized_model.lite",
-        "ovic/src/testdata/test_image_128.jpg",
-        "ovic/src/testdata/test_image_224.jpg",
-    ],
-    javacopts = JAVACOPTS,
-    test_class = "org.tensorflow.ovic.OvicClassifierTest",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":ovicbenchmarkerlib",
-        "@com_google_truth",
-        "@junit",
-    ],
-)
-
 filegroup(
     name = "libtensorflowlite_jni",
     srcs = select({
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml b/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml
index ba63dce5d9a7192a2c3c4c5561333d39a3ecc024..95b6b7016f2818127a89d2e9212aa231a5ec24b9 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml
@@ -31,6 +31,7 @@
         android:theme="@style/MaterialTheme">
 
         <activity android:name="com.example.android.tflitecamerademo.CameraActivity"
+                  android:screenOrientation="portrait"
                   android:label="@string/app_name">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
index 20f520814d7154764932638c5e9dddc32639b677..ef8a9e08450d72e392815756606f5ef8301cdd58 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
@@ -13,51 +13,55 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 -->
-<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:app="http://schemas.android.com/apk/res-auto"
+
+<LinearLayout
+    xmlns:android="http://schemas.android.com/apk/res/android"
     android:layout_width="match_parent"
-    android:layout_height="match_parent">
+    android:layout_height="match_parent"
+    android:background="#bb7700"
+    android:orientation="horizontal">
+
+  <com.example.android.tflitecamerademo.AutoFitTextureView
+      android:id="@+id/texture"
+      android:layout_width="0dp"
+      android:layout_height="match_parent"
+      android:layout_weight=".8"/>
+
+  <LinearLayout
+      android:layout_width="0dp"
+      android:layout_height="match_parent"
+      android:layout_weight=".2"
+      android:orientation="vertical">
+
+    <ImageView
+        android:id="@+id/logoview"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:scaleType="centerInside"
+        android:src="@drawable/logo"/>
 
-    <LinearLayout
+    <ToggleButton
+        android:id="@+id/button"
+        android:layout_width="match_parent"
+        android:layout_height="wrap_content"
+        android:textOff="@string/tflite"
+        android:textOn="@string/nnapi"/>
+    <NumberPicker
+        android:id="@+id/np"
+        android:layout_width="wrap_content"
+        android:layout_height="47dp"
+        android:layout_gravity="center_horizontal"
+        android:visibility="visible"/>
+
+    <TextView
+        android:id="@+id/text"
+        android:textStyle="bold"
         android:layout_width="match_parent"
         android:layout_height="match_parent"
-        android:background="#bb7700"
-        android:orientation="horizontal"
-        android:weightSum="100">
-
-        <LinearLayout
-            android:layout_width="match_parent"
-            android:layout_height="match_parent"
-            android:layout_weight="30"
-            android:orientation="vertical">
-
-            <com.example.android.tflitecamerademo.AutoFitTextureView
-                android:id="@+id/texture"
-                android:layout_width="match_parent"
-                android:layout_height="match_parent"
-                android:layout_weight="100" />
-
-            <ImageView
-                android:id="@+id/logoview"
-                android:layout_width="match_parent"
-                android:layout_height="wrap_content"
-                android:layout_weight="100"
-                android:scaleType="centerCrop"
-                android:src="@drawable/logo" />
-
-        </LinearLayout>
-
-        <TextView
-            android:id="@+id/text"
-            android:layout_width="match_parent"
-            android:layout_height="match_parent"
-            android:layout_weight="70"
-            android:paddingLeft="5dp"
-            android:paddingTop="20dp"
-            android:textColor="#FFF"
-            android:textSize="20sp"
-            android:textStyle="bold" />
-
-    </LinearLayout>
-
-</RelativeLayout>
+        android:paddingTop="20dp"
+        android:textColor="#FFF"
+        android:textSize="20sp"/>
+
+  </LinearLayout>
+</LinearLayout>
+
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
index 72a229ecdb19f5309994e994d82e0b5b5ed617a2..ddb099a950c2f83d7b2867f8f35d96885229536d 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
@@ -28,7 +28,7 @@
     <LinearLayout
         android:layout_width="wrap_content"
         android:layout_height="wrap_content"
-        android:layout_alignParentBottom="true"
+        android:layout_above="@+id/bottom_info_view"
         android:layout_alignParentEnd="false"
         android:layout_alignParentStart="true"
         android:layout_alignParentTop="false"
@@ -57,32 +57,39 @@
             android:textStyle="bold" />
 
     </LinearLayout>
+    <LinearLayout
+        android:orientation="horizontal"
+        android:background="#513400"
+        android:layout_alignParentBottom="true"
 
-    <RelativeLayout
-        android:id="@+id/control2"
         android:layout_width="match_parent"
-        android:layout_height="135dp"
-        android:layout_alignParentLeft="true"
-        android:layout_alignParentStart="true"
-        android:layout_alignTop="@+id/control"
-        android:layout_marginLeft="300dp"
-        android:layout_marginStart="300dp"
-        android:background="#bb7700">
-
+        android:id="@+id/bottom_info_view"
+        android:layout_marginBottom="10dp"
+        android:layout_height="50dp">
+        <TextView
+            android:layout_width="wrap_content"
+            android:layout_height="match_parent"
+            android:textColor="@android:color/white"
+            android:textAlignment="center"
+            android:gravity="center"
+            android:text="Threads:"/>
+        <NumberPicker
+            android:id="@+id/np"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_marginLeft="10dp"
+            android:theme="@style/AppTheme.Picker"
+            android:visibility="visible" />
         <ToggleButton
             android:id="@+id/button"
             android:textOff="@string/tflite"
             android:textOn="@string/nnapi"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
-            android:layout_alignParentLeft="true"
-            android:layout_alignParentStart="true" />
+            android:layout_marginLeft="10dp"
+            android:background="#0000000f"
+            android:textColor="@android:color/white" />
+    </LinearLayout>
+
 
-        <NumberPicker
-            android:id="@+id/np"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_below="@+id/button"
-            android:visibility="visible" />
-    </RelativeLayout>
 </RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
index d12435d5abda45917b8a4f12c4b3179997eae689..e567009a424ed77384bee193c47d4f4d253f5767 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
@@ -15,101 +15,80 @@
 -->
 <RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
     xmlns:app="http://schemas.android.com/apk/res-auto"
-    xmlns:tools="http://schemas.android.com/tools"
     android:layout_width="match_parent"
-    android:layout_height="match_parent">
+    android:layout_height="match_parent"
+    android:background="#bb7700">
 
-    <LinearLayout
+    <com.example.android.tflitecamerademo.AutoFitTextureView
+        android:id="@+id/texture"
         android:layout_width="match_parent"
         android:layout_height="match_parent"
-        android:orientation="vertical"
-        android:weightSum="60">
+        android:layout_weight="1" />
 
-        <FrameLayout
-            android:id="@+id/control"
-            android:layout_width="match_parent"
-            android:layout_height="match_parent"
-            android:layout_alignParentBottom="true"
-            android:layout_alignParentStart="true"
-            android:layout_weight="60"
-            android:background="#cc7700"
-            android:paddingLeft="20dp"
-            android:paddingStart="20dp">
-
-        </FrameLayout>
-
-    <com.example.android.tflitecamerademo.AutoFitTextureView
-        android:id="@+id/texture"
+    <LinearLayout
         android:layout_width="wrap_content"
         android:layout_height="wrap_content"
+        android:layout_above="@+id/bottom_info_view"
+        android:layout_alignParentEnd="false"
         android:layout_alignParentStart="true"
-        android:layout_alignParentLeft="true"
-        android:layout_alignParentTop="true" />
+        android:layout_alignParentTop="false"
+        android:background="#bb7700"
+        android:orientation="vertical"
+        android:weightSum="100">
+
+        <ImageView
+            android:id="@+id/logoview2"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_weight="30"
+            android:scaleType="fitStart"
+            android:src="@drawable/logo" />
 
         <TextView
             android:id="@+id/text"
             android:layout_width="match_parent"
-            android:layout_height="match_parent"
-            android:layout_weight="20"
+            android:layout_height="wrap_content"
+            android:layout_alignParentBottom="true"
+            android:layout_alignParentEnd="true"
+            android:layout_alignParentRight="true"
+            android:layout_weight="30"
             android:textColor="#FFF"
             android:textSize="20sp"
             android:textStyle="bold" />
+
     </LinearLayout>
+    <LinearLayout
+        android:orientation="horizontal"
+        android:background="#aa7700"
+        android:layout_alignParentBottom="true"
 
-    <RelativeLayout
-        android:id="@+id/control2"
         android:layout_width="match_parent"
-        android:layout_height="135dp"
-        android:layout_alignParentLeft="true"
-        android:layout_alignParentStart="true"
-        android:layout_alignTop="@+id/control"
-        android:layout_marginLeft="300dp"
-        android:layout_marginStart="300dp"
-        android:background="#bb7700">
-
-        <ToggleButton
-            android:id="@+id/button"
-            android:textOff="@string/tflite"
-            android:textOn="@string/nnapi"
+        android:id="@+id/bottom_info_view"
+        android:layout_marginBottom="10dp"
+        android:layout_height="50dp">
+        <TextView
             android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_alignParentLeft="true"
-            android:layout_alignParentStart="true" />
-
+            android:layout_height="match_parent"
+            android:textColor="@android:color/white"
+            android:textAlignment="center"
+            android:gravity="center"
+            android:text="@string/threads" />
         <NumberPicker
             android:id="@+id/np"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
-            android:layout_below="@+id/button"
+            android:layout_marginLeft="10dp"
+            android:theme="@style/AppTheme.Picker"
             android:visibility="visible" />
-    </RelativeLayout>
-
-    <RelativeLayout
-        android:id="@+id/control2"
-        android:layout_width="match_parent"
-        android:layout_height="135dp"
-        android:layout_alignParentLeft="true"
-        android:layout_alignParentStart="true"
-        android:layout_alignTop="@+id/control"
-        android:layout_marginLeft="300dp"
-        android:layout_marginStart="300dp"
-        android:background="@color/control_background">
-
         <ToggleButton
             android:id="@+id/button"
             android:textOff="@string/tflite"
             android:textOn="@string/nnapi"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
-            android:layout_alignParentLeft="true"
-            android:layout_alignParentStart="true" />
-
-        <NumberPicker
-            android:id="@+id/np"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_below="@+id/button"
-            android:visibility="visible" />
-    </RelativeLayout>
+            android:layout_marginLeft="10dp"
+            android:background="#0000000f"
+            android:textColor="@android:color/white" />
 
+    </LinearLayout>
 </RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml
index 0a71dbd0e8010f5e3a176de1f7e8321331289f7c..7af8f3a98c6319da7723928ce61802ed4c5497ec 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml
@@ -16,7 +16,7 @@
 -->
 
 <resources>
-    <string name="app_name">TfLiteCameraDemo</string>
+    <string name="app_name">TfLite Camera Demo</string>
     <string name="intro_message">
         <![CDATA[
 
@@ -27,4 +27,5 @@
 
         ]]>
     </string>
+    <string name="threads">Threads:</string>
 </resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml
index 3f3bdfb49480e779c108cd15da854ae82a118d52..1752b3b5f97e288d8b59106dfece1d84fe21d0ba 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml
@@ -14,5 +14,10 @@
  limitations under the License.
 -->
 <resources>
-    <style name="MaterialTheme" parent="android:Theme.Material.Light.NoActionBar.Fullscreen" />
+  <style name="MaterialTheme" parent="android:Theme.Material.Light.NoActionBar.Fullscreen" />
+   <style name="AppTheme.Picker" parent="android:Theme.Material.Light.NoActionBar.Fullscreen" >
+    <item name="android:textColorPrimary">@android:color/white</item>
+
+</style>
+
 </resources>
diff --git a/tensorflow/contrib/lite/java/ovic/BUILD b/tensorflow/contrib/lite/java/ovic/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..362d93636f72205ddcda6d97fa9fae376ff211f1
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/BUILD
@@ -0,0 +1,68 @@
+# Description:
+# OVIC Benchmarker Java API.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/java:build_defs.bzl", "JAVACOPTS")
+
+java_test(
+    name = "OvicClassifierTest",
+    size = "medium",
+    srcs = ["src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
+    data = [
+        "//tensorflow/contrib/lite/java/ovic/src/testdata:labels.txt",
+        "//tensorflow/contrib/lite/java/ovic/src/testdata:ovic_testdata",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.ovic.OvicClassifierTest",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/lite/java/ovic:ovicbenchmarkerlib_java",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+java_binary(
+    name = "ovic_validator",
+    srcs = ["src/main/java/org/tensorflow/ovic/OvicValidator.java"],
+    data = [
+        "//tensorflow/contrib/lite/java/ovic/src/testdata:labels.txt",
+    ],
+    main_class = "org.tensorflow.ovic.OvicValidator",
+    deps = [
+        "//tensorflow/contrib/lite/java/ovic:ovicbenchmarkerlib_java",
+    ],
+)
+
+android_library(
+    name = "ovicbenchmarkerlib",
+    srcs = [
+        "src/main/java/org/tensorflow/ovic/OvicClassifier.java",
+        "src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java",
+    ],
+    manifest = "//tensorflow/contrib/lite/java:AndroidManifest.xml",
+    deps = [
+        "//tensorflow/contrib/lite/java:tensorflowlite",
+        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
+    ],
+)
+
+java_library(
+    name = "ovicbenchmarkerlib_java",
+    srcs = [
+        "src/main/java/org/tensorflow/ovic/OvicClassifier.java",
+        "src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java",
+    ],
+    javacopts = JAVACOPTS,
+    deps = [
+        "//tensorflow/contrib/lite/java:libtensorflowlite_jni.so",
+        "//tensorflow/contrib/lite/java:tensorflowlite_java",
+        "//tensorflow/contrib/lite/java/src/main/native",
+        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
+    ],
+)
diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md
index 76c33838bfe5b8596d78cae7d022c51d2a379e76..26349347faebac135ae555e0c5d8219046ab1c29 100644
--- a/tensorflow/contrib/lite/java/ovic/README.md
+++ b/tensorflow/contrib/lite/java/ovic/README.md
@@ -2,11 +2,11 @@
 
 This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018)
 
-## Pre-requesits
+## Pre-requisite
 
 Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK.
 
-## To test the benchmarker:
+## Test the benchmarker:
 
 The testing utilities helps the developers (you) to make sure that your submissions in TfLite format will be processed as expected in the competition's benchmarking system.
 
@@ -37,47 +37,122 @@ unzip -j /tmp/ovic.zip -d tensorflow/contrib/lite/java/ovic/src/testdata/
 You can run test with Bazel as below. This helps to ensure that the installation is correct.
 
 ```sh
-bazel test --cxxopt=--std=c++11 //tensorflow/contrib/lite/java:OvicClassifierTest --test_output=all
+bazel test --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:OvicClassifierTest --cxxopt=-Wno-all --test_output=all
 ```
 
 ### Test your submissions
 
-Once you have a submission that follows the instructions from the [competition site](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018), you can verify it as below.
+Once you have a submission that follows the instructions from the [competition site](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018), you can verify it in two ways:
 
-* Move your submission to the testdata folder:
+#### Validate using randomly generated images
 
-Let say the submission file is located at `/tmp/my_model.lite`, then
+You can call the validator binary below to verify that your model fits the format requirements. This often helps you to catch size mismatches (e.g. output should be [1, 1001] instead of [1,1,1,1001]). Let say the submission file is located at `/path/to/my_model.lite`, then call:
 
 ```sh
-cp /tmp/my_model.lite tensorflow/contrib/lite/java/ovic/src/testdata/
+bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all
+bazel-bin/tensorflow/contrib/lite/java/ovic/ovic_validator /path/to/my_model.lite
+```
+
+Successful validation should print the following message to terminal:
+
+```
+Successfully validated /path/to/my_model.lite.
+
+```
+
+#### Test that the model produces sensible outcomes
+
+You can go a step further to verify that the model produces results as expected. This helps you catch bugs during TOCO conversion (e.g. using the wrong mean and std values).
+
+* Move your submission to the testdata folder:
+
+```sh
+cp /path/to/my_model.lite tensorflow/contrib/lite/java/ovic/src/testdata/
 ```
 
 * Resize the test image to the resolutions that are expected by your submission:
 
 The test images can be found at `tensorflow/contrib/lite/java/ovic/src/testdata/test_image_*.jpg`. You may reuse these images if your image resolutions are 128x128 or 224x224.
 
-* Add your model and test image to the BUILD rule:
+* Add your model and test image to the BUILD rule at `tensorflow/contrib/lite/java/ovic/src/testdata/BUILD`:
 
 ```JSON
-java_test(
-  name = "OvicClassifierTest",
-  size = "medium",
-  srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
-  data = [
-      "ovic/src/testdata/float_model.lite",
-      "ovic/src/testdata/labels.txt",
-      "ovic/src/testdata/low_res_model.lite",
-      "ovic/src/testdata/quantized_model.lite",
-      "ovic/src/testdata/test_image_128.jpg",
-      "ovic/src/testdata/test_image_224.jpg",
-      "ovic/src/testdata/my_model.lite",        # <--- Your submission.
-      "ovic/src/testdata/my_test_image.jpg",    # <--- Your test image.
-  ],
-      ...
+filegroup(
+    name = "ovic_testdata",
+    srcs = [
+        "@tflite_ovic_testdata//:float_model.lite",
+        "@tflite_ovic_testdata//:low_res_model.lite",
+        "@tflite_ovic_testdata//:quantized_model.lite",
+        "@tflite_ovic_testdata//:test_image_128.jpg",
+        "@tflite_ovic_testdata//:test_image_224.jpg"
+        "my_model.lite",        # <--- Your submission.
+        "my_test_image.jpg",    # <--- Your test image.
+    ],
+    ...
 ```
 
 * Modify `OvicClassifierTest.java` to test your model.
 
-Change `TEST_IMAGE_PATH` to `testdata/my_test_image.jpg`. If your model runs inference in floating point, change `FLOAT_MODEL_PATH` to `testdata/my_model.lite`. If your model runs [quantized inference](https://www.tensorflow.org/performance/quantization), change `QUANTIZED_MODEL_PATH` to `testdata/my_model.lite`.
+Change `TEST_IMAGE_PATH` to `my_test_image.jpg`. Change either `FLOAT_MODEL_PATH` or `QUANTIZED_MODEL_PATH` to `my_model.lite` depending on whether your model runs inference in float or [8-bit](https://www.tensorflow.org/performance/quantization).
 
 Now you can run the bazel tests to catch any runtime issues with the submission.
+
+Note: Please make sure that your submission passes the test. If a submission fails to pass the test it will not be processed by the submission server.
+
+## Measure on-device latency
+
+We provide two ways to measure the on-device latency of your submission. The first is through our competition server, which is reliable and repeatable, but is limited to a few trials per day. The second is through the benchmarker Apk, which requires a device and may not be as accurate as the server, but has a fast turn-around and no access limitations. We recommend that the participants use the benchmarker apk for early development, and reserve the competition server for evaluating promising submissions.
+
+### Running the benchmarker app
+
+Make sure that you have followed instructions in [Test your submissions](#test-your-submissions) to add your model to the testdata folder and to the corresponding build rules.
+
+Modify `tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java`:
+
+* Add your model to the benchmarker apk by changing `MODEL_PATH` and `TEST_IMAGE_PATH` below to your submission and test image.
+
+```
+  private static final String TEST_IMAGE_PATH = "my_test_image.jpg";
+  private static final String MODEL_PATH = "my_model.lite";
+```
+
+* Adjust the benchmark parameters when needed:
+
+You can chnage the length of each experiment, and the processor affinity below. `BIG_CORE_MASK` is an integer whose binary encoding represents the set of used cores. This number is phone-specific. For example, Pixel 2 has 8 cores: the 4 little cores are represented by the 4 less significant bits, and the 4 big cores by the 4 more significant bits. Therefore a mask value of 16, or in binary `00010000`, represents using only the first big core. The mask 32, or in binary `00100000` uses the second big core and should deliver identical results as the mask 16 because the big cores are interchangeable.
+
+```
+  /** Wall time for each benchmarking experiment. */
+  private static final double WALL_TIME = 3000;
+  /** Maximum number of iterations in each benchmarking experiment. */
+  private static final int MAX_ITERATIONS = 100;
+  /** Mask for binding to a single big core. Pixel 1 (4), Pixel 2 (16). */
+  private static final int BIG_CORE_MASK = 16;
+```
+
+Note: You'll need ROOT access to the phone to change processor affinity.
+
+* Build and install the app.
+
+```
+bazel build -c opt --cxxopt=--std=c++11 --cxxopt=-Wno-all //tensorflow/contrib/lite/java/ovic/demo/app:ovic_benchmarker_binary
+adb install -r bazel-bin/tensorflow/contrib/lite/java/ovic/demo/app/ovic_benchmarker_binary.apk
+```
+
+Start the app and click the `Start` button in dark green. The button should turn bright green, signaling that the experiment is running. The benchmarking results will be displayed after about the `WALL_TIME` you specified above. For example:
+
+```
+my_model.lite: Average latency=158.6ms after 20 runs.
+```
+
+### Sample latencies
+
+Note: the benchmarking results can be quite different depending on the background processes running on the phone. A few things that help stabilize the app's readings are placing the phone on a cooling plate, restarting the phone, and shutting down internet access.
+
+| Model                | Pixel 1 latency (ms)  | Pixel 2 latency (ms) |
+| -------------------- |:---------------------:| --------------------:|
+|  float_model.lite    | 120                   | 155                  |
+| quantized_model.lite | 85                    | 74                   |
+|  low_res_model.lite  | 4.2                   | 4.0                  |
+
+Since Pixel 2 has excellent support for 8-bit quantized models, we strongly recommend you to check out the [quantization training tutorial](https://www.tensorflow.org/performance/quantization).
+
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/AndroidManifest.xml b/tensorflow/contrib/lite/java/ovic/demo/app/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..55f2961fd717bdeebf5f3f1e66bb537f53cbe4e0
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/AndroidManifest.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+ Copyright 2018 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="ovic.demo.app"
+    android:versionCode="1"
+    android:versionName="1.0" >
+
+    <uses-sdk
+        android:minSdkVersion="19"
+        android:targetSdkVersion="21" />
+
+    <uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE" />
+    <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE" />
+    <uses-permission android:name="android.permission.READ_PHONE_STATE" />
+
+    <application
+        android:allowBackup="true"
+        android:icon="@drawable/ic_launcher"
+        android:largeHeap="true"
+        android:label="@string/app_name">
+        <activity
+            android:name="ovic.demo.app.OvicBenchmarkerActivity"
+            android:label="@string/app_name"
+            android:screenOrientation="portrait">
+
+          <intent-filter>
+              <action android:name="android.intent.action.MAIN" />
+              <category android:name="android.intent.category.LAUNCHER" />
+          </intent-filter>
+      </activity>
+  </application>
+
+</manifest>
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/BUILD b/tensorflow/contrib/lite/java/ovic/demo/app/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..83974f4b337baedebaf9c9ffc0a03501418a3e36
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/BUILD
@@ -0,0 +1,29 @@
+# Sample app for OVIC benchmarking.
+licenses(["notice"])  # Apache 2.0
+
+android_binary(
+    name = "ovic_benchmarker_binary",
+    srcs = [
+        "OvicBenchmarker.java",
+        "OvicBenchmarkerActivity.java",
+    ],
+    assets = [
+        "//tensorflow/contrib/lite/java/ovic/src/testdata:ovic_testdata",
+        "//tensorflow/contrib/lite/java/ovic/src/testdata:labels.txt",
+    ],
+    assets_dir = "",
+    custom_package = "ovic.demo.app",
+    manifest = "AndroidManifest.xml",
+    nocompress_extensions = [
+        ".lite",
+        ".tflite",
+    ],
+    resource_files = glob(["res/**"]),
+    tags = ["manual"],
+    deps = [
+        "//tensorflow/contrib/lite/java:tensorflowlite",
+        "//tensorflow/contrib/lite/java/ovic:ovicbenchmarkerlib",
+        "@androidsdk//com.android.support:support-v13-25.2.0",
+        "@androidsdk//com.android.support:support-v4-25.2.0",
+    ],
+)
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java b/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarker.java
similarity index 97%
rename from tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
rename to tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarker.java
index d0102883e6b41f5c33a0061c5fd53b5f69b8ab54..113ab74a20dabc7e283804348509702b7f412917 100644
--- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarker.java
@@ -1,4 +1,4 @@
-/*Copyright 2018 Google LLC
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-package org.tensorflow.ovic;
+package ovic.demo.app;
 
 import android.graphics.Bitmap;
 import android.os.SystemClock;
@@ -22,6 +22,8 @@ import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
+import org.tensorflow.ovic.OvicClassifier;
+import org.tensorflow.ovic.OvicSingleImageResult;
 
 /**
  * Class that benchmarks image classifier models.
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java b/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..59457c308ad7caa17c52563f6a70df79e8a17914
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java
@@ -0,0 +1,247 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package ovic.demo.app;
+
+import android.app.Activity;
+import android.content.res.AssetFileDescriptor;
+import android.content.res.AssetManager;
+import android.graphics.Bitmap;
+import android.graphics.BitmapFactory;
+import android.os.Bundle;
+import android.os.Process;
+import android.os.SystemClock;
+import android.util.Log;
+import android.view.View;
+import android.widget.TextView;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.text.DecimalFormat;
+import org.tensorflow.ovic.OvicSingleImageResult;
+
+/** Class that benchmark image classifier models. */
+public class OvicBenchmarkerActivity extends Activity {
+  /** Tag for the {@link Log}. */
+  private static final String TAG = "OvicBenchmarkerActivity";
+
+  /** Name of the label file stored in Assets. */
+  private static final String LABEL_PATH = "labels.txt";
+
+  private static final String TEST_IMAGE_PATH = "test_image_224.jpg";
+  private static final String MODEL_PATH = "float_model.lite";
+  /**
+   * Each bottom press will launch a benchmarking experiment. The experiment stops when either the
+   * total native latency reaches WALL_TIME or the number of iterations reaches MAX_ITERATIONS,
+   * whichever comes first.
+   */
+  /** Wall time for each benchmarking experiment. */
+  private static final double WALL_TIME = 3000;
+  /** Maximum number of iterations in each benchmarking experiment. */
+  private static final int MAX_ITERATIONS = 100;
+  /** Mask for binding to a single big core. Pixel 1 (4), Pixel 2 (16). */
+  private static final int BIG_CORE_MASK = 16;
+  /** Amount of time in milliseconds to wait for affinity to set. */
+  private static final int WAIT_TIME_FOR_AFFINITY = 1000;
+
+  /* The model to be benchmarked. */
+  private MappedByteBuffer model = null;
+  private InputStream labelInputStream = null;
+  private OvicBenchmarker benchmarker;
+  /** Inference result of each iteration. */
+  OvicSingleImageResult iterResult = null;
+
+  private TextView textView = null;
+  // private Button startButton = null;
+  private static final DecimalFormat df2 = new DecimalFormat(".##");
+
+  @Override
+  protected void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+    setContentView(R.layout.activity_main);
+
+    // TextView used to display the progress, for information purposes only.
+    textView = (TextView) findViewById(R.id.textView);
+  }
+
+  private Bitmap loadTestBitmap() throws IOException {
+    InputStream imageStream = getAssets().open(TEST_IMAGE_PATH);
+    return BitmapFactory.decodeStream(imageStream);
+  }
+
+  public void initializeTest() throws IOException {
+    Log.i(TAG, "Initializing benchmarker.");
+    benchmarker = new OvicBenchmarker(WALL_TIME);
+    AssetManager am = getAssets();
+    AssetFileDescriptor fileDescriptor = am.openFd(MODEL_PATH);
+    FileInputStream modelInputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    FileChannel fileChannel = modelInputStream.getChannel();
+    long startOffset = fileDescriptor.getStartOffset();
+    long declaredLength = fileDescriptor.getDeclaredLength();
+    model = fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+    labelInputStream = am.open(LABEL_PATH);
+  }
+
+  public Boolean doTestIteration() throws IOException, InterruptedException {
+    if (benchmarker == null) {
+      throw new RuntimeException("Benchmarker has not been initialized.");
+    }
+    if (benchmarker.shouldStop()) {
+      return false;
+    }
+    if (!benchmarker.readyToTest()) {
+      Log.i(TAG, "getting ready to test.");
+      benchmarker.getReadyToTest(labelInputStream, model);
+      if (!benchmarker.readyToTest()) {
+        throw new RuntimeException("Failed to get the benchmarker ready.");
+      }
+    }
+    Log.i(TAG, "Going to do test iter.");
+    // Start testing.
+    Bitmap testImageBitmap = loadTestBitmap();
+    iterResult = benchmarker.doTestIteration(testImageBitmap);
+    testImageBitmap.recycle();
+    if (iterResult == null) {
+      throw new RuntimeException("Inference failed to produce a result.");
+    }
+    Log.i(TAG, iterResult.toString());
+    return true;
+  }
+
+  public void startPressed(View view) throws IOException {
+    Log.i(TAG, "Start pressed");
+    try {
+      initializeTest();
+    } catch (IOException e) {
+      Log.e(TAG, "Can't initialize benchmarker.", e);
+      throw e;
+    }
+    String displayText = "";
+    try {
+      setProcessorAffinity(BIG_CORE_MASK);
+    } catch (IOException e) {
+      Log.e(TAG, e.getMessage());
+      displayText = e.getMessage() + "\n";
+    }
+    Log.i(TAG, "Successfully initialized benchmarker.");
+    int testIter = 0;
+    Boolean iterSuccess = false;
+    double totalLatency = 0.0f;
+    while (testIter < MAX_ITERATIONS) {
+      try {
+        iterSuccess = doTestIteration();
+      } catch (IOException e) {
+        Log.e(TAG, "Error during iteration " + testIter);
+        throw e;
+      } catch (InterruptedException e) {
+        Log.e(TAG, "Interrupted at iteration " + testIter);
+      }
+      if (!iterSuccess) {
+        break;
+      }
+      testIter++;
+      totalLatency += (double) iterResult.latency;
+    }
+    ;
+    Log.i(TAG, "Benchmarking finished");
+
+    if (textView != null) {
+      if (testIter > 0) {
+        textView.setText(
+            displayText
+                + MODEL_PATH
+                + ": Average latency="
+                + df2.format(totalLatency / testIter)
+                + "ms after "
+                + testIter
+                + " runs.");
+      } else {
+        textView.setText("Benchmarker failed to run on more than one images.");
+      }
+    }
+  }
+
+  private static void setProcessorAffinity(int mask) throws IOException {
+    int myPid = Process.myPid();
+    Log.i(TAG, String.format("Setting processor affinity to 0x%02x", mask));
+
+    String command = String.format("taskset -a -p %x %d", mask, myPid);
+    try {
+      Runtime.getRuntime().exec(command).waitFor();
+    } catch (InterruptedException e) {
+      throw new IOException("Interrupted: " + e);
+    }
+
+    // Make sure set took effect - try for a second to confirm the change took.  If not then fail.
+    long startTimeMs = SystemClock.elapsedRealtime();
+    while (true) {
+      int readBackMask = readCpusAllowedMask();
+      if (readBackMask == mask) {
+        Log.i(TAG, String.format("Successfully set affinity to 0x%02x", mask));
+        return;
+      }
+      if (SystemClock.elapsedRealtime() > startTimeMs + WAIT_TIME_FOR_AFFINITY) {
+        throw new IOException(
+            String.format(
+                "Core-binding failed: affinity set to 0x%02x but read back as 0x%02x\n"
+                    + "please root device.",
+                mask, readBackMask));
+      }
+
+      try {
+        Thread.sleep(50);
+      } catch (InterruptedException e) {
+        // Ignore sleep interrupted, will sleep again and compare is final cross-check.
+      }
+    }
+  }
+
+  public static int readCpusAllowedMask() throws IOException {
+    // Determine how many CPUs there are total
+    final String pathname = "/proc/self/status";
+    final String resultPrefix = "Cpus_allowed:";
+    File file = new File(pathname);
+    String line = "<NO LINE READ>";
+    String allowedCPU = "";
+    Integer allowedMask = null;
+    BufferedReader bufReader = null;
+    try {
+      bufReader = new BufferedReader(new FileReader(file));
+      while ((line = bufReader.readLine()) != null) {
+        if (line.startsWith(resultPrefix)) {
+          allowedMask = Integer.valueOf(line.substring(resultPrefix.length()).trim(), 16);
+          allowedCPU = bufReader.readLine();
+          break;
+        }
+      }
+    } catch (RuntimeException e) {
+      throw new IOException(
+          "Invalid number in " + pathname + " line: \"" + line + "\": " + e.getMessage());
+    } finally {
+      if (bufReader != null) {
+        bufReader.close();
+      }
+    }
+    if (allowedMask == null) {
+      throw new IOException(pathname + " missing " + resultPrefix + " line");
+    }
+    Log.i(TAG, allowedCPU);
+    return allowedMask;
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/build.gradle b/tensorflow/contrib/lite/java/ovic/demo/app/build.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..c5d19bad89a93988a6830a17fe2fb4a60e2fb00f
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/build.gradle
@@ -0,0 +1,58 @@
+apply plugin: 'com.android.application'
+
+android {
+    compileSdkVersion 26
+    buildToolsVersion "26.0.1"
+    defaultConfig {
+        applicationId "android.example.com.ovicbenchmarker"
+        minSdkVersion 15
+        targetSdkVersion 26
+        versionCode 1
+        versionName "1.0"
+        testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
+
+        // Remove this block.
+        jackOptions {
+            enabled true
+        }
+    }
+    lintOptions {
+        abortOnError false
+    }
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+        }
+    }
+    aaptOptions {
+        noCompress "lite", "tflite"
+    }
+
+    compileOptions {
+        sourceCompatibility JavaVersion.VERSION_1_8
+        targetCompatibility JavaVersion.VERSION_1_8
+    }
+}
+
+repositories {
+    maven {
+        url 'https://google.bintray.com/tensorflow'
+    }
+}
+
+dependencies {
+    compile fileTree(dir: 'libs', include: ['*.jar'])
+    androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', {
+        exclude group: 'com.android.support', module: 'support-annotations'
+    })
+    compile 'com.android.support:appcompat-v7:25.2.0'
+    compile 'com.android.support.constraint:constraint-layout:1.0.2'
+    compile 'com.android.support:design:25.2.0'
+    compile 'com.android.support:support-annotations:25.3.1'
+    compile 'com.android.support:support-v13:25.2.0'
+
+    compile 'org.tensorflow:tensorflow-lite:+'
+
+    testCompile 'junit:junit:4.12'
+}
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-mdpi/ic_launcher.png b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-mdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..715d1b6d69c0f4dc4d1ae58c8262c22856b20f43
Binary files /dev/null and b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-mdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-xhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-xhdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..9beff0885fd4c8c65ea30c99c838370dcd745f3c
Binary files /dev/null and b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-xhdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable/start_button_color.xml b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable/start_button_color.xml
new file mode 100644
index 0000000000000000000000000000000000000000..93f5c6a016b499f1bd7bacde9b4b94a4ee9fdb6b
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable/start_button_color.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+ Copyright 2018 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<selector
+  xmlns:android="http://schemas.android.com/apk/res/android">
+  <item
+    android:state_enabled="false">
+    <shape android:shape="rectangle">
+      <solid android:color="#808080"/>
+    </shape>
+  </item>
+  <item
+    android:state_enabled="true"
+    android:state_pressed="true">
+    <shape android:shape="rectangle">
+      <solid android:color="#44ff44"/>
+    </shape>
+  </item>
+  <item
+    android:state_enabled="true"
+    android:state_pressed="false">
+    <shape android:shape="rectangle"  >
+      <solid android:color="#227f22"/>
+    </shape>
+  </item>
+</selector>
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml b/tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml
new file mode 100644
index 0000000000000000000000000000000000000000..e9d83bae543ae62ba8749c4c91b36b20bf09a176
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+ Copyright 2018 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout
+    xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:paddingBottom="@dimen/activity_vertical_margin"
+    android:paddingLeft="@dimen/activity_horizontal_margin"
+    android:paddingRight="@dimen/activity_horizontal_margin"
+    android:paddingTop="@dimen/activity_vertical_margin"
+    tools:context="ovic.demo.app.OvicBenchmarkerActivity">
+
+  <TextView
+    android:layout_width="wrap_content"
+    android:layout_height="wrap_content"
+    android:text="@string/initial_status_msg"
+    android:id="@+id/textView"
+    android:layout_above="@+id/button_start"
+    android:layout_alignParentTop="true"/>
+
+  <Button
+    android:layout_width="wrap_content"
+    android:layout_height="wrap_content"
+    android:text="@string/start_label"
+    android:id="@id/button_start"
+    android:layout_alignParentBottom="true"
+    android:layout_alignParentLeft="true"
+    android:background="@drawable/start_button_color"
+    android:padding="10dp"
+    android:layout_marginRight="30dp"
+    android:layout_marginLeft="100dp"
+    android:layout_marginTop="10dp"
+    android:foreground="#000000"
+    android:textColor="#ffffff"
+    android:enabled="true"
+    style="?android:attr/buttonBarButtonStyle"
+    android:onClick="startPressed"/>
+
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/values/dimens.xml b/tensorflow/contrib/lite/java/ovic/demo/app/res/values/dimens.xml
new file mode 100644
index 0000000000000000000000000000000000000000..250b581430a11588ea119b42e709fbfa7cac20d3
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/res/values/dimens.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+ Copyright 2018 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <dimen name="activity_vertical_margin">20dp</dimen>
+    <dimen name="activity_horizontal_margin">16dp</dimen>
+</resources>
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/values/strings.xml b/tensorflow/contrib/lite/java/ovic/demo/app/res/values/strings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..d26beb1d27d3a38201cf9206bf4eab98b754b4f7
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/app/res/values/strings.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+ Copyright 2018 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <string name="app_name" translatable="false">Benchmarker</string>
+
+    <string name="start_label" translatable="false">Start</string>
+    <string name="initial_status_msg" translatable="false"> Press start to run the benchmarks.</string>
+</resources>
diff --git a/tensorflow/contrib/lite/java/ovic/demo/build.gradle b/tensorflow/contrib/lite/java/ovic/demo/build.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..b78a0b86c939620b6f05483ce45c4d3ef0ef595e
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/build.gradle
@@ -0,0 +1,23 @@
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+
+buildscript {
+    repositories {
+        jcenter()
+    }
+    dependencies {
+        classpath 'com.android.tools.build:gradle:2.3.1'
+
+        // NOTE: Do not place your application dependencies here; they belong
+        // in the individual module build.gradle files
+    }
+}
+
+allprojects {
+    repositories {
+        jcenter()
+    }
+}
+
+task clean(type: Delete) {
+    delete rootProject.buildDir
+}
diff --git a/tensorflow/contrib/lite/java/ovic/demo/gradle.properties b/tensorflow/contrib/lite/java/ovic/demo/gradle.properties
new file mode 100644
index 0000000000000000000000000000000000000000..aac7c9b4614ccfde6c721f24994cf30885a791d0
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/gradle.properties
@@ -0,0 +1,17 @@
+# Project-wide Gradle settings.
+
+# IDE (e.g. Android Studio) users:
+# Gradle settings configured through the IDE *will override*
+# any settings specified in this file.
+
+# For more details on how to configure your build environment visit
+# http://www.gradle.org/docs/current/userguide/build_environment.html
+
+# Specifies the JVM arguments used for the daemon process.
+# The setting is particularly useful for tweaking memory settings.
+org.gradle.jvmargs=-Xmx1536m
+
+# When configured, Gradle will run in incubating parallel mode.
+# This option should only be used with decoupled projects. More details, visit
+# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
+# org.gradle.parallel=true
diff --git a/tensorflow/contrib/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.jar b/tensorflow/contrib/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000000000000000000000000000000000000..13372aef5e24af05341d49695ee84e5f9b594659
Binary files /dev/null and b/tensorflow/contrib/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/tensorflow/contrib/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties b/tensorflow/contrib/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 0000000000000000000000000000000000000000..fa7a38a0e43eecd1e7292dd49efa79a5d0742e2a
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Thu Sep 28 09:01:41 PDT 2017
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip
diff --git a/tensorflow/contrib/lite/java/ovic/demo/gradlew b/tensorflow/contrib/lite/java/ovic/demo/gradlew
new file mode 100755
index 0000000000000000000000000000000000000000..9d82f78915133e1c35a6ea51252590fb38efac2f
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/gradlew
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+
+##############################################################################
+##
+##  Gradle start up script for UN*X
+##
+##############################################################################
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS=""
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn ( ) {
+    echo "$*"
+}
+
+die ( ) {
+    echo
+    echo "$*"
+    echo
+    exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+case "`uname`" in
+  CYGWIN* )
+    cygwin=true
+    ;;
+  Darwin* )
+    darwin=true
+    ;;
+  MINGW* )
+    msys=true
+    ;;
+esac
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+        # IBM's JDK on AIX uses strange locations for the executables
+        JAVACMD="$JAVA_HOME/jre/sh/java"
+    else
+        JAVACMD="$JAVA_HOME/bin/java"
+    fi
+    if [ ! -x "$JAVACMD" ] ; then
+        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+    fi
+else
+    JAVACMD="java"
+    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
+    MAX_FD_LIMIT=`ulimit -H -n`
+    if [ $? -eq 0 ] ; then
+        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+            MAX_FD="$MAX_FD_LIMIT"
+        fi
+        ulimit -n $MAX_FD
+        if [ $? -ne 0 ] ; then
+            warn "Could not set maximum file descriptor limit: $MAX_FD"
+        fi
+    else
+        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+    fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin, switch paths to Windows format before running java
+if $cygwin ; then
+    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+    JAVACMD=`cygpath --unix "$JAVACMD"`
+
+    # We build the pattern for arguments to be converted via cygpath
+    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+    SEP=""
+    for dir in $ROOTDIRSRAW ; do
+        ROOTDIRS="$ROOTDIRS$SEP$dir"
+        SEP="|"
+    done
+    OURCYGPATTERN="(^($ROOTDIRS))"
+    # Add a user-defined pattern to the cygpath arguments
+    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+    fi
+    # Now convert the arguments - kludge to limit ourselves to /bin/sh
+    i=0
+    for arg in "$@" ; do
+        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
+
+        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
+            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+        else
+            eval `echo args$i`="\"$arg\""
+        fi
+        i=$((i+1))
+    done
+    case $i in
+        (0) set -- ;;
+        (1) set -- "$args0" ;;
+        (2) set -- "$args0" "$args1" ;;
+        (3) set -- "$args0" "$args1" "$args2" ;;
+        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+    esac
+fi
+
+# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
+function splitJvmOpts() {
+    JVM_OPTS=("$@")
+}
+eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
+JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
+
+exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
diff --git a/tensorflow/contrib/lite/java/ovic/demo/gradlew.bat b/tensorflow/contrib/lite/java/ovic/demo/gradlew.bat
new file mode 100644
index 0000000000000000000000000000000000000000..8a0b282aa6885fb573c106b3551f7275c5f17e8e
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/gradlew.bat
@@ -0,0 +1,90 @@
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem  Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto init
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto init
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:init
+@rem Get command-line arguments, handling Windowz variants
+
+if not "%OS%" == "Windows_NT" goto win9xME_args
+if "%@eval[2+2]" == "4" goto 4NT_args
+
+:win9xME_args
+@rem Slurp the command line arguments.
+set CMD_LINE_ARGS=
+set _SKIP=2
+
+:win9xME_args_slurp
+if "x%~1" == "x" goto execute
+
+set CMD_LINE_ARGS=%*
+goto execute
+
+:4NT_args
+@rem Get arguments from the 4NT Shell from JP Software
+set CMD_LINE_ARGS=%$
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/tensorflow/contrib/lite/java/ovic/demo/settings.gradle b/tensorflow/contrib/lite/java/ovic/demo/settings.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..e7b4def49cb53d9aa04228dd3edb14c9e635e003
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/demo/settings.gradle
@@ -0,0 +1 @@
+include ':app'
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
index b2dfd8f2e710324f6c11a3098b858ffee8b28b3c..4cf51bb0fac682ac8c5ed0116c646cb78d28d418 100644
--- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
@@ -67,7 +67,7 @@ public class OvicClassifier {
           });
 
   /** Initializes an {@code OvicClassifier}. */
-  OvicClassifier(InputStream labelInputStream, MappedByteBuffer model)
+  public OvicClassifier(InputStream labelInputStream, MappedByteBuffer model)
       throws IOException, RuntimeException {
     if (model == null) {
       throw new RuntimeException("Input model is empty.");
@@ -106,7 +106,7 @@ public class OvicClassifier {
 
   /** Classifies a {@link ByteBuffer} image. */
   // @throws RuntimeException if model is uninitialized.
-  OvicSingleImageResult classifyByteBuffer(ByteBuffer imgData) throws RuntimeException {
+  public OvicSingleImageResult classifyByteBuffer(ByteBuffer imgData) {
     if (tflite == null) {
       throw new RuntimeException(TAG + ": ImageNet classifier has not been initialized; Failed.");
     }
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java
new file mode 100644
index 0000000000000000000000000000000000000000..a504ec74a9d0a124f877a6377cae155f204849a5
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java
@@ -0,0 +1,94 @@
+/*Copyright 2018 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.Random;
+
+/** Validate a submission model. */
+public class OvicValidator {
+  private static void printUsage(PrintStream s) {
+    s.println("Java program that validates a submission model.");
+    s.println();
+    s.println("Usage: ovic_validator <submission file>");
+    s.println();
+    s.println("Where:");
+    s.println("<submission file> is the model in TfLite format;");
+  }
+
+  public static void main(String[] args) {
+    if (args.length != 1) {
+      printUsage(System.err);
+      System.exit(1);
+    }
+    final String labelPath =
+        "tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt";
+
+    final String modelFile = args[0];
+    try {
+      File labelsfile = new File(labelPath);
+      InputStream labelsInputStream = new FileInputStream(labelsfile);
+      MappedByteBuffer model = loadModelFile(modelFile);
+      OvicClassifier classifier = new OvicClassifier(labelsInputStream, model);
+      ByteBuffer imgData = createByteBufferForClassifier(classifier);
+      OvicSingleImageResult testResult = classifier.classifyByteBuffer(imgData);
+      if (testResult.topKClasses.isEmpty()) {
+        throw new RuntimeException("Failed to return top K predictions.");
+      }
+      System.out.printf("Successfully validated %s.%n", modelFile);
+    } catch (Exception e) {
+      System.out.println(e.getMessage());
+      System.out.printf("Failed to validate %s.%n", modelFile);
+    }
+  }
+
+  private static ByteBuffer createByteBufferForClassifier(OvicClassifier classifier) {
+    if (classifier == null) {
+      throw new RuntimeException("Cannot create image buffer with the classifier.");
+    }
+    int[] inputDims = classifier.getInputDims();
+    int imgHeight = inputDims[1];
+    int imgWidth = inputDims[2];
+    ByteBuffer imgData = ByteBuffer.allocateDirect(imgHeight * imgWidth * 3);
+    imgData.order(ByteOrder.nativeOrder());
+    Random rand = new Random();
+    for (int y = 0; y < imgHeight; y++) {
+      for (int x = 0; x < imgWidth; x++) {
+        int val = rand.nextInt();
+        imgData.put((byte) ((val >> 16) & 0xFF));
+        imgData.put((byte) ((val >> 8) & 0xFF));
+        imgData.put((byte) (val & 0xFF));
+      }
+    }
+    return imgData;
+  }
+
+  private static MappedByteBuffer loadModelFile(String modelFilePath) throws IOException {
+    File modelfile = new File(modelFilePath);
+    FileInputStream inputStream = new FileInputStream(modelfile);
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = 0L;
+    long declaredLength = fileChannel.size();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
index 098ed8ceba52b6d44868353b80cab0862e334d4d..56f3e7604a5b172e907edbe862b017957594397f 100644
--- a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
+++ b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
@@ -45,17 +45,17 @@ public final class OvicClassifierTest {
   private ByteBuffer lowResTestImage = null;
   private OvicSingleImageResult testResult = null;
   private static final String LABELS_PATH =
-      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt";
+      "tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt";
   private static final String QUANTIZED_MODEL_PATH =
-      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/quantized_model.lite";
+      "external/tflite_ovic_testdata/quantized_model.lite";
   private static final String LOW_RES_MODEL_PATH =
-      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/low_res_model.lite";
+      "external/tflite_ovic_testdata/low_res_model.lite";
   private static final String FLOAT_MODEL_PATH =
-      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/float_model.lite";
+      "external/tflite_ovic_testdata/float_model.lite";
   private static final String TEST_IMAGE_PATH =
-      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/test_image_224.jpg";
+      "external/tflite_ovic_testdata/test_image_224.jpg";
   private static final String TEST_LOW_RES_IMAGE_PATH =
-      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/test_image_128.jpg";
+      "external/tflite_ovic_testdata/test_image_128.jpg";
   private static final int TEST_IMAGE_GROUNDTRUTH = 653; // "military uniform"
 
   @Before
diff --git a/tensorflow/contrib/lite/java/ovic/src/testdata/BUILD b/tensorflow/contrib/lite/java/ovic/src/testdata/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1021ea30dd90b751e876567ae23bb160950e1cd2
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/src/testdata/BUILD
@@ -0,0 +1,19 @@
+# Testdata for OVIC benchmarker demo App and tests.
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "ovic_testdata",
+    srcs = [
+        "@tflite_ovic_testdata//:float_model.lite",
+        "@tflite_ovic_testdata//:low_res_model.lite",
+        "@tflite_ovic_testdata//:quantized_model.lite",
+        "@tflite_ovic_testdata//:test_image_128.jpg",
+        "@tflite_ovic_testdata//:test_image_224.jpg",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+exports_files(
+    ["labels.txt"],
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index e84ee7112983ec584308b7cbcd919f119eccbcc9..644ce4cb3e0beaed2b9ae542cdacbb912ab0f010 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -16,7 +16,7 @@ limitations under the License.
 package org.tensorflow.lite;
 
 import java.io.File;
-import java.nio.MappedByteBuffer;
+import java.nio.ByteBuffer;
 import java.util.HashMap;
 import java.util.Map;
 import org.checkerframework.checker.nullness.qual.NonNull;
@@ -81,24 +81,26 @@ public final class Interpreter implements AutoCloseable {
   }
 
   /**
-   * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file.
+   * Initializes a {@code Interpreter} with a {@code ByteBuffer} of a model file.
    *
-   * <p>The {@code MappedByteBuffer} should remain unchanged after the construction of a {@code
-   * Interpreter}.
+   * <p>The ByteBuffer should not be modified after the construction of a {@code Interpreter}. The
+   * {@code ByteBuffer} can be either a {@code MappedByteBuffer} that memory-maps a model file, or a
+   * direct {@code ByteBuffer} of nativeOrder() that contains the bytes content of a model.
    */
-  public Interpreter(@NonNull MappedByteBuffer mappedByteBuffer) {
-    wrapper = new NativeInterpreterWrapper(mappedByteBuffer);
+  public Interpreter(@NonNull ByteBuffer byteBuffer) {
+    wrapper = new NativeInterpreterWrapper(byteBuffer);
   }
 
   /**
-   * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file and
-   * specifies the number of threads used for inference.
+   * Initializes a {@code Interpreter} with a {@code ByteBuffer} of a model file and specifies the
+   * number of threads used for inference.
    *
-   * <p>The {@code MappedByteBuffer} should remain unchanged after the construction of a {@code
-   * Interpreter}.
+   * <p>The ByteBuffer should not be modified after the construction of a {@code Interpreter}. The
+   * {@code ByteBuffer} can be either a {@code MappedByteBuffer} that memory-maps a model file, or a
+   * direct {@code ByteBuffer} of nativeOrder() that contains the bytes content of a model.
    */
-  public Interpreter(@NonNull MappedByteBuffer mappedByteBuffer, int numThreads) {
-    wrapper = new NativeInterpreterWrapper(mappedByteBuffer, numThreads);
+  public Interpreter(@NonNull ByteBuffer byteBuffer, int numThreads) {
+    wrapper = new NativeInterpreterWrapper(byteBuffer, numThreads);
   }
 
   /**
@@ -215,11 +217,11 @@ public final class Interpreter implements AutoCloseable {
     }
   }
 
-  public void setNumThreads(int num_threads) {
+  public void setNumThreads(int numThreads) {
     if (wrapper == null) {
       throw new IllegalStateException("The interpreter has already been closed.");
     }
-    wrapper.setNumThreads(num_threads);
+    wrapper.setNumThreads(numThreads);
   }
 
   /** Release resources associated with the {@code Interpreter}. */
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index a43251cad13a4ed0b35367e796948b4b9a9faa67..2ae6c516b03ef4292667bbd944c73d2eeaf82db3 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -43,21 +43,31 @@ final class NativeInterpreterWrapper implements AutoCloseable {
   }
 
   /**
-   * Initializes a {@code NativeInterpreterWrapper} with a {@code MappedByteBuffer}. The
-   * MappedByteBuffer should not be modified after the construction of a {@code
-   * NativeInterpreterWrapper}.
+   * Initializes a {@code NativeInterpreterWrapper} with a {@code ByteBuffer}. The ByteBuffer should
+   * not be modified after the construction of a {@code NativeInterpreterWrapper}. The {@code
+   * ByteBuffer} can be either a {@code MappedByteBuffer} that memory-maps a model file, or a direct
+   * {@code ByteBuffer} of nativeOrder() that contains the bytes content of a model.
    */
-  NativeInterpreterWrapper(MappedByteBuffer mappedByteBuffer) {
-    this(mappedByteBuffer, /* numThreads= */ -1);
+  NativeInterpreterWrapper(ByteBuffer byteBuffer) {
+    this(byteBuffer, /* numThreads= */ -1);
   }
 
   /**
-   * Initializes a {@code NativeInterpreterWrapper} with a {@code MappedByteBuffer} and specifies
-   * the number of inference threads. The MappedByteBuffer should not be modified after the
-   * construction of a {@code NativeInterpreterWrapper}.
+   * Initializes a {@code NativeInterpreterWrapper} with a {@code ByteBuffer} and specifies the
+   * number of inference threads. The ByteBuffer should not be modified after the construction of a
+   * {@code NativeInterpreterWrapper}. The {@code ByteBuffer} can be either a {@code
+   * MappedByteBuffer} that memory-maps a model file, or a direct {@code ByteBuffer} of
+   * nativeOrder() that contains the bytes content of a model.
    */
-  NativeInterpreterWrapper(MappedByteBuffer mappedByteBuffer, int numThreads) {
-    modelByteBuffer = mappedByteBuffer;
+  NativeInterpreterWrapper(ByteBuffer buffer, int numThreads) {
+    if (buffer == null
+        || (!(buffer instanceof MappedByteBuffer)
+            && (!buffer.isDirect() || buffer.order() != ByteOrder.nativeOrder()))) {
+      throw new IllegalArgumentException(
+          "Model ByteBuffer should be either a MappedByteBuffer of the model file, or a direct "
+              + "ByteBuffer using ByteOrder.nativeOrder() which contains bytes of model content.");
+    }
+    modelByteBuffer = buffer;
     errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
     modelHandle = createModelWithBuffer(modelByteBuffer, errorHandle);
     interpreterHandle = createInterpreter(modelHandle, errorHandle, numThreads);
@@ -90,9 +100,10 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       dataTypes[i] = dataType.getNumber();
       if (dataType == DataType.BYTEBUFFER) {
         ByteBuffer buffer = (ByteBuffer) inputs[i];
-        if (buffer.order() != ByteOrder.nativeOrder()) {
+        if (buffer == null || !buffer.isDirect() || buffer.order() != ByteOrder.nativeOrder()) {
           throw new IllegalArgumentException(
-              "Input error: ByteBuffer shoud use ByteOrder.nativeOrder().");
+              "Input error: ByteBuffer should be a direct ByteBuffer that uses "
+                  + "ByteOrder.nativeOrder().");
         }
         numsOfBytes[i] = buffer.limit();
         sizes[i] = getInputDims(interpreterHandle, i, numsOfBytes[i]);
@@ -153,8 +164,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     useNNAPI(interpreterHandle, useNNAPI);
   }
 
-  void setNumThreads(int num_threads) {
-    numThreads(interpreterHandle, num_threads);
+  void setNumThreads(int numThreads) {
+    numThreads(interpreterHandle, numThreads);
   }
 
   /** Gets index of an input given its name. */
@@ -314,7 +325,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private long inferenceDurationNanoseconds = -1;
 
-  private MappedByteBuffer modelByteBuffer;
+  private ByteBuffer modelByteBuffer;
 
   private Map<String, Integer> inputsIndexes;
 
@@ -328,13 +339,13 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private static native void useNNAPI(long interpreterHandle, boolean state);
 
-  private static native void numThreads(long interpreterHandle, int num_threads);
+  private static native void numThreads(long interpreterHandle, int numThreads);
 
   private static native long createErrorReporter(int size);
 
   private static native long createModel(String modelPathOrBuffer, long errorHandle);
 
-  private static native long createModelWithBuffer(MappedByteBuffer modelBuffer, long errorHandle);
+  private static native long createModelWithBuffer(ByteBuffer modelBuffer, long errorHandle);
 
   private static native long createInterpreter(long modelHandle, long errorHandle, int numThreads);
 
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index 45f510da1d940a288e2794cb3e08f66451956b64..1fb6997fb9ba180e9a3f3a89a6d177086440c0d7 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -387,7 +387,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
   jlong capacity = env->GetDirectBufferCapacity(model_buffer);
   if (!VerifyModel(buf, capacity)) {
     throwException(env, kIllegalArgumentException,
-                   "MappedByteBuffer is not a valid flatbuffer model");
+                   "ByteBuffer is not a valid flatbuffer model");
     return 0;
   }
 
@@ -395,8 +395,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
       buf, static_cast<size_t>(capacity), error_reporter);
   if (!model) {
     throwException(env, kIllegalArgumentException,
-                   "MappedByteBuffer does not encode a valid "
-                   "TensorFlowLite model: %s",
+                   "ByteBuffer does not encode a valid model: %s",
                    error_reporter->CachedErrorMessage());
     return 0;
   }
@@ -426,7 +425,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
   status = interpreter->AllocateTensors();
   if (status != kTfLiteOk) {
     throwException(env, kNullPointerException,
-                   "Internal error: Cannot allocate memory for the interpreter",
+                   "Internal error: Cannot allocate memory for the interpreter:"
+                   " %s",
                    error_reporter->CachedErrorMessage());
     return 0;
   }
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index 210d9437241f117ab281b627a4352fce7d340bcb..82007a6ab5be3492495125b1c20ed155907ae5a0 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -19,6 +19,8 @@ import static com.google.common.truth.Truth.assertThat;
 import static org.junit.Assert.fail;
 
 import java.io.File;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
@@ -69,6 +71,49 @@ public final class InterpreterTest {
     fileChannel.close();
   }
 
+  @Test
+  public void testRunWithDirectByteBufferModel() throws Exception {
+    Path path = MODEL_FILE.toPath();
+    FileChannel fileChannel =
+        (FileChannel) Files.newByteChannel(path, EnumSet.of(StandardOpenOption.READ));
+    ByteBuffer byteBuffer = ByteBuffer.allocateDirect((int) fileChannel.size());
+    byteBuffer.order(ByteOrder.nativeOrder());
+    fileChannel.read(byteBuffer);
+    Interpreter interpreter = new Interpreter(byteBuffer);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    interpreter.run(fourD, parsedOutputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {3.69f, 19.62f, 23.43f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    interpreter.close();
+    fileChannel.close();
+  }
+
+  @Test
+  public void testRunWithInvalidByteBufferModel() throws Exception {
+    Path path = MODEL_FILE.toPath();
+    FileChannel fileChannel =
+        (FileChannel) Files.newByteChannel(path, EnumSet.of(StandardOpenOption.READ));
+    ByteBuffer byteBuffer = ByteBuffer.allocate((int) fileChannel.size());
+    byteBuffer.order(ByteOrder.nativeOrder());
+    fileChannel.read(byteBuffer);
+    try {
+      Interpreter interpreter = new Interpreter(byteBuffer);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "Model ByteBuffer should be either a MappedByteBuffer"
+                  + " of the model file, or a direct ByteBuffer using ByteOrder.nativeOrder()");
+    }
+    fileChannel.close();
+  }
+
   @Test
   public void testRun() {
     Interpreter interpreter = new Interpreter(MODEL_FILE);
diff --git a/tensorflow/contrib/lite/kernels/Android.bp b/tensorflow/contrib/lite/kernels/Android.bp
index 0f3cdf6a628e306a2a3f9e5bde5f54e0bd6099d3..59262d398ef173008d18ed912fc64d7ea5cb97d9 100644
--- a/tensorflow/contrib/lite/kernels/Android.bp
+++ b/tensorflow/contrib/lite/kernels/Android.bp
@@ -40,8 +40,8 @@ cc_library_static {
         "arg_max.cc",
         "basic_rnn.cc",
         "batch_to_space_nd.cc",
-	"bidirectional_sequence_rnn.cc",
         "bidirectional_sequence_lstm.cc",
+        "bidirectional_sequence_rnn.cc",
         "cast.cc",
         "comparisons.cc",
         "concatenation.cc",
@@ -49,6 +49,7 @@ cc_library_static {
         "depthwise_conv.cc",
         "dequantize.cc",
         "div.cc",
+        "elementwise.cc", 	
         "embedding_lookup.cc",
         "embedding_lookup_sparse.cc",
         "exp.cc",
@@ -70,16 +71,19 @@ cc_library_static {
         "register.cc",
         "reshape.cc",
         "resize_bilinear.cc",
+        "select.cc",
         "skip_gram.cc",
+        "slice.cc",
         "space_to_batch_nd.cc",
         "space_to_depth.cc",
-	"split.cc",
+        "split.cc",
         "squeeze.cc",
         "strided_slice.cc",
         "sub.cc",
         "svdf.cc",
-	"topk_v2.cc",
+        "topk_v2.cc",
         "transpose.cc",
+        "transpose_conv.cc",
         "unidirectional_sequence_lstm.cc",
         "unidirectional_sequence_rnn.cc",
 	"internal/kernel_utils.cc",
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index feab18b5c23b43e5462707761812e08f58b01d7f..b7291dd379a6c09a70a78de7bc6c2f217b293b26 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -143,6 +143,7 @@ cc_library(
         "depthwise_conv.cc",
         "dequantize.cc",
         "div.cc",
+        "elementwise.cc",
         "embedding_lookup.cc",
         "embedding_lookup_sparse.cc",
         "exp.cc",
@@ -164,7 +165,9 @@ cc_library(
         "register.cc",
         "reshape.cc",
         "resize_bilinear.cc",
+        "select.cc",
         "skip_gram.cc",
+        "slice.cc",
         "space_to_batch_nd.cc",
         "space_to_depth.cc",
         "split.cc",
@@ -174,6 +177,7 @@ cc_library(
         "svdf.cc",
         "topk_v2.cc",
         "transpose.cc",
+        "transpose_conv.cc",
         "unidirectional_sequence_lstm.cc",
         "unidirectional_sequence_rnn.cc",
     ],
@@ -453,6 +457,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "elementwise_test",
+    size = "small",
+    srcs = ["elementwise_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "unidirectional_sequence_lstm_test",
     size = "small",
@@ -870,6 +887,53 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "select_test",
+    size = "small",
+    srcs = [
+        "select_test.cc",
+    ],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "slice_test",
+    size = "small",
+    srcs = [
+        "slice_test.cc",
+    ],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "transpose_conv_test",
+    size = "small",
+    srcs = ["transpose_conv_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
index 39a54c93962b33f3a787b3387d9a133119d0e80a..add36b46c0b8a4deab1e842d50194c8b99a3a20c 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -55,7 +55,7 @@ void Free(TfLiteContext* context, void* buffer) {
 TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
@@ -68,7 +68,7 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
 
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
@@ -95,7 +95,7 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) {
 
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
@@ -126,7 +126,7 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
 
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
@@ -153,9 +153,9 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
 TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
-  TfLiteTensor* alpha = GetInput(context, node, 1);
+  const TfLiteTensor* alpha = GetInput(context, node, 1);
 
   output->type = input->type;
 
@@ -179,7 +179,7 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
     case kTfLiteFloat32: {
@@ -191,13 +191,14 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently.");
+      context->ReportError(context, "Only float32 supported currently, got %d.",
+                           input->type);
       return kTfLiteError;
   }
 }
 
 TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
     case kTfLiteFloat32: {
@@ -211,13 +212,14 @@ TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently.");
+      context->ReportError(context, "Only float32 supported currently, got %d.",
+                           input->type);
       return kTfLiteError;
   }
 }
 
 TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
     case kTfLiteFloat32: {
@@ -229,14 +231,15 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently.");
+      context->ReportError(context, "Only float32 supported currently, got %d.",
+                           input->type);
       return kTfLiteError;
   }
 }
 
 TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
     case kTfLiteFloat32: {
@@ -256,7 +259,8 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently.");
+      context->ReportError(context, "Only float32 supported currently, got %d.",
+                           input->type);
       return kTfLiteError;
   }
 }
@@ -265,7 +269,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
 TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
     case kTfLiteFloat32: {
@@ -285,14 +289,15 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
       break;
     }
     default:
-      context->ReportError(context, "Only float32 supported currently.");
+      context->ReportError(context, "Only float32 supported currently, got %d.",
+                           input->type);
       return kTfLiteError;
   }
   return kTfLiteOk;
 }
 
 // Takes a 2D tensor and perform softmax along the second dimension.
-void Softmax2DFloat(TfLiteTensor* input, TfLiteTensor* output,
+void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output,
                     TfLiteSoftmaxParams* params) {
   const int batch_size = input->dims->data[0];
   const int input_size = input->dims->data[1];
@@ -327,7 +332,7 @@ void Softmax2DFloat(TfLiteTensor* input, TfLiteTensor* output,
   }
 }
 
-void Softmax2DQuantized(TfLiteTensor* input, TfLiteTensor* output,
+void Softmax2DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
                         TfLiteSoftmaxParams* params, OpData* data) {
   // TODO(ahentz): this is arguably a dirty trick. Since the implementation
   // always traverses the last dimension of a 4D tensor, we will pretend our 2D
@@ -343,14 +348,14 @@ void Softmax2DQuantized(TfLiteTensor* input, TfLiteTensor* output,
 }
 
 // Takes a 4D tensor and perform softmax along the forth dimension.
-void Softmax4DFloat(TfLiteTensor* input, TfLiteTensor* output,
+void Softmax4DFloat(const TfLiteTensor* input, TfLiteTensor* output,
                     TfLiteSoftmaxParams* params) {
   optimized_ops::Softmax(GetTensorData<float>(input), GetTensorDims(input),
                          params->beta, GetTensorData<float>(output),
                          GetTensorDims(output));
 }
 
-void Softmax4DQuantized(TfLiteTensor* input, TfLiteTensor* output,
+void Softmax4DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
                         TfLiteSoftmaxParams* params, OpData* data) {
   optimized_ops::Softmax(GetTensorData<uint8_t>(input), GetTensorDims(input),
                          data->input_multiplier, data->input_left_shift,
@@ -362,7 +367,7 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteSoftmaxParams*>(node->builtin_data);
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
 
   // TODO(ahentz): consider an implementation that works for many (all?)
@@ -377,8 +382,9 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
         Softmax4DFloat(input, output, params);
         return kTfLiteOk;
       }
-      context->ReportError(context,
-                           "Only 2D and 4D tensors supported currently.");
+      context->ReportError(
+          context, "Only 2D and 4D tensors supported currently, got %dD.",
+          NumDimensions(input));
       return kTfLiteError;
     }
     case kTfLiteUInt8: {
@@ -390,19 +396,21 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
         Softmax4DQuantized(input, output, params, data);
         return kTfLiteOk;
       }
-      context->ReportError(context,
-                           "Only 2D and 4D tensors supported currently.");
+      context->ReportError(
+          context, "Only 2D and 4D tensors supported currently, got %dD.",
+          NumDimensions(input));
       return kTfLiteError;
     }
     default:
-      context->ReportError(context,
-                           "Only float32 and uint8_t supported currently.");
+      context->ReportError(
+          context, "Only float32 and uint8_t supported currently, got %d.",
+          input->type);
       return kTfLiteError;
   }
 }
 
 TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
     case kTfLiteFloat32:
@@ -411,18 +419,20 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
           GetTensorData<float>(output), GetTensorDims(output));
       return kTfLiteOk;
     default:
-      context->ReportError(context, "Only float32 supported currently.");
+      context->ReportError(context, "Only float32 supported currently., got %d",
+                           input->type);
       return kTfLiteError;
   }
 }
 
 TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, 0);
-  TfLiteTensor* alpha = GetInput(context, node, 1);
-  TfLiteTensor* output = GetOutput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* alpha = GetInput(context, node, 1);
+  const TfLiteTensor* output = GetOutput(context, node, 0);
 
   if (input->type != kTfLiteFloat32) {
-    context->ReportError(context, "Only float32 supported currently.");
+    context->ReportError(context, "Only float32 supported currently, got %d.",
+                         input->type);
     return kTfLiteError;
   }
   TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
index e0aa070e2d02cecb9c6ff500ab32b8ad2159db6e..7ca1e35489cba3b5d2567bc04e532fedf8a527a7 100644
--- a/tensorflow/contrib/lite/kernels/add.cc
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -57,8 +57,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
@@ -80,7 +80,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 template <KernelType kernel_type>
 void EvalAddFloat(TfLiteContext* context, TfLiteNode* node,
                   TfLiteAddParams* params, const OpData* data,
-                  TfLiteTensor* input1, TfLiteTensor* input2,
+                  const TfLiteTensor* input1, const TfLiteTensor* input2,
                   TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
@@ -109,7 +109,7 @@ void EvalAddFloat(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                       TfLiteAddParams* params, const OpData* data,
-                      TfLiteTensor* input1, TfLiteTensor* input2,
+                      const TfLiteTensor* input1, const TfLiteTensor* input2,
                       TfLiteTensor* output) {
   auto input1_offset = -input1->params.zero_point;
   auto input2_offset = -input2->params.zero_point;
@@ -164,8 +164,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
-  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
diff --git a/tensorflow/contrib/lite/kernels/arg_max.cc b/tensorflow/contrib/lite/kernels/arg_max.cc
index a2c5e4ceadbc905d22eb02b450c88745a351f58f..26f57e88962116f446e72fbc164d2747e8b633b4 100644
--- a/tensorflow/contrib/lite/kernels/arg_max.cc
+++ b/tensorflow/contrib/lite/kernels/arg_max.cc
@@ -33,8 +33,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* axis = GetInput(context, node, kAxis);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* axis = GetInput(context, node, kAxis);
   // Make sure the axis is only 1 dimension.
   TF_LITE_ENSURE_EQ(context, NumElements(axis), 1);
 
@@ -52,7 +52,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
       output->type = kTfLiteInt64;
       break;
     default:
-      context->ReportError(context, "Unknown index output data type");
+      context->ReportError(context, "Unknown index output data type: %d",
+                           params->output_type);
       return kTfLiteError;
   }
 
@@ -64,7 +65,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
       break;
 
     default:
-      context->ReportError(context, "Only float32 and int types are supported");
+      context->ReportError(
+          context,
+          "Unkonwn input type: %d, only float32 and int types are supported",
+          input->type);
       return kTfLiteError;
   }
 
@@ -79,12 +83,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 // The current impl actually ignores the axis argument.
 // Only determine the index of the maximum value in the last dimension.
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* axis = GetInput(context, node, kAxis);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* axis = GetInput(context, node, kAxis);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
 #define TF_LITE_ARG_MAX(data_type, axis_type, output_type)                     \
-  TF_LITE_ENSURE_EQ(context, GetTensorData<axis_type>(axis)[0], 3);            \
   optimized_ops::ArgMax(GetTensorData<axis_type>(axis),                        \
                         GetTensorData<data_type>(input), GetTensorDims(input), \
                         GetTensorData<output_type>(output),                    \
diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
index 602f3888c10b3790dc0328c817bdd83276544b25..91d8dd3fa71b4f2ac70c64c4923c5240b61a2b25 100644
--- a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
+++ b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
@@ -72,7 +72,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
@@ -102,7 +102,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params =
       reinterpret_cast<TfLiteAudioSpectrogramParams*>(node->user_data);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   TF_LITE_ENSURE(context, params->spectrogram->Initialize(params->window_size,
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc
index 2c5074eca3176c7f33a6f051b492dc41333257ed..7dc0c5656dca02a86339c558f4fe2babb4961695 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc
@@ -12,18 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <unistd.h>
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
+#include <stddef.h>
+#include <stdint.h>
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
 #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 namespace tflite {
@@ -35,20 +31,29 @@ constexpr int kInputTensor = 0;
 constexpr int kWeightsTensor = 1;
 constexpr int kRecurrentWeightsTensor = 2;
 constexpr int kBiasTensor = 3;
-constexpr int KHiddenStateTensor = 0;
+constexpr int kHiddenStateTensor = 0;
 constexpr int kOutputTensor = 1;
 
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* scratch_tensor_index = new int;
+  context->AddTensors(context, /*tensors_to_add=*/2, scratch_tensor_index);
+  return scratch_tensor_index;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<int*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Check we have all the inputs and outputs we need.
   TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
   TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
 
-  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
-  TfLiteTensor* input_weights =
-      &context->tensors[node->inputs->data[kWeightsTensor]];
-  TfLiteTensor* recurrent_weights =
-      &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
-  TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input_weights = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* recurrent_weights =
+      GetInput(context, node, kRecurrentWeightsTensor);
+  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
 
   // Check all the parameters of tensor match within themselves and match the
   // input configuration.
@@ -58,10 +63,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ASSERT_EQ(input_weights->dims->data[0], bias->dims->data[0]);
   TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[0], bias->dims->data[0]);
   TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[1], bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, input_weights->type, recurrent_weights->type);
 
-  TfLiteTensor* hidden_state =
-      &context->tensors[node->outputs->data[KHiddenStateTensor]];
-  TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+  TfLiteTensor* hidden_state = GetOutput(context, node, kHiddenStateTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   // Resize state.
   TfLiteIntArray* hidden_state_size_array = TfLiteIntArrayCreate(2);
@@ -80,25 +86,44 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context,
                     context->ResizeTensor(context, output, output_size_array));
 
+  // Allocate temporary tensors to store quantized values of input and
+  // hidden_state tensors.
+  if (input->type == kTfLiteFloat32 && input_weights->type == kTfLiteUInt8) {
+    int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+    TfLiteIntArrayFree(node->temporaries);
+    node->temporaries = TfLiteIntArrayCreate(2);
+    node->temporaries->data[0] = *scratch_tensor_index;
+    TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
+    input_quantized->type = kTfLiteUInt8;
+    input_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
+      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
+                                                       input_quantized_size));
+    }
+    node->temporaries->data[1] = *scratch_tensor_index + 1;
+    TfLiteTensor* hidden_state_quantized =
+        GetTemporary(context, node, /*index=*/1);
+    hidden_state_quantized->type = kTfLiteUInt8;
+    hidden_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(hidden_state_quantized->dims,
+                             hidden_state->dims)) {
+      TfLiteIntArray* hidden_state_quantized_size =
+          TfLiteIntArrayCopy(hidden_state->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, hidden_state_quantized,
+                                              hidden_state_quantized_size));
+    }
+  }
+
   return kTfLiteOk;
 }
 
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteRNNParams*>(node->builtin_data);
-
-  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
-  TfLiteTensor* input_weights =
-      &context->tensors[node->inputs->data[kWeightsTensor]];
-  TfLiteTensor* recurrent_weights =
-      &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
-  TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
-  TfLiteTensor* hidden_state =
-      &context->tensors[node->outputs->data[KHiddenStateTensor]];
-  TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
-
-  // Initialize the pointer bias.
-  const float* bias_ptr = bias->data.f;
-
+TfLiteStatus EvalFloat(const TfLiteTensor* input,
+                       const TfLiteTensor* input_weights,
+                       const TfLiteTensor* recurrent_weights,
+                       const TfLiteTensor* bias, const TfLiteRNNParams* params,
+                       TfLiteTensor* hidden_state, TfLiteTensor* output) {
   const int batch_size = input->dims->data[0];
   const int num_units = input_weights->dims->data[0];
   const int input_size = input->dims->data[1];
@@ -108,9 +133,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Initialize the pointer to input and output.
   const float* input_ptr_batch = input->data.f;
   float* output_ptr_batch = output->data.f;
-  // Initialize input_weights and recurrent_weights.
+  // Initialize input_weights, recurrent_weights and bias.
   const float* input_weights_ptr = input_weights->data.f;
   const float* recurrent_weights_ptr = recurrent_weights->data.f;
+  const float* bias_ptr = bias->data.f;
 
   kernel_utils::RnnBatchStep(input_ptr_batch, input_weights_ptr,
                              recurrent_weights_ptr, bias_ptr, input_size,
@@ -119,11 +145,82 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
+TfLiteStatus EvalQuantized(const TfLiteTensor* input,
+                           const TfLiteTensor* input_weights,
+                           const TfLiteTensor* recurrent_weights,
+                           const TfLiteTensor* bias,
+                           const TfLiteRNNParams* params,
+                           TfLiteTensor* input_scratch,
+                           TfLiteTensor* hidden_state_scratch,
+                           TfLiteTensor* hidden_state, TfLiteTensor* output) {
+  const int batch_size = input->dims->data[0];
+  const int num_units = input_weights->dims->data[0];
+  const int input_size = input->dims->data[1];
+
+  // Initialize the pointer to hidden state.
+  float* hidden_state_ptr_batch = hidden_state->data.f;
+  // Initialize the pointer to input and output.
+  const float* input_ptr_batch = input->data.f;
+  float* output_ptr_batch = output->data.f;
+  // Initialize input_weights, recurrent_weights and bias.
+  const int8_t* input_weights_ptr =
+      reinterpret_cast<const int8_t*>(input_weights->data.uint8);
+  const int8_t* recurrent_weights_ptr =
+      reinterpret_cast<const int8_t*>(recurrent_weights->data.uint8);
+  const float* bias_ptr = bias->data.f;
+  // Get the scale of the quantized weights.
+  float input_weights_scale = input_weights->params.scale;
+  float recurrent_weights_scale = recurrent_weights->params.scale;
+  // Initialize temporary storage for quantized values.
+  int8_t* quantized_input_ptr =
+      reinterpret_cast<int8_t*>(input_scratch->data.uint8);
+  int8_t* quantized_hidden_state_ptr =
+      reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8);
+
+  kernel_utils::RnnBatchStep(
+      input_ptr_batch, input_weights_ptr, input_weights_scale,
+      recurrent_weights_ptr, recurrent_weights_scale, bias_ptr, input_size,
+      num_units, batch_size, params->activation, quantized_input_ptr,
+      quantized_hidden_state_ptr, hidden_state_ptr_batch, output_ptr_batch);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteRNNParams*>(node->builtin_data);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input_weights = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* recurrent_weights =
+      GetInput(context, node, kRecurrentWeightsTensor);
+  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
+  TfLiteTensor* hidden_state = GetOutput(context, node, kHiddenStateTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // We already checked that weight types are consistent, so branch on one.
+  switch (input_weights->type) {
+    case kTfLiteFloat32:
+      return EvalFloat(input, input_weights, recurrent_weights, bias, params,
+                       hidden_state, output);
+    case kTfLiteUInt8: {
+      // TODO(mirkov): implement eval with quantized inputs as well.
+      TfLiteTensor* input_quantized = GetTemporary(context, node, 0);
+      TfLiteTensor* hidden_state_quantized = GetTemporary(context, node, 1);
+      return EvalQuantized(input, input_weights, recurrent_weights, bias,
+                           params, input_quantized, hidden_state_quantized,
+                           hidden_state, output);
+    }
+    default:
+      context->ReportError(context, "Type %d not currently supported.",
+                           input_weights->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
 }  // namespace rnn
 
 TfLiteRegistration* Register_RNN() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-                                 rnn::Prepare, rnn::Eval};
+  static TfLiteRegistration r = {rnn::Init, rnn::Free, rnn::Prepare, rnn::Eval};
   return &r;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
index fa7ef525db47c93f98951604cd04da66196422d7..96465fcaf0a78527237faa7b82ddbc32ec56d114 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
+++ b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite RNN op.
 
-#include <iomanip>
+#include <string.h>
+#include <initializer_list>
+#include <memory>
 #include <vector>
 
 #include <gmock/gmock.h>
@@ -122,13 +124,62 @@ static float rnn_golden_output[] = {
     0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
     0.628881,   3.58099,    1.49974,   0};
 
+static std::initializer_list<float> rnn_weights = {
+    0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
+    0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
+    0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
+    -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
+    -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
+    -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
+    -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
+    0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
+    0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
+    0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
+    -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
+    0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
+    -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
+    -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
+    0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
+    0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
+    0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
+    -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
+    0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
+    0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
+    -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
+    0.277308,    0.415818};
+
+static std::initializer_list<float> rnn_recurrent_weights = {
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1};
+
+static std::initializer_list<float> rnn_bias = {
+    0.065691948, -0.69055247, 0.1107955,  -0.97084129, -0.23957068, -0.23566568,
+    -0.389184,   0.47481549,  -0.4791103, 0.29931796,  0.10463274,  0.83918178,
+    0.37197268,  0.61957061,  0.3956964,  -0.37609905};
+
 class RNNOpModel : public SingleOpModel {
  public:
-  RNNOpModel(int batches, int units, int size)
+  RNNOpModel(int batches, int units, int size,
+             const TensorType& weights = TensorType_FLOAT32,
+             const TensorType& recurrent_weights = TensorType_FLOAT32)
       : batches_(batches), units_(units), input_size_(size) {
     input_ = AddInput(TensorType_FLOAT32);
-    weights_ = AddInput(TensorType_FLOAT32);
-    recurrent_weights_ = AddInput(TensorType_FLOAT32);
+    weights_ = AddInput(weights);
+    recurrent_weights_ = AddInput(recurrent_weights);
     bias_ = AddInput(TensorType_FLOAT32);
     hidden_state_ = AddOutput(TensorType_FLOAT32);
     output_ = AddOutput(TensorType_FLOAT32);
@@ -173,7 +224,7 @@ class RNNOpModel : public SingleOpModel {
   int num_units() { return units_; }
   int num_batches() { return batches_; }
 
- private:
+ protected:
   int input_;
   int weights_;
   int recurrent_weights_;
@@ -186,53 +237,26 @@ class RNNOpModel : public SingleOpModel {
   int input_size_;
 };
 
-TEST(FullyConnectedOpTest, BlackBoxTest) {
+// The hybrid model has quantized weights and recurrent_weights.
+class HybridRNNOpModel : public RNNOpModel {
+ public:
+  HybridRNNOpModel(int batches, int units, int size)
+      : RNNOpModel(batches, units, size, TensorType_UINT8, TensorType_UINT8) {}
+
+  void SetWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(weights_, f);
+  }
+
+  void SetRecurrentWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(recurrent_weights_, f);
+  }
+};
+
+TEST(RnnOpTest, BlackBoxTest) {
   RNNOpModel rnn(2, 16, 8);
-  rnn.SetWeights(
-      {0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
-       0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
-       0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
-       -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
-       -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
-       -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
-       -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
-       0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
-       0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
-       0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
-       -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
-       0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
-       -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
-       -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
-       0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
-       0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
-       0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
-       -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
-       0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
-       0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
-       -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
-       0.277308,    0.415818});
-
-  rnn.SetBias({0.065691948, -0.69055247, 0.1107955, -0.97084129, -0.23957068,
-               -0.23566568, -0.389184, 0.47481549, -0.4791103, 0.29931796,
-               0.10463274, 0.83918178, 0.37197268, 0.61957061, 0.3956964,
-               -0.37609905});
-
-  rnn.SetRecurrentWeights({0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1});
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
 
   rnn.ResetHiddenState();
   const int input_sequence_size = sizeof(rnn_input) / sizeof(float) /
@@ -256,6 +280,35 @@ TEST(FullyConnectedOpTest, BlackBoxTest) {
   }
 }
 
+TEST(HybridRnnOpTest, BlackBoxTest) {
+  HybridRNNOpModel rnn(2, 16, 8);
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
+
+  rnn.ResetHiddenState();
+  const int input_sequence_size = sizeof(rnn_input) / sizeof(float) /
+                                  (rnn.input_size() * rnn.num_batches());
+
+  for (int i = 0; i < input_sequence_size; i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    rnn.SetInput(0, batch_start, batch_end);
+    rnn.SetInput(rnn.input_size(), batch_start, batch_end);
+
+    rnn.Invoke();
+
+    float* golden_start = rnn_golden_output + i * rnn.num_units();
+    float* golden_end = golden_start + rnn.num_units();
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    expected.insert(expected.end(), golden_start, golden_end);
+
+    EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                     expected, /*max_abs_error=*/0.0104)));
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
index 90edf4f9e3683f2987dd8299a1cd5233caa24479..c8cee88edfdbf42f422f66e4d0ca6eeb5eccbf8d 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
@@ -40,9 +40,9 @@ struct BatchToSpaceNDContext {
     crops = GetInput(context, node, 2);
     output = GetOutput(context, node, 0);
   }
-  TfLiteTensor* input;
-  TfLiteTensor* block_shape;
-  TfLiteTensor* crops;
+  const TfLiteTensor* input;
+  const TfLiteTensor* block_shape;
+  const TfLiteTensor* crops;
   TfLiteTensor* output;
 };
 
@@ -66,12 +66,10 @@ TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
   TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->crops),
                     kSpatialDimensionNum);
 
-  // TODO(ycling): Add crops as part of calculation. Remove check for a crops
-  // containing all zeroes.
-  TF_LITE_ENSURE_EQ(context, crops[0], 0);
-  TF_LITE_ENSURE_EQ(context, crops[1], 0);
-  TF_LITE_ENSURE_EQ(context, crops[2], 0);
-  TF_LITE_ENSURE_EQ(context, crops[3], 0);
+  TF_LITE_ENSURE(context, crops[0] >= 0);
+  TF_LITE_ENSURE(context, crops[1] >= 0);
+  TF_LITE_ENSURE(context, crops[2] >= 0);
+  TF_LITE_ENSURE(context, crops[3] >= 0);
 
   // Number of batch must be multiple of (block_shape[0] * block_shape[1]).
   TF_LITE_ENSURE_EQ(context,
@@ -79,8 +77,16 @@ TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
 
   const int output_batch_size =
       input_size->data[0] / (block_shape[0] * block_shape[1]);
-  const int output_height = input_size->data[1] * block_shape[0];
-  const int output_width = input_size->data[2] * block_shape[1];
+
+  const int crops_top = crops[0];
+  const int crops_bottom = crops[1];
+  const int crops_left = crops[2];
+  const int crops_right = crops[3];
+  const int output_height =
+      input_size->data[1] * block_shape[0] - crops_top - crops_bottom;
+  const int output_width =
+      input_size->data[2] * block_shape[1] - crops_left - crops_right;
+
   const int output_channel_size = input_size->data[3];
 
   TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
@@ -157,8 +163,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       }
       break;
     default:
-      context->ReportError(context,
-                           "Type is currently not supported by BatchToSpace.");
+      context->ReportError(
+          context, "Type %d is currently not supported by BatchToSpace.",
+          op_context.input->type);
       return kTfLiteError;
   }
 #undef TF_LITE_BATCH_TO_SPACE_ND
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
index 8485cde1b40066f2070855bca91ea78a9f80e83c..95b025c1b30cc627cf5858ec17f8ff7c57f7bd95 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
@@ -120,16 +120,16 @@ TEST(BatchToSpaceNDOpTest, InvalidShapeTest) {
 }
 
 TEST(BatchToSpaceNDOpTest, InvalidCropsConstTest) {
-  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 1}),
-               "1 != 0");
+  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, -1}),
+               "crops.3. >= 0 was not true.");
 }
 
 TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) {
   BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.SetBlockShape({2, 2});
-  m.SetCrops({0, 0, 1, 0});
-  EXPECT_DEATH(m.Invoke(), "1 != 0");
+  m.SetCrops({0, 0, -1, 0});
+  EXPECT_DEATH(m.Invoke(), "crops.2. >= 0 was not true.");
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
index a35ba23cedec437206caa780f4965272d0afb7a8..3425288f027a6fd9eb65f730bc7d039c832ace1c 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
@@ -135,7 +135,7 @@ TfLiteStatus CheckLstmTensorDimensions(
   TF_LITE_ENSURE(context, params->cell_clip >= 0);
   TF_LITE_ENSURE(context, params->proj_clip >= 0);
 
-  TfLiteTensor* input_to_input_weights =
+  const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, input_to_input_weights_tensor);
   if (input_to_input_weights) {
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
@@ -143,19 +143,19 @@ TfLiteStatus CheckLstmTensorDimensions(
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
   }
 
-  TfLiteTensor* input_to_forget_weights =
+  const TfLiteTensor* input_to_forget_weights =
       GetInput(context, node, input_to_forget_weights_tensor);
   TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
   TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
 
-  TfLiteTensor* input_to_cell_weights =
+  const TfLiteTensor* input_to_cell_weights =
       GetInput(context, node, input_to_cell_weights_tensor);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
 
-  TfLiteTensor* recurrent_to_input_weights =
+  const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, recurrent_to_input_weights_tensor);
   if (recurrent_to_input_weights) {
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
@@ -165,7 +165,7 @@ TfLiteStatus CheckLstmTensorDimensions(
                       n_output);
   }
 
-  TfLiteTensor* recurrent_to_forget_weights =
+  const TfLiteTensor* recurrent_to_forget_weights =
       GetInput(context, node, recurrent_to_forget_weights_tensor);
   TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
@@ -173,7 +173,7 @@ TfLiteStatus CheckLstmTensorDimensions(
   TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
                     n_output);
 
-  TfLiteTensor* recurrent_to_cell_weights =
+  const TfLiteTensor* recurrent_to_cell_weights =
       GetInput(context, node, recurrent_to_cell_weights_tensor);
   TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
@@ -189,21 +189,21 @@ TfLiteStatus CheckLstmTensorDimensions(
        (recurrent_to_input_weights == nullptr));
   TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
 
-  TfLiteTensor* cell_to_input_weights =
+  const TfLiteTensor* cell_to_input_weights =
       GetOptionalInputTensor(context, node, cell_to_input_weights_tensor);
   if (cell_to_input_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
   }
 
-  TfLiteTensor* cell_to_forget_weights =
+  const TfLiteTensor* cell_to_forget_weights =
       GetOptionalInputTensor(context, node, cell_to_forget_weights_tensor);
   if (cell_to_forget_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
   }
 
-  TfLiteTensor* cell_to_output_weights =
+  const TfLiteTensor* cell_to_output_weights =
       GetOptionalInputTensor(context, node, cell_to_output_weights_tensor);
   if (cell_to_output_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
@@ -222,7 +222,7 @@ TfLiteStatus CheckLstmTensorDimensions(
   TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
 
   // Make sure the input gate bias is present only when not a CIFG-LSTM.
-  TfLiteTensor* input_gate_bias =
+  const TfLiteTensor* input_gate_bias =
       GetOptionalInputTensor(context, node, input_gate_bias_tensor);
   if (use_cifg) {
     TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
@@ -231,21 +231,22 @@ TfLiteStatus CheckLstmTensorDimensions(
     TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
   }
 
-  TfLiteTensor* forget_gate_bias =
+  const TfLiteTensor* forget_gate_bias =
       GetInput(context, node, forget_gate_bias_tensor);
   TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
 
-  TfLiteTensor* cell_bias = GetInput(context, node, cell_gate_bias_tensor);
+  const TfLiteTensor* cell_bias =
+      GetInput(context, node, cell_gate_bias_tensor);
   TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
 
-  TfLiteTensor* output_gate_bias =
+  const TfLiteTensor* output_gate_bias =
       GetInput(context, node, output_gate_bias_tensor);
   TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
 
-  TfLiteTensor* projection_weights =
+  const TfLiteTensor* projection_weights =
       GetOptionalInputTensor(context, node, projection_weights_tensor);
   if (projection_weights) {
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
@@ -253,7 +254,7 @@ TfLiteStatus CheckLstmTensorDimensions(
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
   }
 
-  TfLiteTensor* projection_bias =
+  const TfLiteTensor* projection_bias =
       GetOptionalInputTensor(context, node, projection_bias_tensor);
   if (projection_bias) {
     TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
@@ -312,20 +313,20 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // Inferring batch size, number of outputs and sequence length and
   // number of cells from the input tensors.
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TF_LITE_ENSURE(context, input->dims->size > 1);
   const int max_time = input->dims->data[0];
   const int n_batch = input->dims->data[1];
   const int n_input = input->dims->data[2];
 
-  TfLiteTensor* fw_input_to_output_weights =
+  const TfLiteTensor* fw_input_to_output_weights =
       GetInput(context, node, kFwInputToOutputWeightsTensor);
   const int n_fw_cell = fw_input_to_output_weights->dims->data[0];
   TF_LITE_ENSURE_EQ(context, fw_input_to_output_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, fw_input_to_output_weights->dims->data[1],
                     n_input);
 
-  TfLiteTensor* fw_recurrent_to_output_weights =
+  const TfLiteTensor* fw_recurrent_to_output_weights =
       GetInput(context, node, kFwRecurrentToOutputWeightsTensor);
   TF_LITE_ENSURE_EQ(context, fw_recurrent_to_output_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, fw_recurrent_to_output_weights->dims->data[0],
@@ -373,7 +374,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   fw_output_state->allocation_type = kTfLiteArenaRwPersistent;
   fw_cell_state->allocation_type = kTfLiteArenaRwPersistent;
 
-  TfLiteTensor* fw_input_to_input_weights =
+  const TfLiteTensor* fw_input_to_input_weights =
       GetOptionalInputTensor(context, node, kFwInputToInputWeightsTensor);
   const bool fw_use_cifg = (fw_input_to_input_weights == nullptr);
   TfLiteIntArray* fw_scratch_buffer_size = TfLiteIntArrayCreate(2);
@@ -388,14 +389,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_scratch_buffer,
                                                    fw_scratch_buffer_size));
   // Same for the backward cell.
-  TfLiteTensor* bw_input_to_output_weights =
+  const TfLiteTensor* bw_input_to_output_weights =
       GetInput(context, node, kBwInputToOutputWeightsTensor);
   const int n_bw_cell = bw_input_to_output_weights->dims->data[0];
   TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->data[1],
                     n_input);
 
-  TfLiteTensor* bw_recurrent_to_output_weights =
+  const TfLiteTensor* bw_recurrent_to_output_weights =
       GetInput(context, node, kBwRecurrentToOutputWeightsTensor);
   TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->data[0],
@@ -441,7 +442,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   bw_output_state->allocation_type = kTfLiteArenaRwPersistent;
   bw_cell_state->allocation_type = kTfLiteArenaRwPersistent;
 
-  TfLiteTensor* bw_input_to_input_weights =
+  const TfLiteTensor* bw_input_to_input_weights =
       GetOptionalInputTensor(context, node, kBwInputToInputWeightsTensor);
   const bool bw_use_cifg = (bw_input_to_input_weights == nullptr);
   TfLiteIntArray* bw_scratch_buffer_size = TfLiteIntArrayCreate(2);
@@ -463,48 +464,49 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
 
   // Input tensor.
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const int max_time = input->dims->data[0];
   const int n_batch = input->dims->data[1];
   const int n_input = input->dims->data[2];
 
   // Tensors for the forward cell.
-  TfLiteTensor* fw_input_to_input_weights =
+  const TfLiteTensor* fw_input_to_input_weights =
       GetOptionalInputTensor(context, node, kFwInputToInputWeightsTensor);
-  TfLiteTensor* fw_input_to_forget_weights =
+  const TfLiteTensor* fw_input_to_forget_weights =
       GetInput(context, node, kFwInputToForgetWeightsTensor);
-  TfLiteTensor* fw_input_to_cell_weights =
+  const TfLiteTensor* fw_input_to_cell_weights =
       GetInput(context, node, kFwInputToCellWeightsTensor);
-  TfLiteTensor* fw_input_to_output_weights =
+  const TfLiteTensor* fw_input_to_output_weights =
       GetInput(context, node, kFwInputToOutputWeightsTensor);
 
-  TfLiteTensor* fw_recurrent_to_input_weights =
+  const TfLiteTensor* fw_recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kFwRecurrentToInputWeightsTensor);
-  TfLiteTensor* fw_recurrent_to_forget_weights =
+  const TfLiteTensor* fw_recurrent_to_forget_weights =
       GetInput(context, node, kFwRecurrentToForgetWeightsTensor);
-  TfLiteTensor* fw_recurrent_to_cell_weights =
+  const TfLiteTensor* fw_recurrent_to_cell_weights =
       GetInput(context, node, kFwRecurrentToCellWeightsTensor);
-  TfLiteTensor* fw_recurrent_to_output_weights =
+  const TfLiteTensor* fw_recurrent_to_output_weights =
       GetInput(context, node, kFwRecurrentToOutputWeightsTensor);
 
-  TfLiteTensor* fw_cell_to_input_weights =
+  const TfLiteTensor* fw_cell_to_input_weights =
       GetOptionalInputTensor(context, node, kFwCellToInputWeightsTensor);
-  TfLiteTensor* fw_cell_to_forget_weights =
+  const TfLiteTensor* fw_cell_to_forget_weights =
       GetOptionalInputTensor(context, node, kFwCellToForgetWeightsTensor);
-  TfLiteTensor* fw_cell_to_output_weights =
+  const TfLiteTensor* fw_cell_to_output_weights =
       GetOptionalInputTensor(context, node, kFwCellToOutputWeightsTensor);
 
-  TfLiteTensor* fw_input_gate_bias =
+  const TfLiteTensor* fw_input_gate_bias =
       GetOptionalInputTensor(context, node, kFwInputGateBiasTensor);
-  TfLiteTensor* fw_forget_gate_bias =
+  const TfLiteTensor* fw_forget_gate_bias =
       GetInput(context, node, kFwForgetGateBiasTensor);
-  TfLiteTensor* fw_cell_bias = GetInput(context, node, kFwCellGateBiasTensor);
-  TfLiteTensor* fw_output_gate_bias =
+  const TfLiteTensor* fw_cell_bias =
+      GetInput(context, node, kFwCellGateBiasTensor);
+  const TfLiteTensor* fw_output_gate_bias =
       GetInput(context, node, kFwOutputGateBiasTensor);
 
-  TfLiteTensor* fw_projection_weights =
+  const TfLiteTensor* fw_projection_weights =
       GetOptionalInputTensor(context, node, kFwProjectionWeightsTensor);
-  TfLiteTensor* fw_projection_bias =
+  const TfLiteTensor* fw_projection_bias =
       GetOptionalInputTensor(context, node, kFwProjectionBiasTensor);
 
   TfLiteTensor* fw_output_state =
@@ -513,42 +515,43 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
 
   // Tensors for the backward cell.
-  TfLiteTensor* bw_input_to_input_weights =
+  const TfLiteTensor* bw_input_to_input_weights =
       GetOptionalInputTensor(context, node, kBwInputToInputWeightsTensor);
-  TfLiteTensor* bw_input_to_forget_weights =
+  const TfLiteTensor* bw_input_to_forget_weights =
       GetInput(context, node, kBwInputToForgetWeightsTensor);
-  TfLiteTensor* bw_input_to_cell_weights =
+  const TfLiteTensor* bw_input_to_cell_weights =
       GetInput(context, node, kBwInputToCellWeightsTensor);
-  TfLiteTensor* bw_input_to_output_weights =
+  const TfLiteTensor* bw_input_to_output_weights =
       GetInput(context, node, kBwInputToOutputWeightsTensor);
 
-  TfLiteTensor* bw_recurrent_to_input_weights =
+  const TfLiteTensor* bw_recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kBwRecurrentToInputWeightsTensor);
-  TfLiteTensor* bw_recurrent_to_forget_weights =
+  const TfLiteTensor* bw_recurrent_to_forget_weights =
       GetInput(context, node, kBwRecurrentToForgetWeightsTensor);
-  TfLiteTensor* bw_recurrent_to_cell_weights =
+  const TfLiteTensor* bw_recurrent_to_cell_weights =
       GetInput(context, node, kBwRecurrentToCellWeightsTensor);
-  TfLiteTensor* bw_recurrent_to_output_weights =
+  const TfLiteTensor* bw_recurrent_to_output_weights =
       GetInput(context, node, kBwRecurrentToOutputWeightsTensor);
 
-  TfLiteTensor* bw_cell_to_input_weights =
+  const TfLiteTensor* bw_cell_to_input_weights =
       GetOptionalInputTensor(context, node, kBwCellToInputWeightsTensor);
-  TfLiteTensor* bw_cell_to_forget_weights =
+  const TfLiteTensor* bw_cell_to_forget_weights =
       GetOptionalInputTensor(context, node, kBwCellToForgetWeightsTensor);
-  TfLiteTensor* bw_cell_to_output_weights =
+  const TfLiteTensor* bw_cell_to_output_weights =
       GetOptionalInputTensor(context, node, kBwCellToOutputWeightsTensor);
 
-  TfLiteTensor* bw_input_gate_bias =
+  const TfLiteTensor* bw_input_gate_bias =
       GetOptionalInputTensor(context, node, kBwInputGateBiasTensor);
-  TfLiteTensor* bw_forget_gate_bias =
+  const TfLiteTensor* bw_forget_gate_bias =
       GetInput(context, node, kBwForgetGateBiasTensor);
-  TfLiteTensor* bw_cell_bias = GetInput(context, node, kBwCellGateBiasTensor);
-  TfLiteTensor* bw_output_gate_bias =
+  const TfLiteTensor* bw_cell_bias =
+      GetInput(context, node, kBwCellGateBiasTensor);
+  const TfLiteTensor* bw_output_gate_bias =
       GetInput(context, node, kBwOutputGateBiasTensor);
 
-  TfLiteTensor* bw_projection_weights =
+  const TfLiteTensor* bw_projection_weights =
       GetOptionalInputTensor(context, node, kBwProjectionWeightsTensor);
-  TfLiteTensor* bw_projection_bias =
+  const TfLiteTensor* bw_projection_bias =
       GetOptionalInputTensor(context, node, kBwProjectionBiasTensor);
 
   TfLiteTensor* bw_output_state =
diff --git a/tensorflow/contrib/lite/kernels/cast.cc b/tensorflow/contrib/lite/kernels/cast.cc
index 17ef2c572ebbfa54ba6856f7eebbcd6fd9e63868..60770ca0aa8b85d9710d26beca3d4d603da5db2f 100644
--- a/tensorflow/contrib/lite/kernels/cast.cc
+++ b/tensorflow/contrib/lite/kernels/cast.cc
@@ -32,7 +32,7 @@ constexpr int kOutputTensor = 0;
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   // TODO(ahentz): these two checks would make the new implementation
@@ -69,6 +69,9 @@ TfLiteStatus copyToTensor(const FromT* in, TfLiteTensor* out,
     case kTfLiteFloat32:
       copyCast(in, out->data.f, num_elements);
       break;
+    case kTfLiteBool:
+      copyCast(in, out->data.b, num_elements);
+      break;
     default:
       // Unsupported type.
       return kTfLiteError;
@@ -77,7 +80,7 @@ TfLiteStatus copyToTensor(const FromT* in, TfLiteTensor* out,
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   const int num_elements = NumElements(input);
   TF_LITE_ENSURE_EQ(context, num_elements, NumElements(output));
@@ -90,6 +93,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return copyToTensor(input->data.uint8, output, num_elements);
     case kTfLiteFloat32:
       return copyToTensor(input->data.f, output, num_elements);
+    case kTfLiteBool:
+      return copyToTensor(input->data.b, output, num_elements);
     default:
       // Unsupported type.
       return kTfLiteError;
diff --git a/tensorflow/contrib/lite/kernels/cast_test.cc b/tensorflow/contrib/lite/kernels/cast_test.cc
index 4e56482a371550b6275a6380e2beebe3cef958ff..53e20007378392467356ab29ecb8b217bb7a9e89 100644
--- a/tensorflow/contrib/lite/kernels/cast_test.cc
+++ b/tensorflow/contrib/lite/kernels/cast_test.cc
@@ -57,6 +57,22 @@ TEST(CastOpModel, CastFloatToInt) {
               ElementsAreArray({100, 20, 3, 0, 0, 1}));
 }
 
+TEST(CastOpModel, CastFloatToBool) {
+  CastOpModel m({TensorType_FLOAT32, {3, 2}}, {TensorType_BOOL, {3, 2}});
+  m.PopulateTensor<float>(m.input(), {100.f, -1.0f, 0.f, 0.4f, 0.999f, 1.1f});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<bool>(m.output()),
+              ElementsAreArray({true, true, false, true, true, true}));
+}
+
+TEST(CastOpModel, CastBoolToFloat) {
+  CastOpModel m({TensorType_BOOL, {3, 2}}, {TensorType_FLOAT32, {3, 2}});
+  m.PopulateTensor<bool>(m.input(), {true, true, false, true, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray({1.f, 1.0f, 0.f, 1.0f, 0.0f, 1.0f}));
+}
+
 }  // namespace
 }  // namespace tflite
 int main(int argc, char** argv) {
diff --git a/tensorflow/contrib/lite/kernels/comparisons.cc b/tensorflow/contrib/lite/kernels/comparisons.cc
index 87c413cb982dafd239818040d067738e786d43ff..3b81062cd42f04582b33ea919ef2742d3d869c22 100644
--- a/tensorflow/contrib/lite/kernels/comparisons.cc
+++ b/tensorflow/contrib/lite/kernels/comparisons.cc
@@ -28,12 +28,12 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
-TfLiteStatus LessPrepare(TfLiteContext* context, TfLiteNode* node) {
+TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   // Don't support string and bool.
@@ -56,61 +56,143 @@ TfLiteStatus LessPrepare(TfLiteContext* context, TfLiteNode* node) {
   return context->ResizeTensor(context, output, output_size);
 }
 
-TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+#define TF_LITE_COMPARISON(type, opname, requires_broadcast)    \
+  requires_broadcast                                            \
+      ? reference_ops::Broadcast##opname(                       \
+            GetTensorData<type>(input1), GetTensorDims(input1), \
+            GetTensorData<type>(input2), GetTensorDims(input2), \
+            GetTensorData<bool>(output), GetTensorDims(output)) \
+      : reference_ops::opname(                                  \
+            GetTensorData<type>(input1), GetTensorDims(input1), \
+            GetTensorData<type>(input2), GetTensorDims(input2), \
+            GetTensorData<bool>(output), GetTensorDims(output));
+
+TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  bool requires_broadcast = !HaveSameShapes(input1, input2);
+  // TODO(renjieliu): Support quantized data.
+  switch (input1->type) {
+    case kTfLiteFloat32:
+      TF_LITE_COMPARISON(float, Greater, requires_broadcast);
+      break;
+    case kTfLiteInt32:
+      TF_LITE_COMPARISON(int32_t, Greater, requires_broadcast);
+      break;
+    case kTfLiteInt64:
+      TF_LITE_COMPARISON(int64_t, Greater, requires_broadcast);
+      break;
+    default:
+      context->ReportError(context,
+                           "Does not support type %d, requires float|int",
+                           input1->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
 
+TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   bool requires_broadcast = !HaveSameShapes(input1, input2);
+  // TODO(renjieliu): Support quantized data.
+  switch (input1->type) {
+    case kTfLiteFloat32:
+      TF_LITE_COMPARISON(float, GreaterEqual, requires_broadcast);
+      break;
+    case kTfLiteInt32:
+      TF_LITE_COMPARISON(int32_t, GreaterEqual, requires_broadcast);
+      break;
+    case kTfLiteInt64:
+      TF_LITE_COMPARISON(int64_t, GreaterEqual, requires_broadcast);
+      break;
+    default:
+      context->ReportError(context,
+                           "Does not support type %d, requires float|int",
+                           input1->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
 
-#define TF_LITE_LESS(type, opname)                                          \
-  reference_ops::opname(GetTensorData<type>(input1), GetTensorDims(input1), \
-                        GetTensorData<type>(input2), GetTensorDims(input2), \
-                        GetTensorData<bool>(output), GetTensorDims(output));
+TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  bool requires_broadcast = !HaveSameShapes(input1, input2);
+  // TODO(renjieliu): Support quantized data.
+  switch (input1->type) {
+    case kTfLiteFloat32:
+      TF_LITE_COMPARISON(float, Less, requires_broadcast);
+      break;
+    case kTfLiteInt32:
+      TF_LITE_COMPARISON(int32_t, Less, requires_broadcast);
+      break;
+    case kTfLiteInt64:
+      TF_LITE_COMPARISON(int64_t, Less, requires_broadcast);
+      break;
+    default:
+      context->ReportError(context,
+                           "Does not support type %d, requires float|int",
+                           input1->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
 
+TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  bool requires_broadcast = !HaveSameShapes(input1, input2);
   // TODO(renjieliu): Support quantized data.
-  if (requires_broadcast) {
-    switch (input1->type) {
-      case kTfLiteFloat32:
-        TF_LITE_LESS(float, BroadcastLess);
-        break;
-      case kTfLiteInt32:
-        TF_LITE_LESS(int32_t, BroadcastLess);
-        break;
-      case kTfLiteInt64:
-        TF_LITE_LESS(int64_t, BroadcastLess);
-        break;
-      default:
-        context->ReportError(context,
-                             "Does not support type other than float|int");
-        return kTfLiteError;
-    }
-  } else {
-    switch (input1->type) {
-      case kTfLiteFloat32:
-        TF_LITE_LESS(float, Less);
-        break;
-      case kTfLiteInt32:
-        TF_LITE_LESS(int32_t, Less);
-        break;
-      case kTfLiteInt64:
-        TF_LITE_LESS(int64_t, Less);
-        break;
-      default:
-        context->ReportError(context,
-                             "Does not support type other than float|int");
-        return kTfLiteError;
-    }
+  switch (input1->type) {
+    case kTfLiteFloat32:
+      TF_LITE_COMPARISON(float, LessEqual, requires_broadcast);
+      break;
+    case kTfLiteInt32:
+      TF_LITE_COMPARISON(int32_t, LessEqual, requires_broadcast);
+      break;
+    case kTfLiteInt64:
+      TF_LITE_COMPARISON(int64_t, LessEqual, requires_broadcast);
+      break;
+    default:
+      context->ReportError(context,
+                           "Does not support type %d, requires float|int",
+                           input1->type);
+      return kTfLiteError;
   }
-#undef TF_LITE_LESS
   return kTfLiteOk;
 }
 
 }  // namespace comparisons
 
+TfLiteRegistration* Register_GREATER() {
+  static TfLiteRegistration r = {nullptr, nullptr,
+                                 comparisons::ComparisonPrepare,
+                                 comparisons::GreaterEval};
+  return &r;
+}
+
+TfLiteRegistration* Register_GREATER_EQUAL() {
+  static TfLiteRegistration r = {nullptr, nullptr,
+                                 comparisons::ComparisonPrepare,
+                                 comparisons::GreaterEqualEval};
+  return &r;
+}
+
 TfLiteRegistration* Register_LESS() {
-  static TfLiteRegistration r = {nullptr, nullptr, comparisons::LessPrepare,
-                                 comparisons::LessEval};
+  static TfLiteRegistration r = {
+      nullptr, nullptr, comparisons::ComparisonPrepare, comparisons::LessEval};
+  return &r;
+}
+
+TfLiteRegistration* Register_LESS_EQUAL() {
+  static TfLiteRegistration r = {nullptr, nullptr,
+                                 comparisons::ComparisonPrepare,
+                                 comparisons::LessEqualEval};
   return &r;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/comparisons_test.cc b/tensorflow/contrib/lite/kernels/comparisons_test.cc
index da2d7f858984a4d3bb09ca8e485fe1599bea7ded..835d238d36d1757a27119ae24b3c07232e9d3dc0 100644
--- a/tensorflow/contrib/lite/kernels/comparisons_test.cc
+++ b/tensorflow/contrib/lite/kernels/comparisons_test.cc
@@ -23,6 +23,139 @@ namespace {
 
 using ::testing::ElementsAreArray;
 
+class GreaterOpModel : public SingleOpModel {
+ public:
+  GreaterOpModel(std::initializer_list<int> input1_shape,
+                 std::initializer_list<int> input2_shape,
+                 TensorType input_type) {
+    input1_ = AddInput(input_type);
+    input2_ = AddInput(input_type);
+    output_ = AddOutput(TensorType_BOOL);
+    SetBuiltinOp(BuiltinOperator_GREATER, BuiltinOptions_GreaterOptions,
+                 CreateGreaterOptions(builder_).Union());
+    BuildInterpreter({input1_shape, input2_shape});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+  std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(ComparisonsTest, GreaterFloat) {
+  GreaterOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
+  model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ComparisonsTest, GreaterInt) {
+  GreaterOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, false, false}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ComparisonsTest, GreaterBroadcast) {
+  GreaterOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {7});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, false, false}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ComparisonsTest, GreaterBroadcastTwoD) {
+  GreaterOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8});
+  model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false,
+                                                   false, true, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4}));
+}
+
+class GreaterEqualOpModel : public SingleOpModel {
+ public:
+  GreaterEqualOpModel(std::initializer_list<int> input1_shape,
+                      std::initializer_list<int> input2_shape,
+                      TensorType input_type) {
+    input1_ = AddInput(input_type);
+    input2_ = AddInput(input_type);
+    output_ = AddOutput(TensorType_BOOL);
+    SetBuiltinOp(BuiltinOperator_GREATER_EQUAL,
+                 BuiltinOptions_GreaterEqualOptions,
+                 CreateGreaterEqualOptions(builder_).Union());
+    BuildInterpreter({input1_shape, input2_shape});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+  std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(ComparisonsTest, GreaterEqualFloat) {
+  GreaterEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
+  model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, true, true, false}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ComparisonsTest, GreaterEqualInt) {
+  GreaterEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ComparisonsTest, GreaterEqualBroadcast) {
+  GreaterEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {7});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ComparisonsTest, GreaterEqualBroadcastTwoD) {
+  GreaterEqualOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8});
+  model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false,
+                                                   false, true, true, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4}));
+}
+
 class LessOpModel : public SingleOpModel {
  public:
   LessOpModel(std::initializer_list<int> input1_shape,
@@ -47,7 +180,7 @@ class LessOpModel : public SingleOpModel {
   int output_;
 };
 
-TEST(ArgMaxOpTest, LessFloat) {
+TEST(ComparisonsTest, LessFloat) {
   LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32);
   model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
   model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
@@ -57,7 +190,7 @@ TEST(ArgMaxOpTest, LessFloat) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
 }
 
-TEST(ArgMaxOpTest, LessInt) {
+TEST(ComparisonsTest, LessInt) {
   LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
   model.PopulateTensor<int>(model.input2(), {1, 2, 6, 5});
@@ -67,7 +200,7 @@ TEST(ArgMaxOpTest, LessInt) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
 }
 
-TEST(ArgMaxOpTest, LessBroadcast) {
+TEST(ComparisonsTest, LessBroadcast) {
   LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
   model.PopulateTensor<int>(model.input2(), {7});
@@ -77,7 +210,7 @@ TEST(ArgMaxOpTest, LessBroadcast) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
 }
 
-TEST(ArgMaxOpTest, LessBroadcastTwoD) {
+TEST(ComparisonsTest, LessBroadcastTwoD) {
   LessOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 6, 8});
   model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
@@ -88,6 +221,72 @@ TEST(ArgMaxOpTest, LessBroadcastTwoD) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4}));
 }
 
+class LessEqualOpModel : public SingleOpModel {
+ public:
+  LessEqualOpModel(std::initializer_list<int> input1_shape,
+                   std::initializer_list<int> input2_shape,
+                   TensorType input_type) {
+    input1_ = AddInput(input_type);
+    input2_ = AddInput(input_type);
+    output_ = AddOutput(TensorType_BOOL);
+    SetBuiltinOp(BuiltinOperator_LESS_EQUAL, BuiltinOptions_LessEqualOptions,
+                 CreateLessEqualOptions(builder_).Union());
+    BuildInterpreter({input1_shape, input2_shape});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+  std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(ComparisonsTest, LessEqualFloat) {
+  LessEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
+  model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ComparisonsTest, LessEqualInt) {
+  LessEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, true, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ComparisonsTest, LessEqualBroadcast) {
+  LessEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {7});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, true, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ComparisonsTest, LessEqualBroadcastTwoD) {
+  LessEqualOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8});
+  model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true,
+                                                   true, false, true, false}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 3b467b3aa284586ab8e67ede55583adffbe06cc7..0b35a220e783572985206e918f3fcd8361f16790 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -212,8 +212,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     } else {
       TF_LITE_ENSURE_EQ(context, bias->type, data_type);
     }
-    TF_LITE_ENSURE_EQ(context, bias->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, bias->dims->data[0], filter->dims->data[0]);
+    TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0));
   }
 
   int channels_out = filter->dims->data[0];
@@ -489,7 +488,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                  bias, im2col, hwcn_weights, output);
       break;
     default:
-      context->ReportError(context, "Type not currently supported.");
+      context->ReportError(context, "Type %d not currently supported.",
+                           input->type);
       return kTfLiteError;
   }
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
index eeda1bc3c5ba2da5b6546ce36925a6f20fc9cbae..abb2549f85164e40875456fb732716e53b263127 100644
--- a/tensorflow/contrib/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
@@ -83,9 +83,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   bool hasBias = NumInputs(node) == 3;
 
   TF_LITE_ENSURE(context, hasBias || NumInputs(node) == 2);
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-  TfLiteTensor* bias = nullptr;
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  const TfLiteTensor* bias = nullptr;
 
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
@@ -169,8 +169,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 template <KernelType kernel_type>
 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
                TfLiteDepthwiseConvParams* params, OpData* data,
-               TfLiteTensor* input, TfLiteTensor* filter, TfLiteTensor* bias,
-               TfLiteTensor* output) {
+               const TfLiteTensor* input, const TfLiteTensor* filter,
+               const TfLiteTensor* bias, TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
@@ -196,8 +196,8 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                    TfLiteDepthwiseConvParams* params, OpData* data,
-                   TfLiteTensor* input, TfLiteTensor* filter,
-                   TfLiteTensor* bias, TfLiteTensor* output) {
+                   const TfLiteTensor* input, const TfLiteTensor* filter,
+                   const TfLiteTensor* bias, TfLiteTensor* output) {
   auto input_offset = -input->params.zero_point;
   auto filter_offset = -filter->params.zero_point;
   auto output_offset = output->params.zero_point;
@@ -230,9 +230,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-  TfLiteTensor* bias =
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  const TfLiteTensor* bias =
       (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
 
   // TODO(aselle): Consider whether float conv and quantized conv should be
@@ -247,7 +247,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                  bias, output);
       break;
     default:
-      context->ReportError(context, "Type not currently supported.");
+      context->ReportError(context, "Type %d not currently supported.",
+                           input->type);
       return kTfLiteError;
   }
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/dequantize.cc b/tensorflow/contrib/lite/kernels/dequantize.cc
index e685f2465f627cf30e02564e6f16e1ec69e208e2..672b2170e4990f0a7ca9755071d9d086f5ae5c2b 100644
--- a/tensorflow/contrib/lite/kernels/dequantize.cc
+++ b/tensorflow/contrib/lite/kernels/dequantize.cc
@@ -32,7 +32,7 @@ struct OpContext {
     input = GetInput(context, node, 0);
     output = GetOutput(context, node, 0);
   }
-  TfLiteTensor* input;
+  const TfLiteTensor* input;
   TfLiteTensor* output;
 };
 
diff --git a/tensorflow/contrib/lite/kernels/div.cc b/tensorflow/contrib/lite/kernels/div.cc
index ec380c8e4956e5bcd0d7559bfd8f89a52d9d233c..d264821e30cf622ff5d3d8ad513add46caa9e7ae 100644
--- a/tensorflow/contrib/lite/kernels/div.cc
+++ b/tensorflow/contrib/lite/kernels/div.cc
@@ -57,8 +57,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
@@ -80,7 +80,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 template <KernelType kernel_type>
 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
                TfLiteDivParams* params, const OpData* data,
-               TfLiteTensor* input1, TfLiteTensor* input2,
+               const TfLiteTensor* input1, const TfLiteTensor* input2,
                TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
@@ -106,22 +106,21 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
 #undef TF_LITE_DIV
 }
 
-
-
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteDivParams*>(node->builtin_data);
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
-  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
     EvalFloat<kernel_type>(context, node, params, data, input1, input2, output);
   } else {
-    context->ReportError(context,
-                         "Div only supports FLOAT32 and quantized UINT8 now.");
+    context->ReportError(
+        context, "Div only supports FLOAT32 and quantized UINT8 now, got %d.",
+        output->type);
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/elementwise.cc b/tensorflow/contrib/lite/kernels/elementwise.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0bd504695074011efd946f4c4d1f8d4854e82730
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/elementwise.cc
@@ -0,0 +1,68 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace elementwise {
+
+TfLiteStatus SinPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  // Quantized float is not supported yet.
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
+TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      size_t elements = NumElements(input);
+      const float* in = GetTensorData<float>(input);
+      const float* in_end = in + elements;
+      float* out = output->data.f;
+      for (; in < in_end; in++, out++) *out = std::sin(*in);
+      return kTfLiteOk;
+    }
+    default: {
+      context->ReportError(context, "Input type is %d, requires float32",
+                           input->type);
+      return kTfLiteError;
+    }
+  }
+}
+
+}  // namespace elementwise
+
+TfLiteRegistration* Register_SIN() {
+  static TfLiteRegistration r = {nullptr, nullptr, elementwise::SinPrepare,
+                                 elementwise::SinEval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/elementwise_test.cc b/tensorflow/contrib/lite/kernels/elementwise_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..412ffb04b90fbc24d232d25d2a86ce639752c3e8
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/elementwise_test.cc
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class SinOpModel : public SingleOpModel {
+ public:
+  SinOpModel(std::initializer_list<int> input_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_SIN, BuiltinOptions_NONE, 0);
+    BuildInterpreter({input_shape});
+  }
+
+  int input() const { return input_; }
+  int output() const { return output_; }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(ElementWise, Sin) {
+  SinOpModel m({1, 1, 4, 1});
+  m.PopulateTensor<float>(m.input(), {0, 3.1415926, -3.1415926, 1});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray(ArrayFloatNear({0, 0, 0, 0.84147})));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup.cc b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
index 4e8cb396d43a58f94b08eb8dd8b05d16fd74fd2f..7539c0b30ded921df957217bebdc7b20ea4b40b4 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
@@ -51,11 +51,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* lookup = GetInput(context, node, 0);
+  const TfLiteTensor* lookup = GetInput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, NumDimensions(lookup), 1);
   TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32);
 
-  TfLiteTensor* value = GetInput(context, node, 1);
+  const TfLiteTensor* value = GetInput(context, node, 1);
   TF_LITE_ENSURE(context, NumDimensions(value) >= 2);
 
   TfLiteTensor* output = GetOutput(context, node, 0);
@@ -71,8 +71,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
-  TfLiteTensor* lookup = GetInput(context, node, 0);
-  TfLiteTensor* value = GetInput(context, node, 1);
+  const TfLiteTensor* lookup = GetInput(context, node, 0);
+  const TfLiteTensor* value = GetInput(context, node, 1);
 
   const int row_size = SizeOfDimension(value, 0);
   const int row_bytes = value->bytes / row_size;
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc
index 6c770e7f71efe83eace3640c47e03e0c7ab19e20..d3be36993c3843cf928bf458ae2c8019df7ccf31 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc
@@ -81,19 +81,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 5);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* ids = GetInput(context, node, 0);
+  const TfLiteTensor* ids = GetInput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, NumDimensions(ids), 1);
   TF_LITE_ENSURE_EQ(context, ids->type, kTfLiteInt32);
 
-  TfLiteTensor* indices = GetInput(context, node, 1);
+  const TfLiteTensor* indices = GetInput(context, node, 1);
   TF_LITE_ENSURE_EQ(context, NumDimensions(indices), 2);
   TF_LITE_ENSURE_EQ(context, indices->type, kTfLiteInt32);
 
-  TfLiteTensor* shape = GetInput(context, node, 2);
+  const TfLiteTensor* shape = GetInput(context, node, 2);
   TF_LITE_ENSURE_EQ(context, NumDimensions(shape), 1);
   TF_LITE_ENSURE_EQ(context, shape->type, kTfLiteInt32);
 
-  TfLiteTensor* weights = GetInput(context, node, 3);
+  const TfLiteTensor* weights = GetInput(context, node, 3);
   TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 1);
   TF_LITE_ENSURE_EQ(context, weights->type, kTfLiteFloat32);
 
@@ -102,7 +102,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, SizeOfDimension(indices, 0),
                     SizeOfDimension(weights, 0));
 
-  TfLiteTensor* value = GetInput(context, node, 4);
+  const TfLiteTensor* value = GetInput(context, node, 4);
   TF_LITE_ENSURE(context, NumDimensions(value) >= 2);
 
   // Mark the output as a dynamic tensor.
@@ -139,11 +139,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params =
       reinterpret_cast<TfLiteEmbeddingLookupSparseParams*>(node->builtin_data);
   TfLiteTensor* output = GetOutput(context, node, 0);
-  TfLiteTensor* ids = GetInput(context, node, 0);
-  TfLiteTensor* indices = GetInput(context, node, 1);
-  TfLiteTensor* dense_shape = GetInput(context, node, 2);
-  TfLiteTensor* weights = GetInput(context, node, 3);
-  TfLiteTensor* value = GetInput(context, node, 4);
+  const TfLiteTensor* ids = GetInput(context, node, 0);
+  const TfLiteTensor* indices = GetInput(context, node, 1);
+  const TfLiteTensor* dense_shape = GetInput(context, node, 2);
+  const TfLiteTensor* weights = GetInput(context, node, 3);
+  const TfLiteTensor* value = GetInput(context, node, 4);
 
   const int lookup_rank = SizeOfDimension(indices, 1);
   const int embedding_rank = NumDimensions(value);
diff --git a/tensorflow/contrib/lite/kernels/exp.cc b/tensorflow/contrib/lite/kernels/exp.cc
index a9e79b742dc2c80ce4ed9a3aa786814265dcb660..ce03cdfe26cac861837c4d534a083787e149cef0 100644
--- a/tensorflow/contrib/lite/kernels/exp.cc
+++ b/tensorflow/contrib/lite/kernels/exp.cc
@@ -36,7 +36,7 @@ struct ExpContext {
     input = GetInput(context, node, 0);
     output = GetOutput(context, node, 0);
   }
-  TfLiteTensor* input;
+  const TfLiteTensor* input;
   TfLiteTensor* output;
 };
 
diff --git a/tensorflow/contrib/lite/kernels/floor.cc b/tensorflow/contrib/lite/kernels/floor.cc
index 4b4395f711614a3b7047dc8f144ca3fa36b8a89b..697b777693e275e36d56f7865c8a3638071591a0 100644
--- a/tensorflow/contrib/lite/kernels/floor.cc
+++ b/tensorflow/contrib/lite/kernels/floor.cc
@@ -27,7 +27,7 @@ constexpr int kInputTensor = 0;
 constexpr int kOutputTensor = 0;
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -38,7 +38,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   optimized_ops::Floor(GetTensorData<float>(input), GetTensorDims(input),
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
index 470b52b7bc4e65596ba1a3f4f8c036819dcaad28..3374923e6e0f2f4520d2d67e698ae5b4fd8e5443 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected.cc
@@ -89,9 +89,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, node->inputs->size, 3);
   TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
-  TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   // Check all the parameters of tensor match within themselves and match the
@@ -106,11 +106,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TF_LITE_ASSERT_EQ(input_size, batch_size * filter->dims->data[1]);
   if (bias) {
-    TF_LITE_ASSERT_EQ(bias->dims->data[0], num_units);
+    TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0));
   }
 
   TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 2);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(bias), 1);
 
   // Note that quantized inference requires that all tensors have their
   // parameters set. This is usually done during quantized training.
@@ -158,8 +157,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
 TfLiteStatus EvalPie(TfLiteContext* context, TfLiteNode* node,
                      TfLiteFullyConnectedParams* params, OpData* data,
-                     TfLiteTensor* input, TfLiteTensor* filter,
-                     TfLiteTensor* bias, TfLiteTensor* output) {
+                     const TfLiteTensor* input, const TfLiteTensor* filter,
+                     const TfLiteTensor* bias, TfLiteTensor* output) {
   int total_input_size = 1;
   for (int i = 0; i < input->dims->size; i++) {
     total_input_size *= input->dims->data[i];
@@ -191,8 +190,10 @@ TfLiteStatus EvalPie(TfLiteContext* context, TfLiteNode* node,
 
 TfLiteStatus EvalPieQuantized(TfLiteContext* context, TfLiteNode* node,
                               TfLiteFullyConnectedParams* params, OpData* data,
-                              TfLiteTensor* input, TfLiteTensor* filter,
-                              TfLiteTensor* bias, TfLiteTensor* input_quantized,
+                              const TfLiteTensor* input,
+                              const TfLiteTensor* filter,
+                              const TfLiteTensor* bias,
+                              TfLiteTensor* input_quantized,
                               TfLiteTensor* output) {
   // Check the types for this hybrid Op.
   TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
@@ -271,8 +272,9 @@ TfLiteStatus EvalPieQuantized(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            TfLiteFullyConnectedParams* params, OpData* data,
-                           TfLiteTensor* input, TfLiteTensor* filter,
-                           TfLiteTensor* bias, TfLiteTensor* output) {
+                           const TfLiteTensor* input,
+                           const TfLiteTensor* filter, const TfLiteTensor* bias,
+                           TfLiteTensor* output) {
   gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
 
   int32_t input_offset = -input->params.zero_point;
@@ -311,8 +313,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
                        TfLiteFullyConnectedParams* params, OpData* data,
-                       TfLiteTensor* input, TfLiteTensor* filter,
-                       TfLiteTensor* bias, TfLiteTensor* output) {
+                       const TfLiteTensor* input, const TfLiteTensor* filter,
+                       const TfLiteTensor* bias, TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
@@ -342,9 +344,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
-  TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   switch (filter->type) {  // Already know in/out types are same.
@@ -355,7 +357,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return EvalQuantized<kernel_type>(context, node, params, data, input,
                                         filter, bias, output);
     default:
-      context->ReportError(context, "Type not currently supported.");
+      context->ReportError(context, "Type %d not currently supported.",
+                           filter->type);
       return kTfLiteError;
   }
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/gather.cc b/tensorflow/contrib/lite/kernels/gather.cc
index 0e4187d1eac64636a2e2b25e9a1cc45c3a4da557..6a2341461f2c627c78bd4783ee27579b59b5fde3 100644
--- a/tensorflow/contrib/lite/kernels/gather.cc
+++ b/tensorflow/contrib/lite/kernels/gather.cc
@@ -35,8 +35,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   const auto* params =
       reinterpret_cast<const TfLiteGatherParams*>(node->builtin_data);
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* positions = GetInput(context, node, kInputPositions);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* positions = GetInput(context, node, kInputPositions);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   // Only INT32 positions are supported.
   TF_LITE_ENSURE_EQ(context, positions->type, kTfLiteInt32);
@@ -59,8 +59,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1);
     } break;
     default:
-      context->ReportError(context,
-                           "Only float32 and string types are supported");
+      context->ReportError(
+          context, "Only float32 and string types are supported, got %d",
+          input->type);
       return kTfLiteError;
   }
   const int num_dimensions =
@@ -81,8 +82,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* positions = GetInput(context, node, kInputPositions);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* positions = GetInput(context, node, kInputPositions);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   const int input_rank = NumDimensions(input);
 #define TF_LITE_GATHER(data_type, index_type)                            \
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
index 3b82601d119b2e4946db6e3577300168c7e710b6..41211d41aa85a5a2da6ae96dc6f0337c54fb1a45 100644
--- a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
+++ b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
@@ -60,15 +60,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 2);
 
-  TfLiteTensor* lookup = GetInput(context, node, 0);
+  const TfLiteTensor* lookup = GetInput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, NumDimensions(lookup), 1);
   TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32);
 
-  TfLiteTensor* key = GetInput(context, node, 1);
+  const TfLiteTensor* key = GetInput(context, node, 1);
   TF_LITE_ENSURE_EQ(context, NumDimensions(key), 1);
   TF_LITE_ENSURE_EQ(context, key->type, kTfLiteInt32);
 
-  TfLiteTensor* value = GetInput(context, node, 2);
+  const TfLiteTensor* value = GetInput(context, node, 2);
   TF_LITE_ENSURE(context, NumDimensions(value) >= 1);
   TF_LITE_ENSURE_EQ(context, SizeOfDimension(key, 0),
                     SizeOfDimension(value, 0));
@@ -102,9 +102,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   TfLiteTensor* hits = GetOutput(context, node, 1);
-  TfLiteTensor* lookup = GetInput(context, node, 0);
-  TfLiteTensor* key = GetInput(context, node, 1);
-  TfLiteTensor* value = GetInput(context, node, 2);
+  const TfLiteTensor* lookup = GetInput(context, node, 0);
+  const TfLiteTensor* key = GetInput(context, node, 1);
+  const TfLiteTensor* value = GetInput(context, node, 2);
 
   const int num_rows = SizeOfDimension(value, 0);
   const int row_bytes = value->bytes / num_rows;
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index df29172f83a4b1ea07a06a963eef7b299738971d..aabbb0685c5f9160cd863952cb47526215dc31a6 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -5,6 +5,7 @@ package(default_visibility = [
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 tflite_deps_intel = [
     "@arm_neon_2_x86_sse",
@@ -157,6 +158,7 @@ cc_library(
         ":quantization_util",
         ":strided_slice_logic",
         ":types",
+        ":reference_base",
         ":round",
         "//third_party/eigen3",
         "@gemmlowp",
@@ -300,6 +302,8 @@ cc_library(
     name = "neon_tensor_utils",
     srcs = [
         "optimized/neon_tensor_utils.cc",
+        "reference/portable_tensor_utils.cc",
+        "reference/portable_tensor_utils.h",
     ],
     hdrs = [
         "common.h",
@@ -311,11 +315,11 @@ cc_library(
     copts = NEON_FLAGS_IF_APPLICABLE + HARD_FP_FLAGS_IF_APPLICABLE,
     deps = [
         ":cpu_check",
-        ":portable_tensor_utils",
         ":round",
         ":types",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite/kernels:activation_functor",
+        "//tensorflow/contrib/lite/kernels:op_macros",
         "@arm_neon_2_x86_sse",
         "@gemmlowp",
     ],
@@ -386,6 +390,9 @@ cc_library(
         ":armv7a": [
             ":neon_tensor_utils",
         ],
+        ":haswell": [
+            ":neon_tensor_utils",
+        ],
         ":ios_armv7": [
             ":neon_tensor_utils",
         ],
@@ -424,6 +431,7 @@ cc_test(
         "//conditions:default": [],
     }),
     linkstatic = 1,
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":tensor_utils",
         "//tensorflow/contrib/lite:builtin_op_data",
@@ -458,3 +466,5 @@ cc_test(
 )
 
 exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
+
+tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/kernels/internal/common.h b/tensorflow/contrib/lite/kernels/internal/common.h
index e126774ebc866fee7476e2ba98a5863085965215..4d3803a07fbdaf9d0d27664037c219715884ee76 100644
--- a/tensorflow/contrib/lite/kernels/internal/common.h
+++ b/tensorflow/contrib/lite/kernels/internal/common.h
@@ -115,6 +115,20 @@ inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
                              right_shift);
 }
 
+template <typename T>
+int CountLeadingZeros(T integer_input) {
+  static_assert(std::is_unsigned<T>::value,
+                "Only unsigned integer types handled.");
+  const T one_in_leading_positive = static_cast<T>(1)
+                                    << (std::numeric_limits<T>::digits - 1);
+  int leading_zeros = 0;
+  while (integer_input < one_in_leading_positive) {
+    integer_input <<= 1;
+    ++leading_zeros;
+  }
+  return leading_zeros;
+}
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
index f142374269606bdd3d4184af013749102666ab89..5f9cfc450db1c25ff604f99f93481e5ca590a5a2 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
+
+#include <algorithm>
+
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
 
 namespace tflite {
@@ -40,6 +44,76 @@ void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                                         hidden_state_ptr_batch);
 }
 
+void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
+                  float input_weights_scale,
+                  const int8_t* recurrent_weights_ptr,
+                  float recurrent_weights_scale, const float* bias_ptr,
+                  int input_size, int num_units, int batch_size,
+                  TfLiteFusedActivation activation,
+                  int8_t* quantized_input_ptr_batch,
+                  int8_t* quantized_hidden_state_ptr_batch,
+                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
+  // Output = bias
+  tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
+                                        output_ptr_batch);
+
+  // TODO(mirkov): change std::minmax_element with a vectorized call.
+  auto minmax_element = std::minmax_element(
+      input_ptr_batch, input_ptr_batch + batch_size * input_size);
+
+  // Save quantization and matmul computation for all zero input.
+  if (!(*minmax_element.first == 0.0 && *minmax_element.second == 0.0)) {
+    // Quantize input from float to uint8 + quantization params (scaling
+    // factor).
+    float unused_min, unused_max;
+    float* scaling_factors = new float[batch_size];
+    for (int b = 0; b < batch_size; ++b) {
+      const int offset = b * input_size;
+      tensor_utils::SymmetricQuantizeFloats(
+          input_ptr_batch + offset, input_size,
+          quantized_input_ptr_batch + offset, &unused_min, &unused_max,
+          &scaling_factors[b]);
+      scaling_factors[b] *= input_weights_scale;
+    }
+
+    // Output += input * input_weights
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_weights_ptr, num_units, input_size, quantized_input_ptr_batch,
+        scaling_factors, batch_size, output_ptr_batch, /*result_stride=*/1);
+    delete[] scaling_factors;
+  }
+
+  minmax_element = std::minmax_element(
+      hidden_state_ptr_batch, hidden_state_ptr_batch + batch_size * num_units);
+  // Save quantization and matmul computation for all zero input.
+  if (!(*minmax_element.first == 0.0 && *minmax_element.second == 0.0)) {
+    // Quantize hidden_state
+    float unused_min, unused_max;
+    float* scaling_factors = new float[batch_size];
+    for (int b = 0; b < batch_size; ++b) {
+      const int offset = b * num_units;
+      tensor_utils::SymmetricQuantizeFloats(
+          hidden_state_ptr_batch + offset, num_units,
+          quantized_hidden_state_ptr_batch + offset, &unused_min, &unused_max,
+          &scaling_factors[b]);
+      scaling_factors[b] *= recurrent_weights_scale;
+    }
+
+    // Output += recurrent_weights * hidden_state
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_weights_ptr, num_units, num_units,
+        quantized_hidden_state_ptr_batch, scaling_factors, batch_size,
+        output_ptr_batch, /*result_stride=*/1);
+    delete[] scaling_factors;
+  }
+
+  // Output = activation(Output) and update hidden_state
+  tensor_utils::ApplyActivationToVector(
+      output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
+  tensor_utils::VectorBatchVectorAssign(output_ptr_batch, num_units, batch_size,
+                                        hidden_state_ptr_batch);
+}
+
 void LstmStep(
     const float* input_ptr_batch, const float* input_to_input_weights_ptr,
     const float* input_to_forget_weights_ptr,
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
index 3ec60ee57a87833959a34ba95d32df15bea188a4..cbfbcbeefcd34fa732799d89f52791b18855857d 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
@@ -35,6 +35,23 @@ void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                   TfLiteFusedActivation activation,
                   float* hidden_state_ptr_batch, float* output_ptr_batch);
 
+// Performs a quantized RNN batch inference step. Same as above, but for
+// quantization purposes, we also pass in quantized_hidden_state_ptr_batch and
+// quantized_input_ptr_batch pointers for temporary storage of the quantized
+// values of hidden_state_ptr_batch and input_ptr_batch, respectively.
+// These temporary storages are expected to be preallocated to the same size as
+// the respective pointers.
+// {input,recurrent}_weights_scale params are used for dequantization/recovery.
+void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
+                  float input_weights_scale,
+                  const int8_t* recurrent_weights_ptr,
+                  float recurrent_weights_scale, const float* bias_ptr,
+                  int input_size, int num_units, int batch_size,
+                  TfLiteFusedActivation activation,
+                  int8_t* quantized_input_ptr_batch,
+                  int8_t* quantized_hidden_state_ptr_batch,
+                  float* hidden_state_ptr_batch, float* output_ptr_batch);
+
 // Performs an LSTM batch inference step for input specified by input_ptr_batch.
 // The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
 // biases (*_bias_ptr), and buffers (*_scratch), along with additional
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
index 0bfb4e9b1f8ee4167cfb629645a38538be1d73d4..27d9224512a835ea58911031f1b4d6dcf5482ba9 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
@@ -129,8 +129,9 @@ class EigenTensorConvFunctor {
       const int conv_width = output_height * output_width;
       Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
       dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
-      EigenMatrix output(output_data, conv_width, filter_count);
-      ConstEigenMatrix input(input_data, conv_width, input_depth);
+      EigenMatrix output(output_data, input_batches * conv_width, filter_count);
+      ConstEigenMatrix input(input_data, input_batches * conv_width,
+                             input_depth);
       ConstEigenMatrix filter(filter_data, input_depth, filter_count);
       MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input,
                                                       filter, dim_pair);
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
index 65f25168e3a20748973549c2f7385b14863294eb..08f7cfa5a5f9453cd187164078898e754126da52 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -56,9 +56,12 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
       m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
 
   // The arrays used to cache the vector.
+  void* aligned_vector_cache_free = nullptr;
   float32x4_t* vector_cache_float32x4 =
-      new float32x4_t[(m_cols / kFloatWeightsPerNeonLane) *
-                      sizeof(float32x4_t)];
+      reinterpret_cast<float32x4_t*>(aligned_alloc(
+          sizeof(float32x4_t), (postamble_start >> 2) * sizeof(float32x4_t),
+          &aligned_vector_cache_free));
+
   const int kUnrollSize = 2;
   for (int b = 0; b < n_batch; b++) {
     float* result_in_batch = result + b * m_rows * result_stride;
@@ -71,7 +74,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
       matrix_ptr1 = matrix + m_cols;
     }
 
-    // Cahce the vector.
+    // Cache the vector.
     for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
       vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c);
     }
@@ -128,7 +131,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
       result_in_batch += result_stride;
     }
   }
-  delete[] vector_cache_float32x4;
+  free(aligned_vector_cache_free);
 }
 
 void NeonMatrixBatchVectorMultiplyAccumulate(
@@ -294,9 +297,12 @@ void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
       v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
 
   // The arrays used to cache the vector.
+  void* aligned_vector_cache_free = nullptr;
   float32x4_t* vector_cache_float32x4 =
-      new float32x4_t[(v_size / kFloatWeightsPerNeonLane) *
-                      sizeof(float32x4_t)];
+      reinterpret_cast<float32x4_t*>(aligned_alloc(
+          sizeof(float32x4_t), (postamble_start >> 2) * sizeof(float32x4_t),
+          &aligned_vector_cache_free));
+
   for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
     vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v);
   }
@@ -322,7 +328,7 @@ void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
     result_ptr += v_size;
     batch_vector_ptr += v_size;
   }
-  delete[] vector_cache_float32x4;
+  free(aligned_vector_cache_free);
 }
 
 void NeonSub1Vector(const float* vector, int v_size, float* result) {
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index c3dc90b844a87a2e24e76c7e77306c2439fc5dfa..b33e4b5017d6ace16799fdea9b4d08dc6c110660 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -31,6 +31,7 @@ limitations under the License.
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 #include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
@@ -38,6 +39,18 @@ limitations under the License.
 namespace tflite {
 namespace optimized_ops {
 
+// Unoptimized reference ops:
+using reference_ops::BroadcastGreater;
+using reference_ops::BroadcastGreaterEqual;
+using reference_ops::BroadcastLess;
+using reference_ops::BroadcastLessEqual;
+using reference_ops::Greater;
+using reference_ops::GreaterEqual;
+using reference_ops::Less;
+using reference_ops::LessEqual;
+using reference_ops::RankOneSelect;
+using reference_ops::Select;
+
 // Make a local VectorMap typedef allowing to map a float array
 // as a Eigen vector expression. The std::conditional here is to
 // construct the suitable Eigen type for the constness of the
@@ -54,7 +67,7 @@ using VectorMap = typename std::conditional<
 
 template <typename Scalar, int N>
 VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) {
-  const int size = RequiredBufferSizeForDims(dims);
+  const int size = FlatSize(dims);
   return VectorMap<Scalar>(data, size, 1);
 }
 
@@ -236,8 +249,8 @@ inline void AddBiasAndEvalActivationFunction(const float* bias_data,
                                              float output_activation_max) {
 #ifdef USE_NEON
   gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
-  const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3];
-  const int array_size = array_dims.sizes[3] * array_dims.strides[3];
+  const int bias_size = FlatSize(bias_dims);
+  const int array_size = FlatSize(array_dims);
   TFLITE_DCHECK_EQ((array_size % bias_size), 0);
   float* array_ptr = array_data;
   float* array_end_ptr = array_ptr + array_size;
@@ -287,8 +300,8 @@ inline void AddBiasAndEvalActivationFunction(const float* bias_data,
   }
 #else  // not NEON
   gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
-  const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3];
-  const int array_size = array_dims.sizes[3] * array_dims.strides[3];
+  const int bias_size = FlatSize(bias_dims);
+  const int array_size = FlatSize(array_dims);
   TFLITE_DCHECK_EQ((array_size % bias_size), 0);
   for (int array_offset = 0; array_offset < array_size;
        array_offset += bias_size) {
@@ -359,10 +372,8 @@ inline void GEMVForLstmCell(const uint8* input_data, const Dims<4>& input_dims,
   TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
-                       ArraySize(output_dims, 3),
-                   1);
-  const int input_size = input_dims.strides[3];
+  TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_dims, 0), 1);
+  const int input_size = FlatSizeSkipDim(input_dims, 3);
   const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0);
   // This special fast path for quantized LSTM cells does not try to support
   // odd sizes that we haven't encountered in any LSTM cell, that would
@@ -545,10 +556,8 @@ inline void GEMVForLstmCellWithSymmetricRange(
   TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
-                       ArraySize(output_dims, 3),
-                   1);
-  const int input_size = input_dims.strides[3];
+  TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_dims, 0), 1);
+  const int input_size = FlatSizeSkipDim(input_dims, 3);
   const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0);
   // This special fast path for quantized LSTM cells does not try to support
   // odd sizes that we haven't encountered in any LSTM cell, that would
@@ -881,10 +890,8 @@ inline void FullyConnectedAsGEMV(
   TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
-                       ArraySize(output_dims, 3),
-                   1);
-  const int input_size = input_dims.strides[3];
+  TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_dims, 0), 1);
+  const int input_size = FlatSizeSkipDim(input_dims, 3);
   const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
   static constexpr int kPeel = 4;
   for (int k = 0; k < input_size; k += 64) {
@@ -1065,8 +1072,7 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
   // but the current --variable_batch hack consists in overwriting the 3rd
   // dimension with the runtime batch size, as we don't keep track for each
   // array of which dimension is the batch dimension in it.
-  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
-                      ArraySize(output_dims, 3);
+  const int batches = FlatSizeSkipDim(output_dims, 0);
 #ifdef USE_NEON
   const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
   if (batches == 1 && !(output_size % 4)) {
@@ -1122,8 +1128,7 @@ inline void FullyConnected(
   // but the current --variable_batch hack consists in overwriting the 3rd
   // dimension with the runtime batch size, as we don't keep track for each
   // array of which dimension is the batch dimension in it.
-  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
-                      ArraySize(output_dims, 3);
+  const int batches = FlatSizeSkipDim(output_dims, 0);
   const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0);
   const int accum_depth = ArraySize(filter_dims, 0);
   TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
@@ -1538,8 +1543,7 @@ inline void ExperimentalShuffledFullyConnected(
   // but the current --variable_batch hack consists in overwriting the 3rd
   // dimension with the runtime batch size, as we don't keep track for each
   // array of which dimension is the batch dimension in it.
-  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
-                      ArraySize(output_dims, 3);
+  const int batches = FlatSizeSkipDim(output_dims, 0);
   const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
   const int accum_depth = ArraySize(weights_dims, 0);
   TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
@@ -1975,15 +1979,11 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
   }
 
   const int gemm_input_rows = gemm_input_dims->sizes[0];
-  const int gemm_input_cols = gemm_input_dims->sizes[1] *
-                              gemm_input_dims->sizes[2] *
-                              gemm_input_dims->sizes[3];
+  const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_dims, 0);
   const int filter_rows = filter_dims.sizes[3];
-  const int filter_cols =
-      filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
+  const int filter_cols = FlatSizeSkipDim(filter_dims, 3);
   const int output_rows = output_dims.sizes[0];
-  const int output_cols =
-      output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
+  const int output_cols = FlatSizeSkipDim(output_dims, 0);
   TFLITE_DCHECK_EQ(output_rows, filter_rows);
   TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
   TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
@@ -2137,14 +2137,11 @@ void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims,
                     Ac == FusedActivationFunctionType::kRelu1,
                 "");
   const int input_rows = input_dims.sizes[0];
-  const int input_cols =
-      input_dims.sizes[1] * input_dims.sizes[2] * input_dims.sizes[3];
+  const int input_cols = FlatSizeSkipDim(input_dims, 0);
   const int filter_rows = filter_dims.sizes[3];
-  const int filter_cols =
-      filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
+  const int filter_cols = FlatSizeSkipDim(filter_dims, 3);
   const int output_rows = output_dims.sizes[0];
-  const int output_cols =
-      output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
+  const int output_cols = FlatSizeSkipDim(output_dims, 0);
   TFLITE_DCHECK_EQ(output_rows, filter_rows);
   TFLITE_DCHECK_EQ(output_cols, input_cols);
   TFLITE_DCHECK_EQ(filter_cols, input_rows);
@@ -2208,27 +2205,15 @@ void NonGlobalBatchNormalization(
     const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization");
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
-                        offset_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
-                        offset_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
-                        offset_dims, 0, output_dims, 0);
+  const int inner_size = MatchingFlatSizeSkipDim(
+      input_dims, 3, mean_dims, multiplier_dims, offset_dims, output_dims);
 
   for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, x, y, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
-              offset_data[Offset(offset_dims, c, x, y, 0)]);
-        }
-      }
+    for (int i = 0; i < inner_size; ++i) {
+      *output_data = ActivationFunction<Ac>(
+          (*input_data - mean_data[i]) * multiplier_data[i] + offset_data[i]);
+      ++output_data;
+      ++input_data;
     }
   }
 }
@@ -2243,24 +2228,17 @@ void GlobalBatchNormalization(const float* input_data,
                               const Dims<4>& offset_dims, float* output_data,
                               const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth =
       MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                         offset_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
-              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
-        }
-      }
+  for (int i = 0; i < outer_size; ++i) {
+    for (int c = 0; c < depth; ++c) {
+      *output_data = ActivationFunction<Ac>(
+          (*input_data - mean_data[c]) * multiplier_data[c] + offset_data[c]);
+      ++output_data;
+      ++input_data;
     }
   }
 }
@@ -2277,44 +2255,26 @@ inline void Relu(const float* input_data, const Dims<4>& input_dims,
 inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 1;
-          const float lower = -1;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float upper = 1;
+    const float lower = -1;
+    const float clamped = val > upper ? upper : val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
 inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 6;
-          const float lower = 0;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float upper = 6;
+    const float lower = 0;
+    const float clamped = val > upper ? upper : val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
@@ -2323,24 +2283,19 @@ void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("L2Normalization");
   static_assert(Ac == FusedActivationFunctionType::kNone, "");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        float squared_l2_norm = 0;
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          squared_l2_norm += val * val;
-        }
-        float inverse_l2_norm = 1.0f / std::sqrt(squared_l2_norm);
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] * inverse_l2_norm;
-        }
-      }
+  for (int i = 0; i < outer_size; ++i) {
+    float squared_l2_norm = 0;
+    for (int c = 0; c < depth; ++c) {
+      const float val = input_data[c];
+      squared_l2_norm += val * val;
+    }
+    const float l2_norm = std::sqrt(squared_l2_norm);
+    for (int c = 0; c < depth; ++c) {
+      *output_data = *input_data / l2_norm;
+      ++output_data;
+      ++input_data;
     }
   }
 }
@@ -2394,15 +2349,11 @@ inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
                             int32 input_zero_point, uint8* output_data,
                             const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
   TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-  TFLITE_DCHECK_EQ(batches, 1);
-  TFLITE_DCHECK_EQ(height, 1);
-  TFLITE_DCHECK_EQ(width, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
+  TFLITE_DCHECK_EQ(outer_size, 1);
   int32 square_l2_norm = 0;
   for (int i = 0; i < depth; i++) {
     int32 diff = input_data[i] - input_zero_point;
@@ -2428,20 +2379,12 @@ inline void Add(const float* input1_data, const Dims<4>& input1_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Add");
-  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
-                                              output_dims, 3);
-  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
-                                             output_dims, 2);
-  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
-                                            output_dims, 1);
-  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
-                                            output_dims, 0);
   TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
 
   int i = 0;
-  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
+  const int size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
 #ifdef USE_NEON
   const auto activation_min = vdupq_n_f32(output_activation_min);
   const auto activation_max = vdupq_n_f32(output_activation_max);
@@ -2488,52 +2431,17 @@ inline void Add(const float* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Add(const float* input1_data, const Dims<4>& input1_dims,
-         const float* input2_data, const Dims<4>& input2_dims,
-         float* output_data, const Dims<4>& output_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
-      output_activation_max, output_data, output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-inline void Add(int left_shift, const uint8* input1_data,
-                const Dims<4>& input1_dims, int32 input1_offset,
-                int32 input1_multiplier, int input1_shift,
-                const uint8* input2_data, const Dims<4>& input2_dims,
-                int32 input2_offset, int32 input2_multiplier, int input2_shift,
-                int32 output_offset, int32 output_multiplier, int output_shift,
-                int32 output_activation_min, int32 output_activation_max,
-                uint8* output_data, const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  gemmlowp::ScopedProfilingLabel label("Add/8bit");
-  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
-                                              output_dims, 3);
-  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
-                                             output_dims, 2);
-  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
-                                            output_dims, 1);
-  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
-                                            output_dims, 0);
-  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-
+// Element-wise add that can often be used for inner loop of broadcast add as
+// well as the non-broadcast add.
+inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
+                           int32 input1_offset, int32 input1_multiplier,
+                           int input1_shift, const uint8* input2_data,
+                           int32 input2_offset, int32 input2_multiplier,
+                           int input2_shift, int32 output_offset,
+                           int32 output_multiplier, int output_shift,
+                           int32 output_activation_min,
+                           int32 output_activation_max, uint8* output_data) {
   int i = 0;
-  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
   TFLITE_DCHECK_GT(input1_offset, -256);
   TFLITE_DCHECK_GT(input2_offset, -256);
   TFLITE_DCHECK_LT(input1_offset, 256);
@@ -2612,6 +2520,54 @@ inline void Add(int left_shift, const uint8* input1_data,
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Add(const float* input1_data, const Dims<4>& input1_dims,
+         const float* input2_data, const Dims<4>& input2_dims,
+         float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
+      output_activation_max, output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void Add(int left_shift, const uint8* input1_data,
+                const Dims<4>& input1_dims, int32 input1_offset,
+                int32 input1_multiplier, int input1_shift,
+                const uint8* input2_data, const Dims<4>& input2_dims,
+                int32 input2_offset, int32 input2_multiplier, int input2_shift,
+                int32 output_offset, int32 output_multiplier, int output_shift,
+                int32 output_activation_min, int32 output_activation_max,
+                uint8* output_data, const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  gemmlowp::ScopedProfilingLabel label("Add/8bit");
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+  TFLITE_DCHECK_GT(input1_offset, -256);
+  TFLITE_DCHECK_GT(input2_offset, -256);
+  TFLITE_DCHECK_LT(input1_offset, 256);
+  TFLITE_DCHECK_LT(input2_offset, 256);
+  AddElementwise(flat_size, left_shift, input1_data, input1_offset,
+                 input1_multiplier, input1_shift, input2_data, input2_offset,
+                 input2_multiplier, input2_shift, output_offset,
+                 output_multiplier, output_shift, output_activation_min,
+                 output_activation_max, output_data);
+}
+
 template <FusedActivationFunctionType Ac>
 inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
                 int input1_shift, const int16* input2_data,
@@ -2632,9 +2588,7 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
     TFLITE_DCHECK_EQ(output_activation_max, 32767);
   }
 
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
 
   TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0);
   TFLITE_DCHECK_GE(input1_shift, 0);
@@ -2670,10 +2624,10 @@ void Add(const int32* input1_data, const Dims<4>& input1_dims,
   auto output_map = MapAsVector(output_data, output_dims);
   if (AreSameDims(input1_dims, input2_dims)) {
     output_map.array() = input1_map.array() + input2_map.array();
-  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
+  } else if (FlatSize(input2_dims) == 1) {
     auto scalar = input2_data[0];
     output_map.array() = input1_map.array() + scalar;
-  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
+  } else if (FlatSize(input1_dims) == 1) {
     auto scalar = input1_data[0];
     output_map.array() = scalar + input2_map.array();
   } else {
@@ -2822,27 +2776,11 @@ inline void BroadcastAddFivefold(
       input2_data_ptr = input2_data_reset;
       for (int i2 = 0; i2 < y2; ++i2) {
         for (int i1 = 0; i1 < y1; ++i1) {
-          for (int i0 = 0; i0 < y0; ++i0) {
-            const int32 input1_val = input1_offset + input1_data_ptr[i0];
-            const int32 input2_val = input2_offset + input2_data_ptr[i0];
-            const int32 shifted_input1_val = input1_val * (1 << left_shift);
-            const int32 shifted_input2_val = input2_val * (1 << left_shift);
-            const int32 scaled_input1_val =
-                MultiplyByQuantizedMultiplierSmallerThanOne(
-                    shifted_input1_val, input1_multiplier, input1_shift);
-            const int32 scaled_input2_val =
-                MultiplyByQuantizedMultiplierSmallerThanOne(
-                    shifted_input2_val, input2_multiplier, input2_shift);
-            const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-            const int32 raw_output =
-                MultiplyByQuantizedMultiplierSmallerThanOne(
-                    raw_sum, output_multiplier, output_shift) +
-                output_offset;
-            const int32 clamped_output =
-                std::min(output_activation_max,
-                         std::max(output_activation_min, raw_output));
-            output_data_ptr[i0] = static_cast<uint8>(clamped_output);
-          }
+          AddElementwise(
+              y0, left_shift, input1_data_ptr, input1_offset, input1_multiplier,
+              input1_shift, input2_data_ptr, input2_offset, input2_multiplier,
+              input2_shift, output_offset, output_multiplier, output_shift,
+              output_activation_min, output_activation_max, output_data_ptr);
           input2_data_ptr += y0;
           output_data_ptr += y0;
         }
@@ -2913,20 +2851,12 @@ inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Mul");
-  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
-                                              output_dims, 3);
-  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
-                                             output_dims, 2);
-  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
-                                            output_dims, 1);
-  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
-                                            output_dims, 0);
   TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
 
   int i = 0;
-  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
+  const int size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
 #ifdef USE_NEON
   const auto activation_min = vdupq_n_f32(output_activation_min);
   const auto activation_max = vdupq_n_f32(output_activation_max);
@@ -3001,10 +2931,10 @@ void Mul(const int32* input1_data, const Dims<4>& input1_dims,
   auto output_map = MapAsVector(output_data, output_dims);
   if (AreSameDims(input1_dims, input2_dims)) {
     output_map.array() = input1_map.array() * input2_map.array();
-  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
+  } else if (FlatSize(input2_dims) == 1) {
     auto scalar = input2_data[0];
     output_map.array() = input1_map.array() * scalar;
-  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
+  } else if (FlatSize(input1_dims) == 1) {
     auto scalar = input1_data[0];
     output_map.array() = scalar * input2_map.array();
   } else {
@@ -3020,9 +2950,7 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims,
   // This is a copy of the reference implementation. We do not currently have a
   // properly optimized version.
 
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
 
   for (int i = 0; i < flat_size; i++) {
     // F0 uses 0 integer bits, range [-1, 1].
@@ -3044,9 +2972,7 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims,
   // properly optimized version.
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
 
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
 
   for (int i = 0; i < flat_size; i++) {
     // F0 uses 0 integer bits, range [-1, 1].
@@ -3189,26 +3115,11 @@ inline void Div(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] /
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
+  for (int i = 0; i < flat_size; i++) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] / input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -3262,26 +3173,12 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] -
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  gemmlowp::ScopedProfilingLabel label("Sub");
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -3590,15 +3487,9 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
   gemmlowp::ScopedProfilingLabel label(
       "LstmCell/quantized (8bit external, 16bit internal)");
   // Gather dimensions information, and perform consistency checks.
-  const int batches =
-      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
-                        output_state_dims, 3, output_activ_dims, 3);
-  const int height =
-      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
-                        output_state_dims, 2, output_activ_dims, 2);
-  const int width =
-      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
-                        output_state_dims, 1, output_activ_dims, 1);
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_dims, 0, prev_activ_dims, prev_state_dims,
+                              output_state_dims, output_activ_dims);
   TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
   TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
   const int input_depth = ArraySize(input_dims, 0);
@@ -3614,9 +3505,7 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
       MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
                         output_state_dims, 0, output_activ_dims, 0);
   TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
-  const int fc_batches = ArraySize(activ_temp_dims, 1) *
-                         ArraySize(activ_temp_dims, 2) *
-                         ArraySize(activ_temp_dims, 3);
+  const int fc_batches = FlatSizeSkipDim(activ_temp_dims, 0);
   const int fc_output_depth =
       MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
   const int fc_accum_depth = ArraySize(weights_dims, 0);
@@ -3672,7 +3561,6 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
 
   // Rest of the LSTM cell: tanh and logistic math functions, and some adds
   // and muls, all done in 16-bit fixed-point.
-  const int outer_size = batches * width * height;
   const int16* input_gate_input_ptr = activ_temp_data_int16;
   const int16* input_modulation_gate_input_ptr =
       activ_temp_data_int16 + output_depth;
@@ -3838,20 +3726,15 @@ void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
   gemmlowp::ScopedProfilingLabel label("TensorFlowSplit");
   TFLITE_DCHECK_GE(outputs_count, 1);
   for (int i = 0; i < outputs_count; i++) {
-    /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
-    /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
-    /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
+    MatchingFlatSizeSkipDim(*output_dims[i], 0, input_dims);
   }
-  const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3);
-  const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2);
-  const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1);
+  const int outer_size = FlatSizeSkipDim(input_dims, 0);
   TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
-  // for now we dont have a model with a TensorFlowSplit
+  // For now we don't have a model with a TensorFlowSplit
   // with fused activation function.
   TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
-  const int whb = width * height * batches;
   const Scalar* input_ptr = input_data;
-  for (int k = 0; k < whb; k++) {
+  for (int k = 0; k < outer_size; k++) {
     for (int i = 0; i < outputs_count; ++i) {
       memcpy(output_data[i] + k * output_dims[i]->sizes[0], input_ptr,
              output_dims[i]->sizes[0] * sizeof(Scalar));
@@ -4376,10 +4259,7 @@ inline void LocalResponseNormalization(const float* input_data,
                                        float* output_data,
                                        const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("LocalResponseNormalization");
-  /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3);
-  /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2);
-  /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1);
-  /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0);
+  MatchingFlatSize(input_dims, output_dims);
 
   const auto data_in = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
   auto data_out = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
@@ -4422,10 +4302,7 @@ inline void Softmax(const float* input_data, const Dims<4>& input_dims,
                     float beta, float* output_data,
                     const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Softmax");
-  /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3);
-  /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2);
-  /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1);
-  /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0);
+  MatchingFlatSize(input_dims, output_dims);
 
   const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
   auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
@@ -4457,13 +4334,9 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
   gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  const int outer_size = batches * height * width;
-
   for (int b = 0; b < outer_size; ++b) {
     const uint8* input_data_ptr = input_data + b * depth;
     uint8* output_data_ptr = output_data + b * depth;
@@ -4655,35 +4528,30 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
 inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
                        float* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("LogSoftmax");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        // Find max element value which we'll use to ensure numerical stability
-        // taking advantage of the following equality:
-        // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
-        float max = std::numeric_limits<float>::lowest();
-        for (int c = 0; c < depth; ++c) {
-          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    const float* block_input_data = input_data + i * depth;
+    float* block_output_data = output_data + i * depth;
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, block_input_data[c]);
+    }
 
-        // Compute sum.
-        float sum = 0.f;
-        for (int c = 0; c < depth; ++c) {
-          sum += std::exp(input_data[Offset(input_dims, c, x, y, b)] - max);
-        }
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp(block_input_data[c] - max);
+    }
 
-        // Compute result.
-        const float log_sum = std::log(sum);
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] - max - log_sum;
-        }
-      }
+    // Compute result.
+    const float log_sum = std::log(sum);
+    for (int c = 0; c < depth; ++c) {
+      block_output_data[c] = block_input_data[c] - max - log_sum;
     }
   }
 }
@@ -4712,15 +4580,16 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
   for (int i = 0; i < outer_size; ++i) {
+    const uint8* block_input_data = input_data + i * depth;
+    uint8* block_output_data = output_data + i * depth;
     uint8 max_in_row = 0;
     for (int c = 0; c < depth; ++c) {
-      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+      max_in_row = std::max(max_in_row, block_input_data[c]);
     }
 
     FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
     for (int c = 0; c < depth; ++c) {
-      int32 input_diff =
-          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      int32 input_diff = static_cast<int32>(block_input_data[c]) - max_in_row;
       if (input_diff >= diff_min) {
         const int32 input_diff_rescaled =
             MultiplyByQuantizedMultiplierGreaterThanOne(
@@ -4754,8 +4623,7 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
                      reverse_scaling_right_shift));
 
     for (int c = 0; c < depth; ++c) {
-      int32 input_diff =
-          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      int32 input_diff = static_cast<int32>(block_input_data[c]) - max_in_row;
       if (input_diff > adjusted_diff_min) {
         const int32 input_diff_rescaled =
             MultiplyByQuantizedMultiplierGreaterThanOne(
@@ -4766,11 +4634,11 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
                 31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
             255;
 
-        output_data[i * depth + c] = static_cast<uint8>(
+        block_output_data[c] = static_cast<uint8>(
             std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
       } else {
         // Set output to smallest value.
-        output_data[i * depth + c] = 0;
+        block_output_data[c] = 0;
       }
     }
   }
@@ -4790,11 +4658,7 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                      int32 input_multiplier, int input_left_shift,
                      uint8* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Logistic/Uint8");
-  /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
-  /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
-  /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
-  /* depth */ MatchingArraySize(input_dims, 0, output_dims, 0);
-  const int size = RequiredBufferSizeForDims(input_dims);
+  const int size = MatchingFlatSize(input_dims, output_dims);
 
   int c = 0;
 #ifdef USE_NEON
@@ -4929,8 +4793,7 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
 inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
                      int16* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Logistic/Int16");
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
 
   for (int i = 0; i < flat_size; i++) {
   }
@@ -5001,11 +4864,7 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
                  uint8* output_data, const Dims<4>& output_dims) {
   // Note that this is almost the exact same code as in Logistic().
   gemmlowp::ScopedProfilingLabel label("Tanh");
-  /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
-  /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
-  /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
-  /* depth */ MatchingArraySize(input_dims, 0, output_dims, 0);
-  const int size = RequiredBufferSizeForDims(input_dims);
+  const int size = MatchingFlatSize(input_dims, output_dims);
 
   int c = 0;
   int32_t output_zero_point = 128;
@@ -5155,8 +5014,7 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
   TFLITE_DCHECK_GE(input_left_shift, 0);
   TFLITE_DCHECK_LE(input_left_shift, 1);
 
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
 
   int c = 0;
   const int16* input_data_ptr = input_data;
@@ -5251,20 +5109,11 @@ inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        int32 zero_point, double scale, float* output_data,
                        const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Dequantize");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int32 val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = static_cast<float>(scale * (val - zero_point));
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    int32 val = input_data[i];
+    float result = static_cast<float>(scale * (val - zero_point));
+    output_data[i] = result;
   }
 }
 
@@ -5287,25 +5136,15 @@ inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
                          &nudged_max, &nudged_scale);
   const float inv_nudged_scale = 1.0f / nudged_scale;
 
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
-          const float clamped =
-              std::min(nudged_max, std::max(nudged_min, src_val));
-          const float clamped_shifted = clamped - nudged_min;
-          const float dst_val =
-              TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
-              nudged_min;
-          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float src_val = input_data[i];
+    const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
+    const float clamped_shifted = clamped - nudged_min;
+    const float dst_val =
+        TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
+        nudged_min;
+    output_data[i] = dst_val;
   }
 }
 
@@ -5851,10 +5690,26 @@ inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
 }
 
 template <typename T>
-inline void Pad(const T* input_data, const Dims<4>& input_dims,
-                const std::vector<int>& left_paddings,
-                const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims, const int32_t pad_value) {
+void TypedMemset(void* ptr, T value, size_t num) {
+  // Optimization for common cases where memset() will suffice.
+  if (value == 0 || std::is_same<T, uint8_t>::value) {
+    memset(ptr, value, num * sizeof(T));
+  } else {
+    // Default implementation for cases where memset() will not preserve the
+    // bytes, e.g., typically when sizeof(T) > sizeof(uint8_t).
+    char* pos = static_cast<char*>(ptr);
+    for (size_t i = 0; i < num; ++i) {
+      memcpy(pos, &value, sizeof(T));
+      pos = pos + sizeof(T);
+    }
+  }
+}
+
+template <typename T>
+inline void PadV2(const T* input_data, const Dims<4>& input_dims,
+                  const std::vector<int>& left_paddings,
+                  const std::vector<int>& right_paddings, T* output_data,
+                  const Dims<4>& output_dims, const T pad_value) {
   gemmlowp::ScopedProfilingLabel label("Pad");
   TFLITE_DCHECK_EQ(left_paddings.size(), 4);
   TFLITE_DCHECK_EQ(right_paddings.size(), 4);
@@ -5877,27 +5732,28 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   const int input_depth = ArraySize(input_dims, 0);
 
   if (left_b_padding != 0) {
-    memset(output_data, pad_value,
-           left_b_padding * output_height * output_width * output_depth *
-               sizeof(T));
+    TypedMemset<T>(
+        output_data, pad_value,
+        left_b_padding * output_height * output_width * output_depth);
   }
   for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
        ++out_b) {
     if (left_h_padding != 0) {
-      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), pad_value,
-             left_h_padding * output_width * output_depth * sizeof(T));
+      TypedMemset<T>(output_data + Offset(output_dims, 0, 0, 0, out_b),
+                     pad_value, left_h_padding * output_width * output_depth);
     }
     for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
          ++out_h) {
       if (left_w_padding != 0) {
-        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), pad_value,
-               left_w_padding * output_depth * sizeof(T));
+        TypedMemset<T>(output_data + Offset(output_dims, 0, 0, out_h, out_b),
+                       pad_value, left_w_padding * output_depth);
       }
       for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
            ++out_w) {
         if (left_d_padding != 0) {
-          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b),
-                 pad_value, left_d_padding * sizeof(T));
+          TypedMemset<T>(
+              output_data + Offset(output_dims, 0, out_w, out_h, out_b),
+              pad_value, left_d_padding);
         }
 
         T* out = output_data +
@@ -5908,35 +5764,46 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
         memcpy(out, in, input_depth * sizeof(T));
 
         if (right_d_padding != 0) {
-          memset(
+          TypedMemset<T>(
               output_data + Offset(output_dims, output_depth - right_d_padding,
                                    out_w, out_h, out_b),
-              pad_value, right_d_padding * sizeof(T));
+              pad_value, right_d_padding);
         }
       }
       if (right_w_padding != 0) {
-        memset(
+        TypedMemset<T>(
             output_data + Offset(output_dims, 0, output_width - right_w_padding,
                                  out_h, out_b),
-            pad_value, right_w_padding * output_depth * sizeof(T));
+            pad_value, right_w_padding * output_depth);
       }
     }
     if (right_h_padding != 0) {
-      memset(output_data + Offset(output_dims, 0, 0,
-                                  output_height - right_h_padding, out_b),
-             pad_value,
-             right_h_padding * output_width * output_depth * sizeof(T));
+      TypedMemset<T>(
+          output_data +
+              Offset(output_dims, 0, 0, output_height - right_h_padding, out_b),
+          pad_value, right_h_padding * output_width * output_depth);
     }
   }
   if (right_b_padding != 0) {
-    memset(output_data +
-               Offset(output_dims, 0, 0, 0, output_batch - right_b_padding),
-           0,
-           right_b_padding * output_height * output_width * output_depth *
-               sizeof(T));
+    TypedMemset<T>(
+        output_data +
+            Offset(output_dims, 0, 0, 0, output_batch - right_b_padding),
+        pad_value,
+        right_b_padding * output_height * output_width * output_depth);
   }
 }
 
+// Legacy Pad() method that casts an int32_t to T before padding.
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims, const int32_t pad_value) {
+  const T converted_pad_value = static_cast<T>(pad_value);
+  PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data,
+           output_dims, converted_pad_value);
+}
+
 template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
@@ -6006,10 +5873,10 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
       size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
   const int start_h = begin[2];
   const int stop_h =
-      size[2] == -1 ? input_dims.sizes[2] - start_b : start_b + size[2];
+      size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
   const int start_w = begin[1];
   const int stop_w =
-      size[1] == -1 ? input_dims.sizes[1] - start_b : start_b + size[1];
+      size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
   const int start_d = begin[0];
   const int stop_d =
       size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
@@ -6108,10 +5975,10 @@ void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
   auto output_map = MapAsVector(output_data, output_dims);
   if (AreSameDims(input1_dims, input2_dims)) {
     output_map.array() = input1_map.array() - input2_map.array();
-  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
+  } else if (FlatSize(input1_dims) == 1) {
     auto scalar = input1_data[0];
     output_map.array() = scalar - input2_map.array();
-  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
+  } else if (FlatSize(input2_dims) == 1) {
     auto scalar = input2_data[0];
     output_map.array() = input1_map.array() - scalar;
   } else {
@@ -6149,32 +6016,28 @@ void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
 
   // The current ArgMax implemention can only determine the index of the maximum
   // value in the last dimension. So the axis argument is ignored.
-  TFLITE_DCHECK_EQ(axis[0], 3);
 
   // For ArgMax, the number of output dimensions = (number of input dimensions -
   // 1). For the sake of simplicity, the output dimensions are equal to the
   // input dimensions here. We enforce the constraint that the last dimension
   // must always be 1.
   TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = ArraySize(input_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
-        int max_index = 0;
-        for (int d = 1; d < depth; ++d) {
-          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
-          if (curr_value > max_value) {
-            max_value = curr_value;
-            max_index = d;
-          }
-        }
-        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
+  for (int i = 0; i < outer_size; ++i) {
+    auto max_value = *input_data;
+    ++input_data;
+    int max_index = 0;
+    for (int d = 1; d < depth; ++d) {
+      const auto& curr_value = *input_data;
+      if (curr_value > max_value) {
+        max_value = curr_value;
+        max_index = d;
       }
+      ++input_data;
     }
+    *output_data = max_index;
+    ++output_data;
   }
 }
 
@@ -6254,7 +6117,7 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
           const int out_y_origin = (in_y * stride_height) - pad_height;
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-              for (int out_channel = 0; out_channel < input_depth;
+              for (int out_channel = 0; out_channel < output_depth;
                    ++out_channel) {
                 // Compute output element location
                 const int out_x = out_x_origin + filter_x;
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
index 3e9a3c29ee26e96612bb05eb9cd1e1badad10c7a..2d74b3d3849812a2dc95fabcd680aa280c99ca55 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
@@ -167,6 +167,7 @@ TEST(QuantizationUtilTest, ChooseQuantizationParamsZeroPointOnMinBoundary) {
   EXPECT_EQ(qp.zero_point, 0);
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(QuantizationUtilTest, ChooseQuantizationParamsZeroNotInRange) {
   // Assumption is that zero is within the range.
   EXPECT_DEATH(ChooseQuantizationParams<uint8>(10.0, 30.0), "");
@@ -176,6 +177,7 @@ TEST(QuantizationUtilTest, ChooseQuantizationParamsEmptyRangePositive) {
   // Assumption is that zero is within the range.
   EXPECT_DEATH(ChooseQuantizationParams<uint8>(30.0, 30.0), "");
 }
+#endif  // GTEST_HAS_DEATH_TEST
 
 TEST(QuantizationUtilTest, ChooseQuantizationParamsEmptyRangeZero) {
   QuantizationParams qp = ChooseQuantizationParams<uint8>(0.0, 0.0);
@@ -189,6 +191,7 @@ TEST(QuantizationUtilTest, ChooseQuantizationParamsZeroPointOnMaxBoundary) {
   EXPECT_EQ(qp.zero_point, 255);
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(QuantizationUtilTest, ChooseQuantizationParamsInvalidRange) {
   EXPECT_DEATH(ChooseQuantizationParams<uint8>(10.0, -30.0), "");
 }
@@ -261,6 +264,7 @@ TEST(QuantizationUtilTest, PreprocessSoftmaxScaling) {
   EXPECT_THAT(quantize(2.0, 16.0, 5), Pair(2147483647, 31));
   EXPECT_THAT(quantize(2.0, 8.0, 5), Pair(1073741824, 31));
 }
+#endif  // GTEST_HAS_DEATH_TEST
 
 TEST(QuantizationUtilTest, CalculateInputRadius) {
   EXPECT_EQ(CalculateInputRadius(4, 27), 15);
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 8649cbafe6d90a9f61fd84601cccaafb643f7cc7..15fac237edae79ea8d778b6e28694acb8de48668 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -35,35 +35,6 @@ limitations under the License.
 namespace tflite {
 namespace reference_ops {
 
-inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
-    int32 x, int32 quantized_multiplier, int right_shift) {
-  using gemmlowp::RoundingDivideByPOT;
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  return RoundingDivideByPOT(
-      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
-}
-
-inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
-    int32 x, int32 quantized_multiplier, int left_shift) {
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
-                                           quantized_multiplier);
-}
-
-template <typename T>
-int CountLeadingZeros(T integer_input) {
-  static_assert(std::is_unsigned<T>::value,
-                "Only unsigned integer types handled.");
-  const T one_in_leading_positive = static_cast<T>(1)
-                                    << (std::numeric_limits<T>::digits - 1);
-  int leading_zeros = 0;
-  while (integer_input < one_in_leading_positive) {
-    integer_input <<= 1;
-    ++leading_zeros;
-  }
-  return leading_zeros;
-}
-
 // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
 // BROADCASTING.
 //
@@ -922,13 +893,9 @@ inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
 inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
                             int32 input_zero_point, uint8* output_data,
                             const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  TFLITE_DCHECK_EQ(batches, 1);
-  TFLITE_DCHECK_EQ(height, 1);
-  TFLITE_DCHECK_EQ(width, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
+  TFLITE_DCHECK_EQ(outer_size, 1);
   int32 square_l2_norm = 0;
   for (int i = 0; i < depth; i++) {
     int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
@@ -1050,9 +1017,7 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
     TFLITE_DCHECK_EQ(output_activation_max, 32767);
   }
 
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
 
   TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0);
   TFLITE_DCHECK_GE(input1_shift, 0);
@@ -1428,9 +1393,7 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims,
                 int16* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Mul/Int16");
 
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
 
   for (int i = 0; i < flat_size; i++) {
     // F0 uses 0 integer bits, range [-1, 1].
@@ -1450,9 +1413,7 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims,
   gemmlowp::ScopedProfilingLabel label("Mul/Int16Uint8");
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
 
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input1_dims), flat_size);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input2_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
 
   for (int i = 0; i < flat_size; i++) {
     // F0 uses 0 integer bits, range [-1, 1].
@@ -1485,33 +1446,6 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                output_data, output_dims);
 }
 
-inline void Div(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] /
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
-  }
-}
-
 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
@@ -1553,6 +1487,18 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+inline void Div(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] / input2_data[i], output_activation_min,
+        output_activation_max);
+  }
+}
+
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
@@ -1847,7 +1793,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
 // The quantization of the input, output arrays is as follows:
 //  - The input activations are quantized as uint8 on the interval
 //    [-1, 127/128].
-//    The rationale for that is that that is the natural interval for output
+//    The rationale for that is that is the natural interval for output
 //    activations (see next point) and these need to be concatenated together.
 //    We could accommodate different ranges by re-scaling, but we empirically
 //    found that setting the input activations range to be [-1, 127/128] in the
@@ -1912,7 +1858,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
 // However, for a fixed-point implementation in 16-bit integers, using 5
 // integer bits to represent the [-16, 16] range would leave only 11
 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
-// representable values. Notice that that is higher than the
+// representable values. Notice that is higher than the
 // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
 // Using [-8, 8] thus seems like the better compromise overall, enjoying
 // an increment of 2.4e-4 between representable values and a worst-case
@@ -3158,10 +3104,10 @@ inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
 }
 
 template <typename T>
-inline void Pad(const T* input_data, const Dims<4>& input_dims,
-                const std::vector<int>& left_paddings,
-                const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims, const int32_t pad_value) {
+inline void PadV2(const T* input_data, const Dims<4>& input_dims,
+                  const std::vector<int>& left_paddings,
+                  const std::vector<int>& right_paddings, T* output_data,
+                  const Dims<4>& output_dims, const T pad_value) {
   TFLITE_DCHECK_EQ(left_paddings.size(), 4);
   TFLITE_DCHECK_EQ(right_paddings.size(), 4);
 
@@ -3194,7 +3140,7 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
               out_w >= output_width - right_w_padding ||
               out_d < left_d_padding ||
               out_d >= output_depth - right_d_padding) {
-            *out_ptr++ = static_cast<T>(pad_value);
+            *out_ptr++ = pad_value;
           } else {
             *out_ptr++ = *in_ptr++;
           }
@@ -3204,6 +3150,17 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Legacy Pad() method that casts an int32_t to T before padding.
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims, const int32_t pad_value) {
+  const T converted_pad_value = static_cast<T>(pad_value);
+  PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data,
+           output_dims, converted_pad_value);
+}
+
 template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
@@ -3274,10 +3231,10 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
       size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
   const int start_h = begin[2];
   const int stop_h =
-      size[2] == -1 ? input_dims.sizes[2] - start_b : start_b + size[2];
+      size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
   const int start_w = begin[1];
   const int stop_w =
-      size[1] == -1 ? input_dims.sizes[1] - start_b : start_b + size[1];
+      size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
   const int start_d = begin[0];
   const int stop_d =
       size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
@@ -3303,11 +3260,11 @@ inline void Exp(const T* input_data, const size_t num_elements,
 }
 
 template <typename T, typename U>
-inline bool Mean(T* input_data, const int* input_dims, const int input_num_dims,
-                 T* output_data, const int* output_dims,
-                 const int output_num_dims, const int* axis,
-                 const int num_axis_dimensions, bool keep_dims, int* temp_index,
-                 int* resolved_axis, U* temp_sum) {
+inline bool Mean(const T* input_data, const int* input_dims,
+                 const int input_num_dims, T* output_data,
+                 const int* output_dims, const int output_num_dims,
+                 const int* axis, const int num_axis_dimensions, bool keep_dims,
+                 int* temp_index, int* resolved_axis, U* temp_sum) {
   // resets output data.
   size_t num_outputs = 1;
   for (int idx = 0; idx < output_num_dims; ++idx) {
@@ -3488,7 +3445,6 @@ void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
             T2* output_data, const Dims<4>& output_dims) {
   // The current ArgMax implemention can only determine the index of the maximum
   // value in the last dimension. So the axis argument is ignored.
-  TFLITE_DCHECK_EQ(axis[0], 3);
 
   // For ArgMax, the number of output dimensions = (number of input dimensions -
   // 1). For the sake of simplicity, the output dimensions are equal to the
@@ -3563,7 +3519,7 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
   // computing their influence on the output, rather than looping through the
   // output elements in the typical "gather" access pattern of a conv. We
   // therefore must initialize the output array to zero.
-  for (int i = 0; i < RequiredBufferSizeForDims(output_dims); i++) {
+  for (int i = 0; i < FlatSize(output_dims); i++) {
     output_data[i] = 0.0f;
   }
 
@@ -3603,50 +3559,201 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
 }
 
 template <typename T>
-inline void Less(int64_t num_elements, const T* input1, const T* input2,
-                 bool* output) {
-  for (int64_t i = 0; i < num_elements; ++i) {
-    output[i] = input1[i] < input2[i];
-  }
+inline bool GreaterFn(T lhs, T rhs) {
+  return lhs > rhs;
 }
-
 template <typename T>
-inline void Less(const T* input1_data, const Dims<4>& input1_dims,
-                 const T* input2_data, const Dims<4>& input2_dims,
-                 bool* output_data, const Dims<4>& output_dims) {
-  const int64_t batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int64_t height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int64_t width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int64_t depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  Less(batches * height * width * depth, input1_data, input2_data, output_data);
+inline bool GreaterEqualFn(T lhs, T rhs) {
+  return lhs >= rhs;
+}
+template <typename T>
+inline bool LessFn(T lhs, T rhs) {
+  return lhs < rhs;
+}
+template <typename T>
+inline bool LessEqualFn(T lhs, T rhs) {
+  return lhs <= rhs;
 }
 
-template <typename T1, typename T2>
-inline void BroadcastLess(T1* input1_data, const Dims<4>& input1_dims,
-                          T2* input2_data, const Dims<4>& input2_dims,
-                          bool* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastLess");
+template <typename T>
+using ComparisonFn = bool (*)(T, T);
+
+template <typename T, ComparisonFn<T> F>
+inline void Comparison(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, const Dims<4>& input2_dims,
+                       bool* output_data, const Dims<4>& output_dims) {
+  const int64_t flatsize =
+      MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int64_t i = 0; i < flatsize; ++i) {
+    output_data[i] = F(input1_data[i], input2_data[i]);
+  }
+}
+
+template <typename T, ComparisonFn<int32> F>
+inline void Comparison(int left_shift, const T* input1_data,
+                       const Dims<4>& input1_dims, int32 input1_offset,
+                       int32 input1_multiplier, int input1_shift,
+                       const T* input2_data, const Dims<4>& input2_dims,
+                       int32 input2_offset, int32 input2_multiplier,
+                       int input2_shift, bool* output_data,
+                       const Dims<4>& output_dims) {
+  const int64_t flatsize =
+      MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int64_t i = 0; i < flatsize; ++i) {
+    const int32 input1_val = input1_offset + input1_data[i];
+    const int32 input2_val = input2_offset + input2_data[i];
+    const int32 shifted_input1_val = input1_val * (1 << left_shift);
+    const int32 shifted_input2_val = input2_val * (1 << left_shift);
+    const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne(
+        shifted_input1_val, input1_multiplier, input1_shift);
+    const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne(
+        shifted_input2_val, input2_multiplier, input2_shift);
+    output_data[i] = F(scaled_input1_val, scaled_input2_val);
+  }
+}
+
+template <typename T, ComparisonFn<T> F>
+inline void BroadcastComparison(const T* input1_data,
+                                const Dims<4>& input1_dims,
+                                const T* input2_data,
+                                const Dims<4>& input2_dims, bool* output_data,
+                                const Dims<4>& output_dims) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              F(input1_data[SubscriptToIndex(desc1, c, x, y, b)],
+                input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+        }
+      }
+    }
+  }
+}
 
+template <typename T, ComparisonFn<int32> F>
+inline void BroadcastComparison(int left_shift, const T* input1_data,
+                                const Dims<4>& input1_dims, int32 input1_offset,
+                                int32 input1_multiplier, int input1_shift,
+                                const T* input2_data,
+                                const Dims<4>& input2_dims, int32 input2_offset,
+                                int32 input2_multiplier, int input2_shift,
+                                bool* output_data, const Dims<4>& output_dims) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
   for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 shifted_input1_val = input1_val * (1 << left_shift);
+          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+          const int32 scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32 scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input2_val, input2_multiplier, input2_shift);
           output_data[Offset(output_dims, c, x, y, b)] =
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] <
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+              F(scaled_input1_val, scaled_input2_val);
         }
       }
     }
   }
 }
 
+#define TFLITE_COMPARISON_OP(name)                                            \
+  template <typename T>                                                       \
+  inline void name(const T* input1_data, const Dims<4>& input1_dims,          \
+                   const T* input2_data, const Dims<4>& input2_dims,          \
+                   bool* output_data, const Dims<4>& output_dims) {           \
+    gemmlowp::ScopedProfilingLabel label(#name);                              \
+    Comparison<T, name##Fn>(input1_data, input1_dims, input2_data,            \
+                            input2_dims, output_data, output_dims);           \
+  }                                                                           \
+  template <typename T>                                                       \
+  inline void name(                                                           \
+      int left_shift, const T* input1_data, const Dims<4>& input1_dims,       \
+      int32 input1_offset, int32 input1_multiplier, int input1_shift,         \
+      const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset,  \
+      int32 input2_multiplier, int input2_shift, bool* output_data,           \
+      const Dims<4>& output_dims) {                                           \
+    gemmlowp::ScopedProfilingLabel label(#name "/8bit");                      \
+    Comparison<T, name##Fn>(left_shift, input1_data, input1_dims,             \
+                            input1_offset, input1_multiplier, input1_shift,   \
+                            input2_data, input2_dims, input2_offset,          \
+                            input2_multiplier, input2_shift, output_data,     \
+                            output_dims);                                     \
+  }                                                                           \
+  template <typename T>                                                       \
+  inline void Broadcast##name(                                                \
+      const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, \
+      const Dims<4>& input2_dims, bool* output_data,                          \
+      const Dims<4>& output_dims) {                                           \
+    gemmlowp::ScopedProfilingLabel label("Broadcast" #name);                  \
+    BroadcastComparison<T, name##Fn>(input1_data, input1_dims, input2_data,   \
+                                     input2_dims, output_data, output_dims);  \
+  }                                                                           \
+  template <typename T>                                                       \
+  inline void Broadcast##name(                                                \
+      int left_shift, const T* input1_data, const Dims<4>& input1_dims,       \
+      int32 input1_offset, int32 input1_multiplier, int input1_shift,         \
+      const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset,  \
+      int32 input2_multiplier, int input2_shift, bool* output_data,           \
+      const Dims<4>& output_dims) {                                           \
+    gemmlowp::ScopedProfilingLabel label("Broadcast" #name "/8bit");          \
+    BroadcastComparison<T, name##Fn>(left_shift, input1_data, input1_dims,    \
+                                     input1_offset, input1_multiplier,        \
+                                     input1_shift, input2_data, input2_dims,  \
+                                     input2_offset, input2_multiplier,        \
+                                     input2_shift, output_data, output_dims); \
+  }
+TFLITE_COMPARISON_OP(Greater);
+TFLITE_COMPARISON_OP(GreaterEqual);
+TFLITE_COMPARISON_OP(Less);
+TFLITE_COMPARISON_OP(LessEqual);
+#undef TFLITE_COMPARISON_OP
+
+template <typename D, typename T>
+inline void Select(const D* input_condition_data,
+                   const Dims<4>& input_condition_dims, const T* input_x_data,
+                   const Dims<4>& input_x_dims, const T* input_y_data,
+                   const Dims<4>& input_y_dims, T* output_data,
+                   const Dims<4>& output_dims) {
+  const int64_t flatsize =
+      MatchingFlatSize(input_x_dims, input_y_dims, output_dims);
+  for (int64_t i = 0; i < flatsize; ++i) {
+    output_data[i] =
+        input_condition_data[i] ? input_x_data[i] : input_y_data[i];
+  }
+}
+
+template <typename D, typename T>
+inline void RankOneSelect(const D* input_condition_data,
+                          const Dims<4>& input_condition_dims,
+                          const T* input_x_data, const Dims<4>& input_x_dims,
+                          const T* input_y_data, const Dims<4>& input_y_dims,
+                          T* output_data, const Dims<4>& output_dims) {
+  const int64_t rank = MatchingArraySize(input_condition_dims, 0, input_x_dims,
+                                         3, input_y_dims, 3, output_dims, 3);
+  const int64_t inner_size =
+      MatchingFlatSizeSkipDim(input_x_dims, 3, input_y_dims, output_dims);
+
+  int64_t offset = 0;
+  for (int64_t i = 0; i < rank; i++) {
+    const T* input_data = input_condition_data[i] ? input_x_data : input_y_data;
+    memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T));
+    offset += inner_size;
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
index 62cea143e6afc0631493012be26808a89eb03138..ce887cea8b794b4b0cfd31722581cf9327be625e 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -49,6 +49,34 @@ inline bool* GetTensorData(TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.b : nullptr;
 }
 
+template <typename T>
+inline const T* GetTensorData(const TfLiteTensor* tensor);
+
+template <>
+inline const float* GetTensorData(const TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.f : nullptr;
+}
+
+template <>
+inline const uint8_t* GetTensorData(const TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.uint8 : nullptr;
+}
+
+template <>
+inline const int32_t* GetTensorData(const TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.i32 : nullptr;
+}
+
+template <>
+inline const int64_t* GetTensorData(const TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.i64 : nullptr;
+}
+
+template <>
+inline const bool* GetTensorData(const TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.b : nullptr;
+}
+
 inline int RemapDim(int max_dimensions, int d) {
   return max_dimensions - d - 1;
 }
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 3290c364c18224edb733c177ad72bf86b6892434..43c68832785ac87f51c298370b50dc722167dc7f 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -132,11 +132,11 @@ int MatchingArraySize(const ArrayType1& array1, int index1,
 
 template <int N>
 inline int FlatSize(const Dims<N>& dims) {
-  int max_offset = 0;
-  for (int i = 0; i < N; i++) {
-    max_offset += (dims.sizes[i] - 1) * dims.strides[i];
+  int flat_size = 1;
+  for (int i = 0; i < N; ++i) {
+    flat_size *= dims.sizes[i];
   }
-  return max_offset + 1;
+  return flat_size;
 }
 
 // Deprecated. Prefer FlatSize.
@@ -148,7 +148,7 @@ inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
 // arrays.
 template <int N>
 inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0) {
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; ++i) {
     TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
   }
   return FlatSize(dims);
@@ -157,7 +157,7 @@ inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0) {
 template <int N>
 inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
                             const Dims<N>& check_dims_1) {
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; ++i) {
     TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
   }
   return MatchingFlatSize(dims, check_dims_1);
@@ -167,7 +167,7 @@ template <int N>
 inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
                             const Dims<N>& check_dims_1,
                             const Dims<N>& check_dims_2) {
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; ++i) {
     TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
   }
   return FlatSize(dims, check_dims_1, check_dims_2);
@@ -178,7 +178,7 @@ inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
                             const Dims<N>& check_dims_1,
                             const Dims<N>& check_dims_2,
                             const Dims<N>& check_dims_3) {
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; ++i) {
     TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
   }
   return FlatSize(dims, check_dims_1, check_dims_2, check_dims_3);
@@ -191,7 +191,7 @@ template <int N>
 inline int FlatSizeSkipDim(const Dims<N>& dims, int skip_dim) {
   TFLITE_DCHECK(skip_dim >= 0 && skip_dim < N);
   int flat_size = 1;
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; ++i) {
     flat_size *= (i == skip_dim) ? 1 : dims.sizes[i];
   }
   return flat_size;
@@ -201,7 +201,7 @@ inline int FlatSizeSkipDim(const Dims<N>& dims, int skip_dim) {
 template <int N>
 inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
                                    const Dims<N>& check_dims_0) {
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; ++i) {
     if (i != skip_dim) {
       TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
     }
@@ -213,7 +213,7 @@ template <int N>
 inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
                                    const Dims<N>& check_dims_0,
                                    const Dims<N>& check_dims_1) {
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; ++i) {
     if (i != skip_dim) {
       TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
     }
@@ -226,7 +226,7 @@ inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
                                    const Dims<N>& check_dims_0,
                                    const Dims<N>& check_dims_1,
                                    const Dims<N>& check_dims_2) {
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; ++i) {
     if (i != skip_dim) {
       TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
     }
@@ -240,7 +240,7 @@ inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
                                    const Dims<N>& check_dims_1,
                                    const Dims<N>& check_dims_2,
                                    const Dims<N>& check_dims_3) {
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; ++i) {
     if (i != skip_dim) {
       TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
     }
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc
index 955e8c5764c6adad37a0009f4ddf8accb437b174..239b533a17efaa25d632c140c17e9f35d01a80ef 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.cc
+++ b/tensorflow/contrib/lite/kernels/kernel_util.cc
@@ -22,9 +22,12 @@ limitations under the License.
 
 namespace tflite {
 
-TfLiteStatus GetQuantizedConvolutionMultipler(
-    TfLiteContext* context, TfLiteTensor* input, TfLiteTensor* filter,
-    TfLiteTensor* bias, TfLiteTensor* output, double* multiplier) {
+TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
+                                              const TfLiteTensor* input,
+                                              const TfLiteTensor* filter,
+                                              const TfLiteTensor* bias,
+                                              TfLiteTensor* output,
+                                              double* multiplier) {
   const double input_product_scale = input->params.scale * filter->params.scale;
   const double bias_scale = bias->params.scale;
   const double output_scale = output->params.scale;
@@ -87,13 +90,13 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
   }
 }
 
-bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) {
+bool HaveSameShapes(const TfLiteTensor* input1, const TfLiteTensor* input2) {
   return TfLiteIntArrayEqual(input1->dims, input2->dims);
 }
 
 TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
-                                        TfLiteTensor* input1,
-                                        TfLiteTensor* input2,
+                                        const TfLiteTensor* input1,
+                                        const TfLiteTensor* input2,
                                         TfLiteIntArray** output_shape) {
   int64_t dims1 = NumDimensions(input1);
   int64_t dims2 = NumDimensions(input2);
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
index e225443a67b2ac6fb67bfd7e0828417da4ed4dab..82cded36f2ed2777daccafee5890f47c0d7254e8 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -24,8 +24,8 @@ inline int NumDimensions(const TfLiteTensor* t) { return t->dims->size; }
 inline int SizeOfDimension(const TfLiteTensor* t, int dim) {
   return t->dims->data[dim];
 }
-inline TfLiteTensor* GetInput(TfLiteContext* context, TfLiteNode* node,
-                              int index) {
+inline const TfLiteTensor* GetInput(TfLiteContext* context, TfLiteNode* node,
+                                    int index) {
   return &context->tensors[node->inputs->data[index]];
 }
 inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
@@ -47,8 +47,9 @@ inline int64_t NumElements(const TfLiteTensor* t) {
   return count;
 }
 
-inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
-                                            const TfLiteNode* node, int index) {
+inline const TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
+                                                  const TfLiteNode* node,
+                                                  int index) {
   const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
   if (use_tensor) {
     return &context->tensors[node->inputs->data[index]];
@@ -78,9 +79,12 @@ inline void SetTensorToDynamic(TfLiteTensor* tensor) {
 // Calculates the multiplication factor for a quantized convolution (or
 // quantized depthwise convolution) involving the given tensors. Returns an
 // error if the scales of the tensors are not compatible.
-TfLiteStatus GetQuantizedConvolutionMultipler(
-    TfLiteContext* context, TfLiteTensor* input, TfLiteTensor* filter,
-    TfLiteTensor* bias, TfLiteTensor* output, double* multiplier);
+TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
+                                              const TfLiteTensor* input,
+                                              const TfLiteTensor* filter,
+                                              const TfLiteTensor* bias,
+                                              TfLiteTensor* output,
+                                              double* multiplier);
 
 // Calculates the useful range of an activation layer given its activation
 // tensor.
@@ -92,13 +96,13 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
                                    float* activation_max);
 
 // Return true if the given tensors have the same shape.
-bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2);
+bool HaveSameShapes(const TfLiteTensor* input1, const TfLiteTensor* input2);
 
 // Calculate the output_shape that is necessary for element-wise operations
 // with broadcasting involving the two input tensors.
 TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
-                                        TfLiteTensor* input1,
-                                        TfLiteTensor* input2,
+                                        const TfLiteTensor* input1,
+                                        const TfLiteTensor* input2,
                                         TfLiteIntArray** output_shape);
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/kernel_util_test.cc b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
index c65b68970f6853e17af3a70aad7a2bc982a1ee60..bf6f249acc85ee050681eb6f33067be2a1aa037e 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util_test.cc
+++ b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
@@ -33,7 +33,7 @@ class KernelUtilTest : public ::testing::Test {
     tensor1_.allocation_type = kTfLiteMmapRo;
     tensor2_.allocation_type = kTfLiteMmapRo;
   }
-  ~KernelUtilTest() {
+  ~KernelUtilTest() override {
     TfLiteTensorFree(&tensor1_);
     TfLiteTensorFree(&tensor2_);
   }
diff --git a/tensorflow/contrib/lite/kernels/l2norm.cc b/tensorflow/contrib/lite/kernels/l2norm.cc
index e67f4e06f3680f8c9447a9e831b63415994ea176..3205c1cc52724207904621a5870636841ef379fe 100644
--- a/tensorflow/contrib/lite/kernels/l2norm.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm.cc
@@ -40,7 +40,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
@@ -64,7 +64,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
@@ -94,7 +94,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     }
 #undef TF_LITE_L2NORM
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context, "Output type is %d, requires float.",
+                         output->type);
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/l2norm_test.cc b/tensorflow/contrib/lite/kernels/l2norm_test.cc
index 042314ccf55cb6de12c743448fbe040f35e7baab..11cc666bad61b5753d933b98510ee8d4a093644e 100644
--- a/tensorflow/contrib/lite/kernels/l2norm_test.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm_test.cc
@@ -76,6 +76,23 @@ TEST(L2NormOpTest, SimpleTest) {
               ElementsAreArray({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}));
 }
 
+TEST(L2NormOpTest, MultipleBatchesTest) {
+  L2NormOpModel m({3, 1, 1, 6}, TensorType_FLOAT32,
+                  ActivationFunctionType_NONE);
+  m.SetInput({
+      -1.1, 0.6, 0.7, 1.2, -0.7, 0.1,  // batch 1
+      -1.1, 0.6, 0.7, 1.2, -0.7, 0.1,  // batch 2
+      -1.1, 0.6, 0.7, 1.2, -0.7, 0.1,  // batch 3
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({
+                  -0.55, 0.3, 0.35, 0.6, -0.35, 0.05,  // batch 1
+                  -0.55, 0.3, 0.35, 0.6, -0.35, 0.05,  // batch 2
+                  -0.55, 0.3, 0.35, 0.6, -0.35, 0.05,  // batch 3
+              }));
+}
+
 TEST(L2NormOpTest, SimpleUint8Test) {
   L2NormOpModel m({1, 1, 1, 6}, TensorType_UINT8, ActivationFunctionType_NONE);
 
diff --git a/tensorflow/contrib/lite/kernels/local_response_norm.cc b/tensorflow/contrib/lite/kernels/local_response_norm.cc
index c1c70d0dfa0050dee3815aa15f5d16d2e7ddc721..36dca299d0e07a84af60a13dfeb50b0f8fe38ee2 100644
--- a/tensorflow/contrib/lite/kernels/local_response_norm.cc
+++ b/tensorflow/contrib/lite/kernels/local_response_norm.cc
@@ -38,7 +38,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
@@ -60,7 +60,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params =
       reinterpret_cast<TfLiteLocalResponseNormParams*>(node->builtin_data);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
@@ -77,7 +77,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     }
 #undef TF_LITE_LOCAL_RESPONSE_NORM
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context, "Output type is %d, requires float.",
+                         output->type);
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection.cc b/tensorflow/contrib/lite/kernels/lsh_projection.cc
index af5d971f21152e592f6229bb52d7eed96c27f710..863e08ed680588a8cfb73792b8262d390ea6bf36 100644
--- a/tensorflow/contrib/lite/kernels/lsh_projection.cc
+++ b/tensorflow/contrib/lite/kernels/lsh_projection.cc
@@ -77,16 +77,16 @@ TfLiteStatus Resize(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE(context, NumInputs(node) == 2 || NumInputs(node) == 3);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* hash = GetInput(context, node, 0);
+  const TfLiteTensor* hash = GetInput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, NumDimensions(hash), 2);
   // Support up to 32 bits.
   TF_LITE_ENSURE(context, SizeOfDimension(hash, 1) <= 32);
 
-  TfLiteTensor* input = GetInput(context, node, 1);
+  const TfLiteTensor* input = GetInput(context, node, 1);
   TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
 
   if (NumInputs(node) == 3) {
-    TfLiteTensor* weight = GetInput(context, node, 2);
+    const TfLiteTensor* weight = GetInput(context, node, 2);
     TF_LITE_ENSURE_EQ(context, NumDimensions(weight), 1);
     TF_LITE_ENSURE_EQ(context, SizeOfDimension(weight, 0),
                       SizeOfDimension(input, 0));
@@ -173,9 +173,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       reinterpret_cast<TfLiteLSHProjectionParams*>(node->builtin_data);
 
   int32_t* out_buf = GetOutput(context, node, 0)->data.i32;
-  TfLiteTensor* hash = GetInput(context, node, 0);
-  TfLiteTensor* input = GetInput(context, node, 1);
-  TfLiteTensor* weight =
+  const TfLiteTensor* hash = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 1);
+  const TfLiteTensor* weight =
       NumInputs(node) == 2 ? nullptr : GetInput(context, node, 2);
 
   switch (params->type) {
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
index a1521efbb4e2dfc378915fb04b0cd156353bcc5e..990b3da0554ebcb13f995fa281ed04f8c7c6d7ea 100644
--- a/tensorflow/contrib/lite/kernels/lstm.cc
+++ b/tensorflow/contrib/lite/kernels/lstm.cc
@@ -92,7 +92,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   TF_LITE_ENSURE(context, params->cell_clip >= 0);
   TF_LITE_ENSURE(context, params->proj_clip >= 0);
 
-  TfLiteTensor* input_to_input_weights =
+  const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
   if (input_to_input_weights) {
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
@@ -100,19 +100,19 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
   }
 
-  TfLiteTensor* input_to_forget_weights =
+  const TfLiteTensor* input_to_forget_weights =
       GetInput(context, node, kInputToForgetWeightsTensor);
   TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
   TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
 
-  TfLiteTensor* input_to_cell_weights =
+  const TfLiteTensor* input_to_cell_weights =
       GetInput(context, node, kInputToCellWeightsTensor);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
 
-  TfLiteTensor* recurrent_to_input_weights =
+  const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
   if (recurrent_to_input_weights) {
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
@@ -122,7 +122,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
                       n_output);
   }
 
-  TfLiteTensor* recurrent_to_forget_weights =
+  const TfLiteTensor* recurrent_to_forget_weights =
       GetInput(context, node, kRecurrentToForgetWeightsTensor);
   TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
@@ -130,7 +130,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
                     n_output);
 
-  TfLiteTensor* recurrent_to_cell_weights =
+  const TfLiteTensor* recurrent_to_cell_weights =
       GetInput(context, node, kRecurrentToCellWeightsTensor);
   TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
@@ -146,21 +146,21 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
        (recurrent_to_input_weights == nullptr));
   TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
 
-  TfLiteTensor* cell_to_input_weights =
+  const TfLiteTensor* cell_to_input_weights =
       GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
   if (cell_to_input_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
   }
 
-  TfLiteTensor* cell_to_forget_weights =
+  const TfLiteTensor* cell_to_forget_weights =
       GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
   if (cell_to_forget_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
   }
 
-  TfLiteTensor* cell_to_output_weights =
+  const TfLiteTensor* cell_to_output_weights =
       GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
   if (cell_to_output_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
@@ -179,7 +179,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
 
   // Make sure the input gate bias is present only when not a CIFG-LSTM.
-  TfLiteTensor* input_gate_bias =
+  const TfLiteTensor* input_gate_bias =
       GetOptionalInputTensor(context, node, kInputGateBiasTensor);
   if (use_cifg) {
     TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
@@ -188,21 +188,21 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
     TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
   }
 
-  TfLiteTensor* forget_gate_bias =
+  const TfLiteTensor* forget_gate_bias =
       GetInput(context, node, kForgetGateBiasTensor);
   TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
 
-  TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
   TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
 
-  TfLiteTensor* output_gate_bias =
+  const TfLiteTensor* output_gate_bias =
       GetInput(context, node, kOutputGateBiasTensor);
   TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
 
-  TfLiteTensor* projection_weights =
+  const TfLiteTensor* projection_weights =
       GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
   if (projection_weights) {
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
@@ -210,7 +210,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
   }
 
-  TfLiteTensor* projection_bias =
+  const TfLiteTensor* projection_bias =
       GetOptionalInputTensor(context, node, kProjectionBiasTensor);
   if (projection_bias) {
     TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
@@ -241,18 +241,18 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // Inferring batch size, number of outputs and number of cells from the
   // input tensors.
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TF_LITE_ENSURE(context, input->dims->size > 1);
   const int n_batch = input->dims->data[0];
   const int n_input = input->dims->data[1];
 
-  TfLiteTensor* input_to_output_weights =
+  const TfLiteTensor* input_to_output_weights =
       GetInput(context, node, kInputToOutputWeightsTensor);
   const int n_cell = input_to_output_weights->dims->data[0];
   TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
 
-  TfLiteTensor* recurrent_to_output_weights =
+  const TfLiteTensor* recurrent_to_output_weights =
       GetInput(context, node, kRecurrentToOutputWeightsTensor);
   TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
@@ -298,7 +298,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   output_state->allocation_type = kTfLiteArenaRwPersistent;
   cell_state->allocation_type = kTfLiteArenaRwPersistent;
 
-  TfLiteTensor* input_to_input_weights =
+  const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
   const bool use_cifg = (input_to_input_weights == nullptr);
   if (use_cifg) {
@@ -322,44 +322,44 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 // The LSTM Op engine.
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
 
-  TfLiteTensor* input_to_input_weights =
+  const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  TfLiteTensor* input_to_forget_weights =
+  const TfLiteTensor* input_to_forget_weights =
       GetInput(context, node, kInputToForgetWeightsTensor);
-  TfLiteTensor* input_to_cell_weights =
+  const TfLiteTensor* input_to_cell_weights =
       GetInput(context, node, kInputToCellWeightsTensor);
-  TfLiteTensor* input_to_output_weights =
+  const TfLiteTensor* input_to_output_weights =
       GetInput(context, node, kInputToOutputWeightsTensor);
 
-  TfLiteTensor* recurrent_to_input_weights =
+  const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  TfLiteTensor* recurrent_to_forget_weights =
+  const TfLiteTensor* recurrent_to_forget_weights =
       GetInput(context, node, kRecurrentToForgetWeightsTensor);
-  TfLiteTensor* recurrent_to_cell_weights =
+  const TfLiteTensor* recurrent_to_cell_weights =
       GetInput(context, node, kRecurrentToCellWeightsTensor);
-  TfLiteTensor* recurrent_to_output_weights =
+  const TfLiteTensor* recurrent_to_output_weights =
       GetInput(context, node, kRecurrentToOutputWeightsTensor);
 
-  TfLiteTensor* cell_to_input_weights =
+  const TfLiteTensor* cell_to_input_weights =
       GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  TfLiteTensor* cell_to_forget_weights =
+  const TfLiteTensor* cell_to_forget_weights =
       GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  TfLiteTensor* cell_to_output_weights =
+  const TfLiteTensor* cell_to_output_weights =
       GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
 
-  TfLiteTensor* input_gate_bias =
+  const TfLiteTensor* input_gate_bias =
       GetOptionalInputTensor(context, node, kInputGateBiasTensor);
-  TfLiteTensor* forget_gate_bias =
+  const TfLiteTensor* forget_gate_bias =
       GetInput(context, node, kForgetGateBiasTensor);
-  TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  TfLiteTensor* output_gate_bias =
+  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  const TfLiteTensor* output_gate_bias =
       GetInput(context, node, kOutputGateBiasTensor);
 
-  TfLiteTensor* projection_weights =
+  const TfLiteTensor* projection_weights =
       GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  TfLiteTensor* projection_bias =
+  const TfLiteTensor* projection_bias =
       GetOptionalInputTensor(context, node, kProjectionBiasTensor);
 
   TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
diff --git a/tensorflow/contrib/lite/kernels/maximum_minimum.cc b/tensorflow/contrib/lite/kernels/maximum_minimum.cc
index 5a28d663c9e756040746f0a98b356afba76cceab..8d676218bdcf71a7acadf62f213d35c6997f7575 100644
--- a/tensorflow/contrib/lite/kernels/maximum_minimum.cc
+++ b/tensorflow/contrib/lite/kernels/maximum_minimum.cc
@@ -41,8 +41,8 @@ struct OpContext {
     input2 = GetInput(context, node, kInputTensor2);
     output = GetOutput(context, node, kOutputTensor);
   }
-  TfLiteTensor* input1;
-  TfLiteTensor* input2;
+  const TfLiteTensor* input1;
+  const TfLiteTensor* input2;
   TfLiteTensor* output;
 };
 
diff --git a/tensorflow/contrib/lite/kernels/mean.cc b/tensorflow/contrib/lite/kernels/mean.cc
index 98f80e32d95b47cbe6267457f9508e2897e804d9..03e5db24de3f3c2d4e17df21bc0b592a02078d6b 100644
--- a/tensorflow/contrib/lite/kernels/mean.cc
+++ b/tensorflow/contrib/lite/kernels/mean.cc
@@ -40,8 +40,8 @@ struct MeanContext {
     output = GetOutput(context, node, 0);
   }
   TfLiteMeanParams* params;
-  TfLiteTensor* input;
-  TfLiteTensor* axis;
+  const TfLiteTensor* input;
+  const TfLiteTensor* axis;
   TfLiteTensor* output;
 };
 
diff --git a/tensorflow/contrib/lite/kernels/mfcc.cc b/tensorflow/contrib/lite/kernels/mfcc.cc
index 018db0dc54c5d281bf3fb3ff8a1f111b427fe76b..3f5bc4d68a57daa8423953f591ac139dc55eacb9 100644
--- a/tensorflow/contrib/lite/kernels/mfcc.cc
+++ b/tensorflow/contrib/lite/kernels/mfcc.cc
@@ -67,8 +67,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
-  TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
+  const TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
+  const TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   TF_LITE_ENSURE_EQ(context, NumDimensions(inputWav), 3);
@@ -94,8 +94,8 @@ template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteMfccParams*>(node->user_data);
 
-  TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
-  TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
+  const TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
+  const TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   const int32 sample_rate = *GetTensorData<int>(inputRate);
diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/contrib/lite/kernels/mul.cc
index 54575019de4c678ce25561cf2ac8dc80c9973363..62f4e94a386fbbc6987e8a6dc1a9a47ce3349cbb 100644
--- a/tensorflow/contrib/lite/kernels/mul.cc
+++ b/tensorflow/contrib/lite/kernels/mul.cc
@@ -57,8 +57,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
@@ -80,7 +80,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 template <KernelType kernel_type>
 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
                TfLiteMulParams* params, const OpData* data,
-               TfLiteTensor* input1, TfLiteTensor* input2,
+               const TfLiteTensor* input1, const TfLiteTensor* input2,
                TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
@@ -109,7 +109,7 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                    TfLiteMulParams* params, const OpData* data,
-                   TfLiteTensor* input1, TfLiteTensor* input2,
+                   const TfLiteTensor* input1, const TfLiteTensor* input2,
                    TfLiteTensor* output) {
   auto input1_offset = -input1->params.zero_point;
   auto input2_offset = -input2->params.zero_point;
@@ -149,8 +149,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
-  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
@@ -159,8 +159,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
                                output);
   } else {
-    context->ReportError(context,
-                         "Mul only supports FLOAT32 and quantized UINT8 now.");
+    context->ReportError(
+        context, "Mul only supports FLOAT32 and quantized UINT8 now, got %d.",
+        output->type);
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/neg.cc b/tensorflow/contrib/lite/kernels/neg.cc
index 692da817272958fdb4a789d04bf43ed4f79731b4..4124c05388cca180c2b417603e6d239f1f97b5bf 100644
--- a/tensorflow/contrib/lite/kernels/neg.cc
+++ b/tensorflow/contrib/lite/kernels/neg.cc
@@ -27,7 +27,7 @@ constexpr int kOutputTensor = 0;
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   output->type = input->type;
@@ -44,7 +44,7 @@ void Negate(const T* in_data, int num_elements, T* out_data) {
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   const int num_elements = NumElements(input);
   switch (input->type) {
@@ -59,7 +59,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       break;
     default:
       context->ReportError(
-          context, "Neg only currently supports int64, int32, and float32.",
+          context,
+          "Neg only currently supports int64, int32, and float32, got %d.",
           input->type);
       return kTfLiteError;
   }
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index 4f9449a225c66a0fb2a9285e6aff3a1f7147f5dd..83668cb4ca87e9eb53ab4ba9e88f91e3315594de 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -37,11 +37,17 @@ struct PadContext {
   PadContext(TfLiteContext* context, TfLiteNode* node) {
     input = GetInput(context, node, 0);
     paddings = GetInput(context, node, 1);
+    if (NumInputs(node) == 3) {
+      constant_values = GetOptionalInputTensor(context, node, 2);
+    } else {
+      constant_values = nullptr;
+    }
     output = GetOutput(context, node, 0);
     dims = NumDimensions(input);
   }
-  TfLiteTensor* input;
-  TfLiteTensor* paddings;
+  const TfLiteTensor* constant_values;
+  const TfLiteTensor* input;
+  const TfLiteTensor* paddings;
   TfLiteTensor* output;
   int dims;
 };
@@ -76,11 +82,15 @@ TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
 }
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE(context, NumInputs(node) == 2 || NumInputs(node) == 3);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
   PadContext op_context(context, node);
   TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+  if (op_context.constant_values != nullptr) {
+    TF_LITE_ENSURE_EQ(context, op_context.input->type,
+                      op_context.constant_values->type);
+  }
 
   // TODO(nupurgarg): Our current implementations rely on the inputs being 4D.
   TF_LITE_ENSURE_EQ(context, op_context.dims, 4);
@@ -98,6 +108,11 @@ template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   PadContext op_context(context, node);
 
+  if (op_context.constant_values != nullptr) {
+    // Ensure that constant_values is a scalar.
+    TF_LITE_ENSURE_EQ(context, NumElements(op_context.constant_values), 1);
+  }
+
   // Resize the output tensor if the output tensor is dynamic.
   if (IsDynamicTensor(op_context.output)) {
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
@@ -119,50 +134,74 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     after_padding.push_back(paddings_data[idx * 2 + 1]);
   }
 
-#define TF_LITE_PAD(type, scalar, pad_value)                                \
-  type::Pad(GetTensorData<scalar>(op_context.input),                        \
-            GetTensorDims(op_context.input), before_padding, after_padding, \
-            GetTensorData<scalar>(op_context.output),                       \
-            GetTensorDims(op_context.output), pad_value)
+#define TF_LITE_PAD(type, scalar, pad_value)                                  \
+  type::PadV2(GetTensorData<scalar>(op_context.input),                        \
+              GetTensorDims(op_context.input), before_padding, after_padding, \
+              GetTensorData<scalar>(op_context.output),                       \
+              GetTensorDims(op_context.output), pad_value)
 
   switch (op_context.input->type) {
-    case kTfLiteFloat32:
+    case kTfLiteFloat32: {
+      float pad_value = op_context.constant_values == nullptr
+                            ? 0.f
+                            : *GetTensorData<float>(op_context.constant_values);
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, float, 0);
+        TF_LITE_PAD(reference_ops, float, pad_value);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, float, 0);
+        TF_LITE_PAD(optimized_ops, float, pad_value);
+      }
+    } break;
+    case kTfLiteUInt8: {
+      uint8_t pad_value;
+      if (op_context.constant_values == nullptr) {
+        // Quantized Pad requires that 0 is represented in the quantized
+        // range.
+        TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
+                                    std::numeric_limits<uint8_t>::min());
+        TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
+                                    std::numeric_limits<uint8_t>::max());
+        pad_value = static_cast<uint8_t>(op_context.output->params.zero_point);
+      } else {
+        // Quantized Pad requires that 'constant_values' is represented in the
+        // same quantized range as the input and output tensors.
+        TF_LITE_ENSURE_EQ(context, op_context.output->params.zero_point,
+                          op_context.constant_values->params.zero_point);
+        TF_LITE_ENSURE_EQ(context, op_context.output->params.scale,
+                          op_context.constant_values->params.scale);
+        pad_value = *GetTensorData<uint8_t>(op_context.constant_values);
       }
-      break;
-    case kTfLiteUInt8:
-      // Quantized Pad requires that 0 is represented in the quantized range.
-      TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
-                                  std::numeric_limits<uint8_t>::min());
-      TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
-                                  std::numeric_limits<uint8_t>::max());
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, uint8_t,
-                    op_context.output->params.zero_point);
+        TF_LITE_PAD(reference_ops, uint8_t, pad_value);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, uint8_t,
-                    op_context.output->params.zero_point);
+        TF_LITE_PAD(optimized_ops, uint8_t, pad_value);
       }
-      break;
-    case kTfLiteInt32:
+    } break;
+    case kTfLiteInt32: {
+      int32_t pad_value =
+          op_context.constant_values == nullptr
+              ? 0
+              : *GetTensorData<int32_t>(op_context.constant_values);
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int32_t, 0);
+        TF_LITE_PAD(reference_ops, int32_t, pad_value);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int32_t, 0);
+        TF_LITE_PAD(optimized_ops, int32_t, pad_value);
       }
-      break;
-    case kTfLiteInt64:
+    } break;
+    case kTfLiteInt64: {
+      int64_t pad_value =
+          op_context.constant_values == nullptr
+              ? 0L
+              : *GetTensorData<int64_t>(op_context.constant_values);
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int64_t, 0);
+        TF_LITE_PAD(reference_ops, int64_t, pad_value);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int64_t, 0);
+        TF_LITE_PAD(optimized_ops, int64_t, pad_value);
       }
-      break;
+    } break;
     default:
-      context->ReportError(context, "Type is currently not supported by Pad.");
+      context->ReportError(context,
+                           "Type %d is currently not supported by Pad.",
+                           op_context.input->type);
       return kTfLiteError;
   }
 #undef TF_LITE_PAD
@@ -185,6 +224,21 @@ TfLiteRegistration* Register_PAD_GENERIC_OPT() {
 
 TfLiteRegistration* Register_PAD() { return Register_PAD_GENERIC_OPT(); }
 
+// Also register Pad as PadV2.
+TfLiteRegistration* Register_PADV2_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, pad::Prepare,
+                                 pad::Eval<pad::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_PADV2_GENERIC_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, pad::Prepare,
+                                 pad::Eval<pad::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_PADV2() { return Register_PADV2_GENERIC_OPT(); }
+
 }  // namespace builtin
 }  // namespace ops
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/contrib/lite/kernels/pad_test.cc
index c06237e5720874e66c5953edab2d3749cc88af28..f8b9064fbbdd25ef6a7aec0eefd8fd97572eed75 100644
--- a/tensorflow/contrib/lite/kernels/pad_test.cc
+++ b/tensorflow/contrib/lite/kernels/pad_test.cc
@@ -24,21 +24,26 @@ namespace {
 using ::testing::ElementsAreArray;
 using ::testing::Matcher;
 
+template <typename T>
 class PadOpModel : public SingleOpModel {
  public:
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
   }
 
   void SetQuantizedInput(std::initializer_list<float> data) {
     QuantizeAndPopulate<uint8_t>(input_, data);
   }
 
+  void SetQuantizedPadValue(float data) {
+    QuantizeAndPopulate<uint8_t>(constant_values_, {data});
+  }
+
   void SetPaddings(std::initializer_list<int> paddings) {
     PopulateTensor<int>(paddings_, paddings);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
   std::vector<float> GetDequantizedOutput() {
@@ -50,6 +55,59 @@ class PadOpModel : public SingleOpModel {
   int input_;
   int output_;
   int paddings_;
+  int constant_values_;
+};
+
+namespace {
+
+// Returns the corresponding TensorType given the type T.
+template <typename T>
+TensorType GetTensorType() {
+  if (std::is_same<T, float>::value) return TensorType_FLOAT32;
+  if (std::is_same<T, int32_t>::value) return TensorType_INT32;
+  if (std::is_same<T, uint8_t>::value) return TensorType_UINT8;
+  return TensorType_MIN;  // default value
+}
+
+}  // namespace
+
+// Tests case where paddings is a const tensor. Type T is the dtype.
+template <typename T>
+class PadV2OpConstModel : public PadOpModel<T> {
+ public:
+  PadV2OpConstModel(const TensorData& input,
+                    std::initializer_list<int> paddings_shape,
+                    std::initializer_list<int> paddings, T constant_values,
+                    const TensorData& output) {
+    this->input_ = this->AddInput(input);
+    this->paddings_ =
+        this->AddConstInput(TensorType_INT32, paddings, paddings_shape);
+    this->constant_values_ =
+        this->AddConstInput(GetTensorType<T>(), {constant_values}, {1});
+
+    this->output_ = this->AddOutput(output);
+
+    this->SetBuiltinOp(BuiltinOperator_PADV2, BuiltinOptions_PadV2Options,
+                       CreatePadV2Options(this->builder_).Union());
+    this->BuildInterpreter({input.shape});
+  }
+
+  PadV2OpConstModel(const TensorData& input,
+                    std::initializer_list<int> paddings_shape,
+                    std::initializer_list<int> paddings,
+                    const TensorData& constant_values,
+                    const TensorData& output) {
+    this->input_ = this->AddInput(input);
+    this->paddings_ =
+        this->AddConstInput(TensorType_INT32, paddings, paddings_shape);
+    this->constant_values_ = this->AddInput(constant_values);
+
+    this->output_ = this->AddOutput(output);
+
+    this->SetBuiltinOp(BuiltinOperator_PADV2, BuiltinOptions_PadV2Options,
+                       CreatePadV2Options(this->builder_).Union());
+    this->BuildInterpreter({input.shape});
+  }
 };
 
 // Tests case where paddings is a const tensor.
@@ -58,7 +116,7 @@ class PadOpModel : public SingleOpModel {
 //    PadOpDynamicModel m(input_shape, paddings_shape, paddings_data);
 //    m.SetInput(input_data);
 //    m.Invoke();
-class PadOpConstModel : public PadOpModel {
+class PadOpConstModel : public PadOpModel<float> {
  public:
   PadOpConstModel(const TensorData& input,
                   std::initializer_list<int> paddings_shape,
@@ -66,6 +124,7 @@ class PadOpConstModel : public PadOpModel {
                   const TensorData& output) {
     input_ = AddInput(input);
     paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
+    constant_values_ = AddNullInput();
     output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
@@ -74,6 +133,38 @@ class PadOpConstModel : public PadOpModel {
   }
 };
 
+// Test case where paddings is a non-const tensor.
+template <typename T>
+class PadV2OpDynamicModel : public PadOpModel<T> {
+ public:
+  PadV2OpDynamicModel(const TensorData& input,
+                      std::initializer_list<int> paddings_shape,
+                      T constant_values, const TensorData& output) {
+    this->input_ = this->AddInput(input);
+    this->paddings_ = this->AddInput(TensorType_INT32);
+    this->constant_values_ =
+        this->AddConstInput(GetTensorType<T>(), {constant_values}, {1});
+    this->output_ = this->AddOutput(output);
+
+    this->SetBuiltinOp(BuiltinOperator_PADV2, BuiltinOptions_PadV2Options,
+                       CreatePadV2Options(this->builder_).Union());
+    this->BuildInterpreter({input.shape, paddings_shape});
+  }
+  PadV2OpDynamicModel(const TensorData& input,
+                      std::initializer_list<int> paddings_shape,
+                      const TensorData& constant_values,
+                      const TensorData& output) {
+    this->input_ = this->AddInput(input);
+    this->paddings_ = this->AddInput(TensorType_INT32);
+    this->constant_values_ = this->AddInput(constant_values);
+    this->output_ = this->AddOutput(output);
+
+    this->SetBuiltinOp(BuiltinOperator_PADV2, BuiltinOptions_PadV2Options,
+                       CreatePadV2Options(this->builder_).Union());
+    this->BuildInterpreter({input.shape, paddings_shape});
+  }
+};
+
 // Test case where paddings is a non-const tensor.
 //
 // Example usage is as follows:
@@ -81,13 +172,14 @@ class PadOpConstModel : public PadOpModel {
 //    m.SetInput(input_data);
 //    m.SetPaddings(paddings_data);
 //    m.Invoke();
-class PadOpDynamicModel : public PadOpModel {
+class PadOpDynamicModel : public PadOpModel<float> {
  public:
   PadOpDynamicModel(const TensorData& input,
                     std::initializer_list<int> paddings_shape,
                     const TensorData& output) {
     input_ = AddInput(input);
     paddings_ = AddInput(TensorType_INT32);
+    constant_values_ = AddNullInput();
     output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
@@ -237,6 +329,272 @@ TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
+TEST(PadV2OpTest, TooManyDimensions) {
+  EXPECT_DEATH(PadV2OpConstModel<float>(
+                   {TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
+                   {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}, 0.0,
+                   {TensorType_FLOAT32}),
+               "dims != 4");
+}
+
+TEST(PadV2OpTest, UnequalDimensions) {
+  EXPECT_DEATH(
+      PadV2OpConstModel<float>({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2},
+                               {1, 1, 2, 2, 3, 3}, 0.0, {TensorType_FLOAT32}),
+      "3 != 4");
+}
+
+TEST(PadV2OpTest, InvalidPadValue) {
+  EXPECT_DEATH(PadV2OpConstModel<float>({TensorType_FLOAT32, {1, 1, 2, 1}},
+                                        {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}, 0.0,
+                                        {TensorType_FLOAT32}),
+               "Pad value has to be greater than equal to 0.");
+}
+
+TEST(PadV2OpTest, SimpleConstTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                             {0, 0, 1, 1, 1, 1, 0, 0}, 0.0,
+                             {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
+                                               0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadV2OpTest, SimpleConstFloat32ValuedTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                             {0, 0, 1, 1, 1, 1, 0, 0}, 5, {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4,
+                                               5, 5, 5, 5, 5}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadV2OpTest, Simple4DConstFloat32ValuedTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
+                             {0, 1, 0, 0, 0, 0, 0, 1}, 5, {TensorType_FLOAT32});
+  m.SetInput({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 5, 3, 5, 5, 5, 5, 5}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 2, 2}));
+}
+
+TEST(PadV2OpTest, SimpleConstInt32ValuedTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<int32_t> m({TensorType_INT32, {1, 2, 2, 1}}, {4, 2},
+                               {0, 0, 1, 1, 1, 1, 0, 0}, 5, {TensorType_INT32});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4,
+                                               5, 5, 5, 5, 5}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadV2OpTest, SimpleDynamicTest) {
+  PadV2OpDynamicModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2}, 0.0,
+                               {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4});
+  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
+                                               0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadV2OpTest, SimpleDynamicValuedTest) {
+  PadV2OpDynamicModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2}, 5,
+                               {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4});
+  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4,
+                                               5, 5, 5, 5, 5}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadV2OpTest, AdvancedConstTest) {
+  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                             {0, 0, 0, 2, 1, 3, 0, 0}, 0, {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST(PadV2OpTest, AdvancedDynamicTest) {
+  PadV2OpDynamicModel<float> m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2}, 0,
+                               {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+class QuantizedPadV2OpTest : public ::testing::Test {
+ protected:
+  std::vector<Matcher<float>> DequantizedArrayNear(
+      const std::vector<float>& values, const float min, const float max) {
+    const float quantization_tolerance = (max - min) / 255.0;
+    return ArrayFloatNear(values, quantization_tolerance);
+  }
+};
+
+TEST_F(QuantizedPadV2OpTest, ZeroNotInQuantizationRange) {
+  // The test_util and actual quantization code currently ensure that the range
+  // must include zero, but if that ever changes, this test will catch it.
+  EXPECT_DEATH(
+      PadV2OpConstModel<float> m({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0},
+                                 {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0}, 0,
+                                 {TensorType_UINT8, {}, 1.0, 2.0}),
+      ".*Check failed: f_min <= 0.*");
+}
+
+TEST_F(QuantizedPadV2OpTest, SimpleConstTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                               {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
+                               {TensorType_UINT8, {1}, -1.0, 1.0},
+                               {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.SetQuantizedPadValue(0);
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadV2OpTest, SimpleDynamicTest) {
+  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
+                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.SetQuantizedPadValue(0);
+  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadV2OpTest, AdvancedConstTest) {
+  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
+                               {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0},
+                               {TensorType_UINT8, {1}, -1.0, 1.0},
+                               {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.SetQuantizedPadValue(0);
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST_F(QuantizedPadV2OpTest, AdvancedDynamicTest) {
+  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
+                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
+                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.SetQuantizedPadValue(0);
+  m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST_F(QuantizedPadV2OpTest, SimpleConstValuedTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                               {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
+                               {TensorType_UINT8, {1}, -1.0, 1.0},
+                               {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.SetQuantizedPadValue(-0.5);
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {-0.5, -0.5, -0.5, -0.5, -0.5, -0.8, 0.2, -0.5, -0.5, 0.9,
+                   0.7, -0.5, -0.5, -0.5, -0.5, -0.5},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadV2OpTest, SimpleDynamicValuedTest) {
+  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
+                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.SetQuantizedPadValue(-0.5);
+  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {-0.5, -0.5, -0.5, -0.5, -0.5, -0.8, 0.2, -0.5, -0.5, 0.9,
+                   0.7, -0.5, -0.5, -0.5, -0.5, -0.5},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadV2OpTest, AdvancedConstValuedTest) {
+  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
+                               {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0},
+                               {TensorType_UINT8, {1}, -1.0, 1.0},
+                               {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.SetQuantizedPadValue(-0.5);
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {-0.5, -0.8, 0.2,  0.9,  -0.5, -0.5, -0.5, -0.5, 0.7,  0.1,
+                   -0.3, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5,
+                   -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST_F(QuantizedPadV2OpTest, AdvancedDynamicValuedTest) {
+  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
+                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
+                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.SetQuantizedPadValue(-0.5);
+  m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {-0.5, -0.8, 0.2,  0.9,  -0.5, -0.5, -0.5, -0.5, 0.7,  0.1,
+                   -0.3, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5,
+                   -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/padding.h b/tensorflow/contrib/lite/kernels/padding.h
index e81b970e0fb149e8c5d95ed12622917fdc336f7a..3cb55f19a99f3e54c2ef14b8b890b286ad25f3d1 100644
--- a/tensorflow/contrib/lite/kernels/padding.h
+++ b/tensorflow/contrib/lite/kernels/padding.h
@@ -15,6 +15,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
 #define TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
 
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+
 namespace tflite {
 
 inline int ComputePadding(int stride, int dilation_rate, int in_size,
@@ -24,6 +26,33 @@ inline int ComputePadding(int stride, int dilation_rate, int in_size,
   return padding > 0 ? padding : 0;
 }
 
+// Matching GetWindowedOutputSize in TensorFlow.
+inline int ComputeOutSize(TfLitePadding padding, int image_size,
+                          int filter_size, int stride) {
+  switch (padding) {
+    case kTfLitePaddingSame:
+      return (image_size + stride - 1) / stride;
+    case kTfLitePaddingValid:
+      return (image_size + stride - filter_size) / stride;
+    default:
+      return 0;
+  }
+}
+
+inline TfLitePaddingValues ComputePaddingHeightWidth(
+    int stride_height, int stride_width, int dilation_rate, int in_height,
+    int in_width, int filter_height, int filter_width, TfLitePadding padding) {
+  int out_width = ComputeOutSize(padding, in_width, filter_width, stride_width);
+  int out_height =
+      ComputeOutSize(padding, in_height, filter_height, stride_height);
+
+  TfLitePaddingValues padding_values;
+  padding_values.height =
+      ComputePadding(stride_height, 1, in_height, filter_height, out_height);
+  padding_values.width =
+      ComputePadding(stride_width, 1, in_width, filter_width, out_width);
+  return padding_values;
+}
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
diff --git a/tensorflow/contrib/lite/kernels/pooling.cc b/tensorflow/contrib/lite/kernels/pooling.cc
index 0bf27c34c1337b4ae4b8b73ee2dafcc931c7ce3c..311e9b8399726d758182e1f084a890d6f10e57ce 100644
--- a/tensorflow/contrib/lite/kernels/pooling.cc
+++ b/tensorflow/contrib/lite/kernels/pooling.cc
@@ -69,7 +69,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
   TfLiteTensor* output = GetOutput(context, node, 0);
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
@@ -122,7 +122,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
 template <KernelType kernel_type>
 void AverageEvalFloat(TfLiteContext* context, TfLiteNode* node,
                       TfLitePoolParams* params, OpData* data,
-                      TfLiteTensor* input, TfLiteTensor* output) {
+                      const TfLiteTensor* input, TfLiteTensor* output) {
   float activation_min, activation_max;
   CalculateActivationRangeFloat(params->activation, &activation_min,
                                 &activation_max);
@@ -143,7 +143,7 @@ void AverageEvalFloat(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 void AverageEvalQuantized(TfLiteContext* context, TfLiteNode* node,
                           TfLitePoolParams* params, OpData* data,
-                          TfLiteTensor* input, TfLiteTensor* output) {
+                          const TfLiteTensor* input, TfLiteTensor* output) {
   int32_t activation_min;
   int32_t activation_max;
   CalculateActivationRangeUint8(params->activation, output, &activation_min,
@@ -165,8 +165,8 @@ void AverageEvalQuantized(TfLiteContext* context, TfLiteNode* node,
 
 template <KernelType kernel_type>
 void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
-                  TfLitePoolParams* params, OpData* data, TfLiteTensor* input,
-                  TfLiteTensor* output) {
+                  TfLitePoolParams* params, OpData* data,
+                  const TfLiteTensor* input, TfLiteTensor* output) {
   float activation_min, activation_max;
   CalculateActivationRangeFloat(params->activation, &activation_min,
                                 &activation_max);
@@ -187,7 +187,7 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
                       TfLitePoolParams* params, OpData* data,
-                      TfLiteTensor* input, TfLiteTensor* output) {
+                      const TfLiteTensor* input, TfLiteTensor* output) {
   int32_t activation_min;
   int32_t activation_max;
   CalculateActivationRangeUint8(params->activation, output, &activation_min,
@@ -209,8 +209,8 @@ void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
 
 template <KernelType kernel_type>
 void L2EvalFloat(TfLiteContext* context, TfLiteNode* node,
-                 TfLitePoolParams* params, OpData* data, TfLiteTensor* input,
-                 TfLiteTensor* output) {
+                 TfLitePoolParams* params, OpData* data,
+                 const TfLiteTensor* input, TfLiteTensor* output) {
   float activation_min, activation_max;
   CalculateActivationRangeFloat(params->activation, &activation_min,
                                 &activation_max);
@@ -236,7 +236,7 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* output = GetOutput(context, node, 0);
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   switch (input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
       AverageEvalFloat<kernel_type>(context, node, params, data, input, output);
@@ -246,7 +246,8 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
                                         output);
       break;
     default:
-      context->ReportError(context, "Type not currently supported.");
+      context->ReportError(context, "Type %d not currently supported.",
+                           input->type);
       return kTfLiteError;
   }
   return kTfLiteOk;
@@ -258,7 +259,7 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* output = GetOutput(context, node, 0);
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   switch (input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
       MaxEvalFloat<kernel_type>(context, node, params, data, input, output);
@@ -267,7 +268,8 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
       MaxEvalQuantized<kernel_type>(context, node, params, data, input, output);
       break;
     default:
-      context->ReportError(context, "Type not currently supported.");
+      context->ReportError(context, "Type %d not currently supported.",
+                           input->type);
       return kTfLiteError;
   }
   return kTfLiteOk;
@@ -279,7 +281,7 @@ TfLiteStatus L2Eval(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* output = GetOutput(context, node, 0);
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   switch (input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
       L2EvalFloat<kernel_type>(context, node, params, data, input, output);
@@ -288,7 +290,8 @@ TfLiteStatus L2Eval(TfLiteContext* context, TfLiteNode* node) {
     // We don't have a quantized implementation, so just fall through to the
     // 'default' case.
     default:
-      context->ReportError(context, "Type not currently supported.");
+      context->ReportError(context, "Type %d not currently supported.",
+                           input->type);
       return kTfLiteError;
   }
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 2df4498c38e2782abfa6b99702d9c1e7d949bb3e..66069996049b1a415a6397f7321970eb90d0c050 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -60,6 +60,7 @@ TfLiteRegistration* Register_LSTM();
 TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_LSTM();
 TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_LSTM();
 TfLiteRegistration* Register_PAD();
+TfLiteRegistration* Register_PADV2();
 TfLiteRegistration* Register_RESHAPE();
 TfLiteRegistration* Register_RESIZE_BILINEAR();
 TfLiteRegistration* Register_SKIP_GRAM();
@@ -79,9 +80,16 @@ TfLiteRegistration* Register_PRELU();
 TfLiteRegistration* Register_MAXIMUM();
 TfLiteRegistration* Register_MINIMUM();
 TfLiteRegistration* Register_ARG_MAX();
+TfLiteRegistration* Register_GREATER();
+TfLiteRegistration* Register_GREATER_EQUAL();
 TfLiteRegistration* Register_LESS();
+TfLiteRegistration* Register_LESS_EQUAL();
 TfLiteRegistration* Register_FLOOR();
 TfLiteRegistration* Register_NEG();
+TfLiteRegistration* Register_SELECT();
+TfLiteRegistration* Register_SLICE();
+TfLiteRegistration* Register_SIN();
+TfLiteRegistration* Register_TRANSPOSE_CONV();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -121,6 +129,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
              Register_UNIDIRECTIONAL_SEQUENCE_LSTM());
   AddBuiltin(BuiltinOperator_PAD, Register_PAD());
+  AddBuiltin(BuiltinOperator_PADV2, Register_PADV2());
   AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
   AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR());
   AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
@@ -142,9 +151,16 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
   AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
   AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
+  AddBuiltin(BuiltinOperator_GREATER, Register_GREATER());
+  AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL());
   AddBuiltin(BuiltinOperator_LESS, Register_LESS());
+  AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL());
   AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR());
   AddBuiltin(BuiltinOperator_NEG, Register_NEG());
+  AddBuiltin(BuiltinOperator_SELECT, Register_SELECT());
+  AddBuiltin(BuiltinOperator_SLICE, Register_SLICE());
+  AddBuiltin(BuiltinOperator_SIN, Register_SIN());
+  AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV());
 
 #if 0
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
@@ -155,29 +171,6 @@ BuiltinOpResolver::BuiltinOpResolver() {
 #endif
 }
 
-TfLiteRegistration* BuiltinOpResolver::FindOp(
-    tflite::BuiltinOperator op) const {
-  auto it = builtins_.find(op);
-  return it != builtins_.end() ? it->second : nullptr;
-}
-
-TfLiteRegistration* BuiltinOpResolver::FindOp(const char* op) const {
-  auto it = custom_ops_.find(op);
-  return it != custom_ops_.end() ? it->second : nullptr;
-}
-
-void BuiltinOpResolver::AddBuiltin(tflite::BuiltinOperator op,
-                                   TfLiteRegistration* registration) {
-  registration->builtin_code = op;
-  builtins_.insert(std::make_pair(op, registration));
-}
-
-void BuiltinOpResolver::AddCustom(const char* name,
-                                  TfLiteRegistration* registration) {
-  registration->builtin_code = BuiltinOperator_CUSTOM;
-  custom_ops_.insert(std::make_pair(std::string(name), registration));
-}
-
 }  // namespace builtin
 }  // namespace ops
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/register.h b/tensorflow/contrib/lite/kernels/register.h
index b9cff0ae21086b44e0c920095d5f6c9668346f38..b928f1b302580d52f708bbf85dfcfc0f79ff1e69 100644
--- a/tensorflow/contrib/lite/kernels/register.h
+++ b/tensorflow/contrib/lite/kernels/register.h
@@ -23,24 +23,9 @@ namespace tflite {
 namespace ops {
 namespace builtin {
 
-class BuiltinOpResolver : public OpResolver {
+class BuiltinOpResolver : public MutableOpResolver {
  public:
   BuiltinOpResolver();
-  TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override;
-  TfLiteRegistration* FindOp(const char* op) const override;
-  void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration);
-  void AddCustom(const char* name, TfLiteRegistration* registration);
-
- private:
-  struct BuiltinOperatorHasher {
-    size_t operator()(const tflite::BuiltinOperator& x) const {
-      return std::hash<size_t>()(static_cast<size_t>(x));
-    }
-  };
-  std::unordered_map<tflite::BuiltinOperator, TfLiteRegistration*,
-                     BuiltinOperatorHasher>
-      builtins_;
-  std::unordered_map<std::string, TfLiteRegistration*> custom_ops_;
 };
 
 }  // namespace builtin
diff --git a/tensorflow/contrib/lite/kernels/reshape.cc b/tensorflow/contrib/lite/kernels/reshape.cc
index 438f70d3115130efe477a3ceeccd2e77108c979a..3287040695140e3e7921c9f517450b9416b050b6 100644
--- a/tensorflow/contrib/lite/kernels/reshape.cc
+++ b/tensorflow/contrib/lite/kernels/reshape.cc
@@ -35,7 +35,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   // Tensorflow's Reshape allows one of the shape components to have the
@@ -70,7 +70,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   memcpy(output->data.raw, input->data.raw, input->bytes);
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
index 9e3e19c09a4012ebdadbc2a7c2ba06c4bfefd206..f2092eaa36db32ebbc959ac23365bb13dd034e68 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
@@ -36,8 +36,10 @@ constexpr int kInputTensor = 0;
 constexpr int kSizeTensor = 1;
 constexpr int kOutputTensor = 0;
 
-TfLiteStatus ResizeOutputTensor(TfLiteContext* context, TfLiteTensor* input,
-                                TfLiteTensor* size, TfLiteTensor* output) {
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                const TfLiteTensor* input,
+                                const TfLiteTensor* size,
+                                TfLiteTensor* output) {
   TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
   output_size->data[0] = input->dims->data[0];
   const int32* size_data = GetTensorData<int32>(size);
@@ -51,8 +53,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* size = GetInput(context, node, kSizeTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* size = GetInput(context, node, kSizeTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   // TODO(ahentz): Our current implementations rely on the inputs being 4D.
@@ -78,9 +80,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params =
       reinterpret_cast<TfLiteResizeBilinearParams*>(node->builtin_data);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TfLiteTensor* size = GetInput(context, node, kSizeTensor);
+  const TfLiteTensor* size = GetInput(context, node, kSizeTensor);
 
   if (IsDynamicTensor(output)) {
     TF_LITE_ENSURE_OK(context,
@@ -102,7 +104,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     }
 #undef TF_LITE_RESIZE_BILINEAR
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context, "Output type is %d, requires float.",
+                         output->type);
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/select.cc b/tensorflow/contrib/lite/kernels/select.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9b6cee3cb55bf93b987fa8e59bdf9c591f5c0372
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/select.cc
@@ -0,0 +1,127 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace select {
+
+constexpr int kInputTensorCondition = 0;
+constexpr int kInputTensorX = 1;
+constexpr int kInputTensorY = 2;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus SelectPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input_condition =
+      GetInput(context, node, kInputTensorCondition);
+  const TfLiteTensor* input_x = GetInput(context, node, kInputTensorX);
+  const TfLiteTensor* input_y = GetInput(context, node, kInputTensorY);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // Input must be bool.
+  TF_LITE_ENSURE(context, input_condition->type == kTfLiteBool);
+
+  // Input tensors must have the same type and size
+  TF_LITE_ENSURE_EQ(context, input_x->type, input_y->type);
+  TF_LITE_ENSURE(context, HaveSameShapes(input_x, input_y));
+  output->type = input_x->type;
+
+  // Either the same shape, or input_condition must be Rank 1 and match over the
+  // first dimension.
+  bool same_shape = HaveSameShapes(input_condition, input_x);
+  if (!same_shape && NumDimensions(input_condition) == 1) {
+    same_shape =
+        SizeOfDimension(input_condition, 0) == SizeOfDimension(input_x, 0);
+  }
+
+  TF_LITE_ENSURE(context, same_shape);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_x->dims);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus SelectEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input_condition =
+      GetInput(context, node, kInputTensorCondition);
+  const TfLiteTensor* input_x = GetInput(context, node, kInputTensorX);
+  const TfLiteTensor* input_y = GetInput(context, node, kInputTensorY);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  bool is_rank_one = !HaveSameShapes(input_condition, input_x);
+
+#define TF_LITE_SELECT(type, op)                                          \
+  reference_ops::op(GetTensorData<bool>(input_condition),                 \
+                    GetTensorDims(input_condition),                       \
+                    GetTensorData<type>(input_x), GetTensorDims(input_x), \
+                    GetTensorData<type>(input_y), GetTensorDims(input_y), \
+                    GetTensorData<type>(output), GetTensorDims(output));
+
+#define TF_LITE_SWITCH(type, op)                                               \
+  switch (type) {                                                              \
+    break;                                                                     \
+    case kTfLiteBool:                                                          \
+      TF_LITE_SELECT(bool, op);                                                \
+      break;                                                                   \
+    case kTfLiteFloat32:                                                       \
+      TF_LITE_SELECT(float, op);                                               \
+      break;                                                                   \
+    case kTfLiteUInt8:                                                         \
+      TF_LITE_SELECT(uint8_t, op);                                             \
+      break;                                                                   \
+    case kTfLiteInt32:                                                         \
+      TF_LITE_SELECT(int32_t, op);                                             \
+      break;                                                                   \
+    case kTfLiteInt64:                                                         \
+      TF_LITE_SELECT(int64_t, op);                                             \
+      break;                                                                   \
+    default:                                                                   \
+      context->ReportError(context,                                            \
+                           "Does not support type other than bool|float|int, " \
+                           "got %d",                                           \
+                           type);                                              \
+      return kTfLiteError;                                                     \
+  }
+
+  if (is_rank_one) {
+    TF_LITE_SWITCH(input_x->type, RankOneSelect);
+  } else {
+    TF_LITE_SWITCH(input_x->type, Select);
+  }
+
+#undef TF_LITE_SELECT
+#undef TF_LITE_SWITCH
+  return kTfLiteOk;
+}
+
+}  // namespace select
+
+TfLiteRegistration* Register_SELECT() {
+  static TfLiteRegistration r = {nullptr, nullptr, select::SelectPrepare,
+                                 select::SelectEval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/select_test.cc b/tensorflow/contrib/lite/kernels/select_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfe24a5fc92765747d1c75bc3e6964b959e2205d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/select_test.cc
@@ -0,0 +1,143 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class SelectOpModel : public SingleOpModel {
+ public:
+  SelectOpModel(std::initializer_list<int> input1_shape,
+                std::initializer_list<int> input2_shape,
+                std::initializer_list<int> input3_shape,
+                TensorType input_type) {
+    input1_ = AddInput(TensorType_BOOL);
+    input2_ = AddInput(input_type);
+    input3_ = AddInput(input_type);
+    output_ = AddOutput(input_type);
+    SetBuiltinOp(BuiltinOperator_SELECT, BuiltinOptions_SelectOptions,
+                 CreateSelectOptions(builder_).Union());
+    BuildInterpreter({input1_shape, input2_shape, input3_shape});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+  int input3() { return input3_; }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input1_;
+  int input2_;
+  int input3_;
+  int output_;
+};
+
+TEST(SelectOpTest, SelectBool) {
+  SelectOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, {1, 1, 1, 4},
+                      TensorType_BOOL);
+
+  model.PopulateTensor<bool>(model.input1(), {true, false, true, false});
+  model.PopulateTensor<bool>(model.input2(), {false, false, false, false});
+  model.PopulateTensor<bool>(model.input3(), {true, true, true, true});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput<bool>(),
+              ElementsAreArray({false, true, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(SelectOpTest, SelectFloat) {
+  SelectOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, {1, 1, 1, 4},
+                      TensorType_FLOAT32);
+
+  model.PopulateTensor<bool>(model.input1(), {true, false, true, false});
+  model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.3, 0.4});
+  model.PopulateTensor<float>(model.input3(), {0.5, 0.6, 0.7, 0.8});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput<float>(), ElementsAreArray({0.1, 0.6, 0.3, 0.8}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(SelectOpTest, SelectUInt8) {
+  SelectOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, {1, 1, 1, 4},
+                      TensorType_UINT8);
+
+  model.PopulateTensor<bool>(model.input1(), {false, true, false, false});
+  model.PopulateTensor<uint8>(model.input2(), {1, 2, 3, 4});
+  model.PopulateTensor<uint8>(model.input3(), {5, 6, 7, 8});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput<uint8>(), ElementsAreArray({5, 2, 7, 8}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(SelectOpTest, SelectInt32) {
+  SelectOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, {1, 1, 1, 4},
+                      TensorType_INT32);
+
+  model.PopulateTensor<bool>(model.input1(), {false, true, false, false});
+  model.PopulateTensor<int32>(model.input2(), {1, 2, 3, 4});
+  model.PopulateTensor<int32>(model.input3(), {5, 6, 7, 8});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput<int32>(), ElementsAreArray({5, 2, 7, 8}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(SelectOpTest, RankOneSelectInt32) {
+  SelectOpModel model({2}, {2, 1, 2, 1}, {2, 1, 2, 1}, TensorType_INT32);
+
+  model.PopulateTensor<bool>(model.input1(), {false, true});
+  model.PopulateTensor<int32>(model.input2(), {1, 2, 3, 4});
+  model.PopulateTensor<int32>(model.input3(), {5, 6, 7, 8});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput<int32>(), ElementsAreArray({5, 6, 3, 4}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 2, 1}));
+}
+
+TEST(SelectOpTest, RankZeroSelectInt32) {
+  SelectOpModel model({1}, {1, 2, 2, 1}, {1, 2, 2, 1}, TensorType_INT32);
+
+  model.PopulateTensor<bool>(model.input1(), {false});
+  model.PopulateTensor<int32>(model.input2(), {1, 2, 3, 4});
+  model.PopulateTensor<int32>(model.input3(), {5, 6, 7, 8});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput<int32>(), ElementsAreArray({5, 6, 7, 8}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 2, 2, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/slice.cc b/tensorflow/contrib/lite/kernels/slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6a20e802a99cdf23a005a8cd9f1fd97b03c8070a
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/slice.cc
@@ -0,0 +1,201 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string.h>
+#include <cmath>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace slice {
+
+constexpr int kInputTensor = 0;
+constexpr int kBeginTensor = 1;
+constexpr int kSizeTensor = 2;
+constexpr int kOutputTensor = 0;
+
+// This Op only supports 1-4D cases and since we use the optimized ops 4D
+// implementation, the 1-3D tensors are mapped to 4D.
+const int kMaxDim = 4;
+
+template <typename T>
+TfLiteStatus CalculateOutputShapeVector(
+    TfLiteContext* context, const TfLiteTensor* input,
+    const TfLiteTensor* begin, const TfLiteTensor* size,
+    std::vector<int64_t>* output_shape_vector) {
+  for (int idx = 0; idx < NumDimensions(input); ++idx) {
+    T size_value = GetTensorData<T>(size)[idx];
+    if (size_value < 0) {
+      if (size_value != -1) {
+        context->ReportError(context, "Invalid size.");
+        return kTfLiteError;
+      }
+      size_value = SizeOfDimension(input, idx) - GetTensorData<T>(begin)[idx];
+    } else {
+      if (SizeOfDimension(input, idx) <
+          GetTensorData<T>(begin)[idx] + size_value) {
+        context->ReportError(context, "Invalid begin and size.");
+        return kTfLiteError;
+      }
+    }
+    output_shape_vector->push_back(size_value);
+  }
+  return kTfLiteOk;
+}
+
+template <typename T>
+void GetBeginAndSizeVectors(int dimensions, const TfLiteTensor* begin,
+                            const TfLiteTensor* size, std::vector<int>* begins,
+                            std::vector<int>* sizes) {
+  for (int idx = dimensions - 1; idx >= 0; --idx) {
+    begins->push_back(GetTensorData<T>(begin)[idx]);
+    sizes->push_back(GetTensorData<T>(size)[idx]);
+  }
+}
+
+TfLiteStatus ResizeOutputShape(TfLiteContext* context,
+                               const TfLiteTensor* input,
+                               const TfLiteTensor* begin,
+                               const TfLiteTensor* size, TfLiteTensor* output) {
+  std::vector<int64_t> output_shape_vector;
+
+  if (begin->type == kTfLiteInt32) {
+    TF_LITE_ENSURE_STATUS(CalculateOutputShapeVector<int32_t>(
+        context, input, begin, size, &output_shape_vector));
+  } else if (begin->type == kTfLiteInt64) {
+    TF_LITE_ENSURE_STATUS(CalculateOutputShapeVector<int64_t>(
+        context, input, begin, size, &output_shape_vector));
+  } else {
+    context->ReportError(
+        context, "Type %d is currently not supported by Slice.", begin->type);
+    return kTfLiteError;
+  }
+
+  TfLiteIntArray* output_shape =
+      TfLiteIntArrayCreate(output_shape_vector.size());
+  std::copy(output_shape_vector.begin(), output_shape_vector.end(),
+            output_shape->data);
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* begin = GetInput(context, node, kBeginTensor);
+  const TfLiteTensor* size = GetInput(context, node, kSizeTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // Ensure validity of input tensor and its dimension.
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  TF_LITE_ENSURE(context,
+                 begin->type == kTfLiteInt32 || begin->type == kTfLiteInt64);
+  TF_LITE_ENSURE(context,
+                 size->type == kTfLiteInt32 || size->type == kTfLiteInt64);
+  TF_LITE_ENSURE(context, NumDimensions(begin) == NumDimensions(size) == 1);
+  TF_LITE_ENSURE_MSG(context, NumDimensions(input) <= kMaxDim,
+                     "Slice op only supports 1D-4D input arrays.");
+
+  // Postpone allocation of output if any of the indexing tensors is not
+  // constant
+  if (!(IsConstantTensor(begin) && IsConstantTensor(size))) {
+    SetTensorToDynamic(output);
+    return kTfLiteOk;
+  }
+
+  return ResizeOutputShape(context, input, begin, size, output);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* begin = GetInput(context, node, kBeginTensor);
+  const TfLiteTensor* size = GetInput(context, node, kSizeTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeOutputShape(context, input, begin, size, output));
+  }
+
+  std::vector<int> begins;
+  begins.reserve(kMaxDim);
+  std::vector<int> sizes;
+  sizes.reserve(kMaxDim);
+
+  if (begin->type == kTfLiteInt32) {
+    GetBeginAndSizeVectors<int32_t>(NumDimensions(input), begin, size, &begins,
+                                    &sizes);
+  } else if (begin->type == kTfLiteInt64) {
+    GetBeginAndSizeVectors<int64_t>(NumDimensions(input), begin, size, &begins,
+                                    &sizes);
+  } else {
+    context->ReportError(
+        context, "Type %d is currently not supported by Slice.", begin->type);
+    return kTfLiteError;
+  }
+
+  for (int i = NumDimensions(input); i < kMaxDim; ++i) {
+    begins.push_back(0);
+    sizes.push_back(1);
+  }
+
+#define TF_LITE_SLICE(data_type)                                            \
+  optimized_ops::Slice<data_type>(                                          \
+      GetTensorData<data_type>(input), GetTensorDims(input), begins, sizes, \
+      GetTensorData<data_type>(output), GetTensorDims(output))
+
+  switch (input->type) {
+    case kTfLiteFloat32:
+      TF_LITE_SLICE(float);
+      break;
+    case kTfLiteInt32:
+      TF_LITE_SLICE(int32_t);
+      break;
+    case kTfLiteInt64:
+      TF_LITE_SLICE(int64_t);
+      break;
+    case kTfLiteUInt8:
+      TF_LITE_SLICE(uint8_t);
+      break;
+    case kTfLiteBool:
+      TF_LITE_SLICE(bool);
+      break;
+    default:
+      context->ReportError(
+          context, "Type %d is currently not supported by Slice.", input->type);
+      return kTfLiteError;
+  }
+#undef TF_LITE_SLICE
+  return kTfLiteOk;
+}
+
+}  // namespace slice
+
+TfLiteRegistration* Register_SLICE() {
+  static TfLiteRegistration r = {nullptr, nullptr, slice::Prepare, slice::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/slice_test.cc b/tensorflow/contrib/lite/kernels/slice_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4828f88f36bc1e7daf84ab6831a2ccc98bfaed40
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/slice_test.cc
@@ -0,0 +1,173 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename input_type, typename index_type>
+class SliceOpModel : public SingleOpModel {
+ public:
+  SliceOpModel(std::initializer_list<int> input_shape,
+               std::initializer_list<int> begin_shape,
+               std::initializer_list<int> size_shape,
+               TensorType tensor_index_type, TensorType tensor_input_type) {
+    input_ = AddInput(tensor_input_type);
+    begin_ = AddInput(tensor_index_type);
+    size_ = AddInput(tensor_index_type);
+    output_ = AddOutput(tensor_input_type);
+    SetBuiltinOp(BuiltinOperator_SLICE, BuiltinOptions_SliceOptions,
+                 CreateSliceOptions(builder_).Union());
+    BuildInterpreter({input_shape, begin_shape, size_shape});
+  }
+
+  void SetInput(std::initializer_list<input_type> data) {
+    PopulateTensor<input_type>(input_, data);
+  }
+  void SetBegin(std::initializer_list<index_type> data) {
+    PopulateTensor<index_type>(begin_, data);
+  }
+  void SetSize(std::initializer_list<index_type> data) {
+    PopulateTensor<index_type>(size_, data);
+  }
+
+  std::vector<input_type> GetOutput() {
+    return ExtractVector<input_type>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int begin_;
+  int size_;
+  int output_;
+};
+
+TEST(SliceOpTest, In1D) {
+  SliceOpModel<float, int32_t> m({4}, {1}, {1}, TensorType_INT32,
+                                 TensorType_FLOAT32);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetSize({2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3}));
+}
+
+TEST(SliceOpTest, In2D) {
+  SliceOpModel<float, int32_t> m({2, 3}, {2}, {2}, TensorType_INT32,
+                                 TensorType_FLOAT32);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, 0});
+  m.SetSize({1, 2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5}));
+}
+
+TEST(SliceOpTest, In3D) {
+  SliceOpModel<float, int32_t> m({2, 3, 2}, {3}, {4}, TensorType_INT32,
+                                 TensorType_FLOAT32);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetSize({2, 3, 2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3, 2}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+}
+
+TEST(SliceOpTest, InputFloat) {
+  SliceOpModel<float, int32_t> m({4, 1, 1, 1}, {4}, {4}, TensorType_INT32,
+                                 TensorType_FLOAT32);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({3, 1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 1, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 4}));
+}
+
+TEST(SliceOpTest, IndexInt64) {
+  SliceOpModel<float, int64_t> m({4, 1, 1, 1}, {4}, {4}, TensorType_INT64,
+                                 TensorType_FLOAT32);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({3, 1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 1, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 4}));
+}
+
+// See these test cases under:
+// https://www.tensorflow.org/versions/master/api_docs/python/tf/slice
+TEST(SliceOpTest, InputInteger1) {
+  SliceOpModel<int32_t, int32_t> m({3, 2, 3, 1}, {4}, {4}, TensorType_INT32,
+                                   TensorType_INT32);
+  m.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({1, 1, 3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3}));
+}
+
+TEST(SliceOpTest, InputInteger2) {
+  SliceOpModel<int32_t, int32_t> m({3, 2, 3, 1}, {4}, {4}, TensorType_INT32,
+                                   TensorType_INT32);
+  m.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({1, 2, 3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 4, 4, 4}));
+}
+
+TEST(SliceOpTest, InputInteger3) {
+  SliceOpModel<int32_t, int32_t> m({3, 2, 3, 1}, {4}, {4}, TensorType_INT32,
+                                   TensorType_INT32);
+  m.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({2, 1, 3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
+}
+
+TEST(SliceOpTest, SizeMinus1) {
+  SliceOpModel<int32_t, int32_t> m({3, 2, 3, 1}, {4}, {4}, TensorType_INT32,
+                                   TensorType_INT32);
+  m.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({2, 1, -1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
index d8c9e352f00627eee45ae836b720f2af77140538..c9269599e58f095ded4788e2ab064583ae0a708c 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
@@ -40,9 +40,9 @@ struct SpaceToBatchNDContext {
     paddings = GetInput(context, node, 2);
     output = GetOutput(context, node, 0);
   }
-  TfLiteTensor* input;
-  TfLiteTensor* block_shape;
-  TfLiteTensor* paddings;
+  const TfLiteTensor* input;
+  const TfLiteTensor* block_shape;
+  const TfLiteTensor* paddings;
   TfLiteTensor* output;
 };
 
@@ -152,8 +152,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       }
       break;
     default:
-      context->ReportError(context,
-                           "Type is currently not supported by SpaceToBatch.");
+      context->ReportError(
+          context, "Type %d is currently not supported by SpaceToBatch.",
+          op_context.input->type);
       return kTfLiteError;
   }
 #undef TF_LITE_SPACE_TO_BATCH_ND
diff --git a/tensorflow/contrib/lite/kernels/space_to_depth.cc b/tensorflow/contrib/lite/kernels/space_to_depth.cc
index cb2e509c9811b1469c4d3f676532edff570a6c4a..9dbe9b9edaccc3ea75f1997378aba5a218ee3030 100644
--- a/tensorflow/contrib/lite/kernels/space_to_depth.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_depth.cc
@@ -42,7 +42,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
@@ -76,7 +76,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params =
       reinterpret_cast<TfLiteSpaceToDepthParams*>(node->builtin_data);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
 #define TF_LITE_SPACE_TO_DEPTH(type, scalar)                                  \
@@ -113,7 +113,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       }
       break;
     default:
-      context->ReportError(context, "Type not currently supported.");
+      context->ReportError(context, "Type %d not currently supported.",
+                           input->type);
       return kTfLiteError;
   }
 #undef TF_LITE_SPACE_TO_DEPTH
diff --git a/tensorflow/contrib/lite/kernels/split.cc b/tensorflow/contrib/lite/kernels/split.cc
index b524c79f8779b0119781679c0af9fe354e38ad4f..43387df9ceb4d54a2784c3fa4718a95262948729 100644
--- a/tensorflow/contrib/lite/kernels/split.cc
+++ b/tensorflow/contrib/lite/kernels/split.cc
@@ -34,8 +34,8 @@ struct OpContext {
     input = GetInput(context, node, 1);
   }
   TfLiteSplitParams* params;
-  TfLiteTensor* axis;
-  TfLiteTensor* input;
+  const TfLiteTensor* axis;
+  const TfLiteTensor* input;
 };
 
 TfLiteStatus UseDynamicOutputTensors(TfLiteContext* context, TfLiteNode* node) {
@@ -46,8 +46,8 @@ TfLiteStatus UseDynamicOutputTensors(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus ResizeOutputTensors(TfLiteContext* context, TfLiteNode* node,
-                                 TfLiteTensor* axis, TfLiteTensor* input,
-                                 int num_splits) {
+                                 const TfLiteTensor* axis,
+                                 const TfLiteTensor* input, int num_splits) {
   int axis_value = GetTensorData<int>(axis)[0];
   if (axis_value < 0) {
     axis_value += NumDimensions(input);
@@ -138,8 +138,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       break;
     }
     default:
-      context->ReportError(context,
-                           "Only float32 and uint8 are currently supported.");
+      context->ReportError(
+          context, "Only float32 and uint8 are currently supported, got %d.",
+          op_context.input->type);
       return kTfLiteError;
   }
 #undef TF_LITE_SPLIT
diff --git a/tensorflow/contrib/lite/kernels/squeeze.cc b/tensorflow/contrib/lite/kernels/squeeze.cc
index 29447ab021c7b68ff51070d35262402e08dc7ab9..09a5662fd9e70da700c629d94453cb87ad37c448 100644
--- a/tensorflow/contrib/lite/kernels/squeeze.cc
+++ b/tensorflow/contrib/lite/kernels/squeeze.cc
@@ -26,13 +26,12 @@ namespace builtin {
 namespace squeeze {
 
 struct SqueezeContext {
-  SqueezeContext(TfLiteContext* context, TfLiteNode* node) {
-    params = reinterpret_cast<TfLiteSqueezeParams*>(node->builtin_data);
-    input = GetInput(context, node, 0);
-    output = GetOutput(context, node, 0);
-  }
+  SqueezeContext(TfLiteContext* context, TfLiteNode* node)
+      : params(reinterpret_cast<TfLiteSqueezeParams*>(node->builtin_data)),
+        input(GetInput(context, node, 0)),
+        output(GetOutput(context, node, 0)) {}
   TfLiteSqueezeParams* params;
-  TfLiteTensor* input;
+  const TfLiteTensor* const input;
   TfLiteTensor* output;
 };
 
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
index 40ac436b7dcabe7a12166e5381f0381941a204d3..725dd8105ab9506d5203ed38a11f8e06abdab603 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -49,10 +49,10 @@ struct StridedSliceContext {
     dims = NumDimensions(input);
   }
   const TfLiteStridedSliceParams* params;
-  TfLiteTensor* input;
-  TfLiteTensor* begin;
-  TfLiteTensor* end;
-  TfLiteTensor* strides;
+  const TfLiteTensor* input;
+  const TfLiteTensor* begin;
+  const TfLiteTensor* end;
+  const TfLiteTensor* strides;
   TfLiteTensor* output;
   int dims;
 };
@@ -235,8 +235,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       break;
     default:
       context->ReportError(context,
-                           "Type is currently not supported "
-                           "by StridedSlice.");
+                           "Type %d is currently not supported "
+                           "by StridedSlice.",
+                           op_context.input->type);
       return kTfLiteError;
   }
 #undef TF_LITE_STRIDED_SLICE
diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc
index 7c60a4fdbffdc96b8967f52f8dbab3e18ecbcc0a..d788159a8d80e6479024b7b75624839387a461c7 100644
--- a/tensorflow/contrib/lite/kernels/sub.cc
+++ b/tensorflow/contrib/lite/kernels/sub.cc
@@ -57,8 +57,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
@@ -80,7 +80,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 template <KernelType kernel_type>
 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
                TfLiteSubParams* params, const OpData* data,
-               TfLiteTensor* input1, TfLiteTensor* input2,
+               const TfLiteTensor* input1, const TfLiteTensor* input2,
                TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
@@ -109,7 +109,7 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                    TfLiteSubParams* params, const OpData* data,
-                   TfLiteTensor* input1, TfLiteTensor* input2,
+                   const TfLiteTensor* input1, const TfLiteTensor* input2,
                    TfLiteTensor* output) {
   auto input1_offset = -input1->params.zero_point;
   auto input2_offset = -input2->params.zero_point;
@@ -164,8 +164,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
-  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
-  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
@@ -174,8 +174,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
                                output);
   } else {
-    context->ReportError(context,
-                         "Inputs and outputs not all float|uint8 types.");
+    context->ReportError(
+        context, "output type %d is not support, requires float|uint8 types.",
+        output->type);
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc
index 13da51c7a78c362160e2ecd6121aa31bc7ce5355..308860c299e9d74729d35b760e0f605437872c92 100644
--- a/tensorflow/contrib/lite/kernels/svdf.cc
+++ b/tensorflow/contrib/lite/kernels/svdf.cc
@@ -58,9 +58,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
 
   TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
-  TfLiteTensor* weights_feature =
+  const TfLiteTensor* weights_feature =
       GetInput(context, node, kWeightsFeatureTensor);
-  TfLiteTensor* weights_time = GetInput(context, node, kWeightsTimeTensor);
+  const TfLiteTensor* weights_time =
+      GetInput(context, node, kWeightsTimeTensor);
 
   // Check all the parameters of tensor match within themselves and match the
   // input configuration.
@@ -73,7 +74,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ASSERT_EQ(input->dims->data[1], weights_feature->dims->data[1]);
   TF_LITE_ASSERT_EQ(weights_time->dims->data[0], num_filters);
 
-  TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
   if (bias) {
     TF_LITE_ASSERT_EQ(bias->dims->data[0], num_units);
   }
@@ -123,16 +124,17 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* weights_feature =
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* weights_feature =
       GetInput(context, node, kWeightsFeatureTensor);
-  TfLiteTensor* weights_time = GetInput(context, node, kWeightsTimeTensor);
+  const TfLiteTensor* weights_time =
+      GetInput(context, node, kWeightsTimeTensor);
 
   TfLiteTensor* state = GetOutput(context, node, kStateTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0);
 
-  TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
 
   const int rank = params->rank;
   const int batch_size = input->dims->data[0];
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
index 0bb28b50b2a5e5a9fd803ecf1b0928026f63881e..1a01ee093626c08badd65858fc16ad44e69e4912 100644
--- a/tensorflow/contrib/lite/kernels/test_util.cc
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -22,23 +22,6 @@ namespace tflite {
 using ::testing::FloatNear;
 using ::testing::Matcher;
 
-namespace {
-template <typename T>
-std::pair<float, int32_t> QuantizationParams(float f_min, float f_max) {
-  // These are required by many quantized operations.
-  CHECK_LE(f_min, 0);
-  CHECK_GE(f_max, 0);
-  T q_min = std::numeric_limits<T>::min();
-  T q_max = std::numeric_limits<T>::max();
-  float range = q_max - q_min;
-  float scale = (f_max - f_min) / range;
-  int32_t zero_point = std::min(
-      q_max,
-      std::max(q_min, static_cast<T>(std::round(q_min - f_min / scale))));
-  return {scale, zero_point};
-}
-}  // namespace
-
 std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values,
                                            float max_abs_error) {
   std::vector<Matcher<float>> matchers;
@@ -49,69 +32,8 @@ std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values,
   return matchers;
 }
 
-int SingleOpModel::AddTensor(TensorData t, std::initializer_list<int> data) {
-  int id = tensors_.size();
-
-  // This is slightly different depending on whether we are adding a
-  // quantized or a regular tensor.
-  bool is_quantized = (t.min != 0 || t.max != 0 || t.scale != 0);
-
-  flatbuffers::Offset<QuantizationParameters> q_params = 0;
-
-  if (is_quantized) {
-    if (t.min != 0 || t.max != 0) {
-      if (t.type == TensorType_UINT8) {
-        std::tie(t.scale, t.zero_point) =
-            QuantizationParams<uint8_t>(t.min, t.max);
-      } else if (t.type == TensorType_INT32) {
-        std::tie(t.scale, t.zero_point) =
-            QuantizationParams<int32_t>(t.min, t.max);
-      } else {
-        LOG(FATAL) << "No support for the requested quantized type";
-      }
-      t.min = 0;
-      t.max = 0;
-    }
-
-    q_params = CreateQuantizationParameters(
-        builder_, /*min=*/0, /*max=*/0, builder_.CreateVector<float>({t.scale}),
-        builder_.CreateVector<int64_t>({t.zero_point}));
-  }
-
-  int buffer_id = 0;
-  if (data.size()) {
-    // Initialize buffers list with empty buffer to allow for non-const tensors.
-    if (buffers_.empty()) {
-      buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector({})));
-    }
-
-    // Add data as a Buffer to buffers list.
-    buffer_id = buffers_.size();
-    auto data_buffer =
-        builder_.CreateVector(reinterpret_cast<const uint8_t*>(data.begin()),
-                              sizeof(int) * data.size());
-    buffers_.push_back(CreateBuffer(builder_, data_buffer));
-  }
-
-  tensors_.push_back(CreateTensor(builder_, builder_.CreateVector<int>(t.shape),
-                                  t.type, /*buffer=*/buffer_id,
-                                  /*name=*/0, q_params));
-
-  tensor_data_[id] = t;
-
-  return id;
-}
-
 int SingleOpModel::AddInput(const TensorData& t) {
-  int id = AddTensor(t, {});
-  inputs_.push_back(id);
-  return id;
-}
-
-int SingleOpModel::AddConstInput(TensorType type,
-                                 std::initializer_list<int> data,
-                                 std::initializer_list<int> shape) {
-  int id = AddTensor(TensorData{type, shape}, data);
+  int id = AddTensor<float>(t, {});
   inputs_.push_back(id);
   return id;
 }
@@ -123,7 +45,7 @@ int SingleOpModel::AddNullInput() {
 }
 
 int SingleOpModel::AddOutput(const TensorData& t) {
-  int id = AddTensor(t, {});
+  int id = AddTensor<float>(t, {});
   outputs_.push_back(id);
   return id;
 }
@@ -179,7 +101,7 @@ void SingleOpModel::BuildInterpreter(
     }
     resolver_ = std::unique_ptr<OpResolver>(resolver);
   }
-  InterpreterBuilder(model, *resolver_)(&interpreter_);
+  CHECK(InterpreterBuilder(model, *resolver_)(&interpreter_) == kTfLiteOk);
 
   CHECK(interpreter_ != nullptr);
 
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
index 6fb6fe27ebace8159c960f7a7b9acc2326ba3062..55edc97d19fa75bedb6c0928fcf9c7be5f434522 100644
--- a/tensorflow/contrib/lite/kernels/test_util.h
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -89,18 +89,24 @@ struct TensorData {
 class SingleOpResolver : public OpResolver {
  public:
   SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration)
-      : op_(op), registration_(registration) {}
-  TfLiteRegistration* FindOp(BuiltinOperator op) const override {
+      : op_(op), registration_(*registration) {
+    registration_.builtin_code = static_cast<int32_t>(op);
+    registration_.version = 1;
+  }
+  const TfLiteRegistration* FindOp(BuiltinOperator op,
+                                   int version) const override {
     if (op == op_) {
-      return registration_;
+      return &registration_;
     }
     return nullptr;
   }
-  TfLiteRegistration* FindOp(const char* op) const override { return nullptr; }
+  const TfLiteRegistration* FindOp(const char* op, int version) const override {
+    return nullptr;
+  }
 
  private:
   const BuiltinOperator op_;
-  TfLiteRegistration* registration_;
+  TfLiteRegistration registration_;
 };
 
 class SingleOpModel {
@@ -116,9 +122,14 @@ class SingleOpModel {
   int AddInput(TensorType type) { return AddInput(TensorData{type}); }
   int AddInput(const TensorData& t);
 
-  // Add a Tensor containing const data and return the tensor id.
-  int AddConstInput(TensorType type, std::initializer_list<int> data,
-                    std::initializer_list<int> shape);
+  // Templated version of AddConstInput().
+  template <typename T>
+  int AddConstInput(TensorType type, std::initializer_list<T> data,
+                    std::initializer_list<int> shape) {
+    int id = AddTensor(TensorData{type, shape}, data);
+    inputs_.push_back(id);
+    return id;
+  }
 
   // Add a null input tensor (optional input) and return kOptionalTensor.
   int AddNullInput();
@@ -224,7 +235,79 @@ class SingleOpModel {
   std::unique_ptr<OpResolver> resolver_;
 
  private:
-  int AddTensor(TensorData t, std::initializer_list<int> data);
+  // TODO(gavinbelson): sync this method with
+  // //tensorflow/contrib/lite/kernels/internal/quantization_util.h?l=31
+  template <typename T>
+  std::pair<float, int32_t> QuantizationParams(float f_min, float f_max) {
+    // These are required by many quantized operations.
+    CHECK_LE(f_min, 0);
+    CHECK_GE(f_max, 0);
+    T q_min = std::numeric_limits<T>::min();
+    T q_max = std::numeric_limits<T>::max();
+    float range = q_max - q_min;
+    float scale = (f_max - f_min) / range;
+    int32_t zero_point = std::min(
+        q_max,
+        std::max(q_min, static_cast<T>(std::round(q_min - f_min / scale))));
+    return {scale, zero_point};
+  }
+
+  template <typename T>
+  int AddTensor(TensorData t, std::initializer_list<T> data) {
+    int id = tensors_.size();
+
+    // This is slightly different depending on whether we are adding a
+    // quantized or a regular tensor.
+    bool is_quantized = (t.min != 0 || t.max != 0 || t.scale != 0);
+
+    flatbuffers::Offset<QuantizationParameters> q_params = 0;
+
+    if (is_quantized) {
+      if (t.min != 0 || t.max != 0) {
+        if (t.type == TensorType_UINT8) {
+          std::tie(t.scale, t.zero_point) =
+              QuantizationParams<uint8_t>(t.min, t.max);
+        } else if (t.type == TensorType_INT32) {
+          std::tie(t.scale, t.zero_point) =
+              QuantizationParams<int32_t>(t.min, t.max);
+        } else {
+          LOG(FATAL) << "No support for the requested quantized type";
+        }
+        t.min = 0;
+        t.max = 0;
+      }
+
+      q_params = CreateQuantizationParameters(
+          builder_, /*min=*/0, /*max=*/0,
+          builder_.CreateVector<float>({t.scale}),
+          builder_.CreateVector<int64_t>({t.zero_point}));
+    }
+
+    int buffer_id = 0;
+    if (data.size()) {
+      // Initialize buffers list with empty buffer to allow for non-const
+      // tensors.
+      if (buffers_.empty()) {
+        buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector({})));
+      }
+
+      // Add data as a Buffer to buffers list.
+      buffer_id = buffers_.size();
+      auto data_buffer =
+          builder_.CreateVector(reinterpret_cast<const uint8_t*>(data.begin()),
+                                sizeof(T) * data.size());
+      buffers_.push_back(CreateBuffer(builder_, data_buffer));
+    }
+
+    tensors_.push_back(CreateTensor(builder_,
+                                    builder_.CreateVector<int>(t.shape), t.type,
+                                    /*buffer=*/buffer_id,
+                                    /*name=*/0, q_params));
+
+    tensor_data_[id] = t;
+
+    return id;
+  }
 
   std::map<int, TensorData> tensor_data_;
   std::vector<int32_t> inputs_;
diff --git a/tensorflow/contrib/lite/kernels/topk_v2.cc b/tensorflow/contrib/lite/kernels/topk_v2.cc
index ad9b744f1af2715a37cc60ef61b0b9540fe2532b..fb0e49c90c41747f9b7e53570276c8b8045030fd 100644
--- a/tensorflow/contrib/lite/kernels/topk_v2.cc
+++ b/tensorflow/contrib/lite/kernels/topk_v2.cc
@@ -30,15 +30,14 @@ constexpr int kOutputIndexes = 1;
 
 namespace {
 TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* top_k = GetInput(context, node, kInputTopK);
+  const TfLiteTensor* top_k = GetInput(context, node, kInputTopK);
   // INT32 number of top results is supported.
   TF_LITE_ENSURE_EQ(context, top_k->type, kTfLiteInt32);
   // Check that the tensor contains only one value.
-  TF_LITE_ENSURE_EQ(context, NumDimensions(top_k), 1);
   TF_LITE_ENSURE_EQ(context, NumElements(top_k), 1);
-  const int32 k = top_k->data.i32[0];
+  const int32 k = *GetTensorData<int32_t>(top_k);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const int num_dimensions = NumDimensions(input);
   // Check that input has one or more dimensions.
   TF_LITE_ENSURE_MSG(context, input->dims->size >= 1,
@@ -162,11 +161,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 2);
 
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output_values = GetOutput(context, node, kOutputValues);
   TF_LITE_ENSURE_EQ(context, input->type, output_values->type);
 
-  TfLiteTensor* top_k = GetInput(context, node, kInputTopK);
+  const TfLiteTensor* top_k = GetInput(context, node, kInputTopK);
   TF_LITE_ENSURE_EQ(context, top_k->type, kTfLiteInt32);
 
   // Set output dynamic if the input is not const.
@@ -187,11 +186,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   if (IsDynamicTensor(output_values)) {
     TF_LITE_ENSURE_OK(context, ResizeOutput(context, node));
   }
-  TfLiteTensor* top_k = GetInput(context, node, kInputTopK);
+  const TfLiteTensor* top_k = GetInput(context, node, kInputTopK);
   const int32 k = top_k->data.i32[0];
   // The tensor can have more than 2 dimensions or even be a vector, the code
   // anyway calls the internal dimension as row;
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const int32 row_size = input->dims->data[input->dims->size - 1];
   int32 num_rows = 1;
   for (int i = 0; i < input->dims->size - 1; ++i) {
@@ -215,7 +214,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
            output_values->data.i64);
       break;
     default:
-      context->ReportError(context, "Type is currently not supported by TopK.");
+      context->ReportError(context,
+                           "Type %d is currently not supported by TopK.",
+                           output_values->type);
       return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/transpose.cc b/tensorflow/contrib/lite/kernels/transpose.cc
index d3c10a9bb7b07404ccd8cfe2636473a622b91787..800b0563d7ee6126d65005ff4ef61219db9eebb5 100644
--- a/tensorflow/contrib/lite/kernels/transpose.cc
+++ b/tensorflow/contrib/lite/kernels/transpose.cc
@@ -37,8 +37,8 @@ struct TransposeContext {
     perm = GetInput(context, node, 1);
     output = GetOutput(context, node, 0);
   }
-  TfLiteTensor* input;
-  TfLiteTensor* perm;
+  const TfLiteTensor* input;
+  const TfLiteTensor* perm;
   TfLiteTensor* output;
 };
 
@@ -136,7 +136,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       break;
     default:
       context->ReportError(context,
-                           "Type is currently not supported by Transpose.");
+                           "Type %d is currently not supported by Transpose.",
+                           op_context.input->type);
       return kTfLiteError;
   }
 #undef TF_LITE_TRANSPOSE
diff --git a/tensorflow/contrib/lite/kernels/transpose_conv.cc b/tensorflow/contrib/lite/kernels/transpose_conv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c99661029ed1ac881536f83519dcec355c60d50
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/transpose_conv.cc
@@ -0,0 +1,146 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace transpose_conv {
+
+constexpr int kOutputShapeTensor = 0;
+constexpr int kWeightsTensor = 1;
+constexpr int kDataInputTensor = 2;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus ResizeOutputShape(TfLiteContext* context,
+                               const TfLiteTensor* output_shape,
+                               TfLiteTensor* output) {
+  // Currently only support int32 for output shape.
+  if (output_shape->type != kTfLiteInt32) {
+    context->ReportError(context, "Output shape is %d, not int32.",
+                         output_shape->type);
+    return kTfLiteError;
+  }
+  const int output_dimensions = NumElements(output_shape);
+  TfLiteIntArray* output_shape_array = TfLiteIntArrayCreate(output_dimensions);
+  for (int i = 0; i < output_dimensions; ++i) {
+    output_shape_array->data[i] = GetTensorData<int32_t>(output_shape)[i];
+  }
+
+  return context->ResizeTensor(context, output, output_shape_array);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* output_shape =
+      GetInput(context, node, kOutputShapeTensor);
+  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* input = GetInput(context, node, kDataInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(output_shape), 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 4);
+
+  // Currenlty only supports float32.
+  const TfLiteType data_type = input->type;
+  TF_LITE_ENSURE(context, data_type == kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, output->type, data_type);
+  TF_LITE_ENSURE_EQ(context, weights->type, data_type);
+
+  // Ensure that weights and inputs have the same channel dimension.
+  // Note: TOCO will reorder weights in the following format: OHWI.
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(input, 3),
+                    SizeOfDimension(weights, 0));
+
+  if (!IsConstantTensor(output_shape)) {
+    SetTensorToDynamic(output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputShape(context, output_shape, output);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* output_shape =
+      GetInput(context, node, kOutputShapeTensor);
+  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* input = GetInput(context, node, kDataInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  const auto* params =
+      reinterpret_cast<TfLiteTransposeConvParams*>(node->builtin_data);
+
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeOutputShape(context, output_shape, output));
+  }
+
+  // Get height and width of the output image.
+  const int width = SizeOfDimension(output, 2);
+  const int height = SizeOfDimension(output, 1);
+  const int filter_width = SizeOfDimension(weights, 1);
+  const int filter_height = SizeOfDimension(weights, 2);
+
+  const int stride_width = params->stride_width;
+  const int stride_height = params->stride_height;
+
+  const TfLitePaddingValues& padding_size =
+      ComputePaddingHeightWidth(stride_height, stride_width, 1, height, width,
+                                filter_height, filter_width, params->padding);
+
+  // Currently only support float32.
+  switch (input->type) {
+    case kTfLiteFloat32:
+      optimized_ops::TransposeConv(
+          GetTensorData<float>(input), GetTensorDims(input),
+          GetTensorData<float>(weights), GetTensorDims(weights), stride_width,
+          stride_height, padding_size.width, padding_size.height,
+          GetTensorData<float>(output), GetTensorDims(output));
+      break;
+    default:
+      context->ReportError(context, "Type %d, not currently supported.",
+                           input->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace transpose_conv
+
+TfLiteRegistration* Register_TRANSPOSE_CONV() {
+  static TfLiteRegistration r = {nullptr, nullptr, transpose_conv::Prepare,
+                                 transpose_conv::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/transpose_conv_test.cc b/tensorflow/contrib/lite/kernels/transpose_conv_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..52be08934997f484337e4a3592bc7af832601695
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/transpose_conv_test.cc
@@ -0,0 +1,222 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class TransposeConvOpModel : public SingleOpModel {
+ public:
+  TransposeConvOpModel(std::initializer_list<int> input_shape,
+                       std::initializer_list<int> filter_shape, Padding padding,
+                       int stride_w, int stride_h) {
+    output_shape_ = AddInput(TensorType_INT32);
+    filter_ = AddInput(TensorType_FLOAT32);
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(
+        BuiltinOperator_TRANSPOSE_CONV, BuiltinOptions_TransposeConvOptions,
+        CreateTransposeConvOptions(builder_, padding, stride_w, stride_h)
+            .Union());
+    BuildInterpreter({{4}, filter_shape, input_shape});
+  }
+
+  int output_shape() { return output_shape_; }
+  int filter() { return filter_; }
+  int input() { return input_; }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int output_shape_;
+  int filter_;
+  int input_;
+  int output_;
+};
+
+// Test case:
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 4, 4, 1 ]),
+//     tf.constant(np.arange(1, 10), shape=[ 3, 3, 1, 1 ], dtype=tf.float32),
+//     tf.constant(np.arange(1, 17), shape=[ 1, 4, 4, 1 ], dtype=tf.float32),
+//     [1, 1, 1, 1 ],
+//     "SAME")
+TEST(TransposeConvOpModelTest, SimpleTest) {
+  TransposeConvOpModel m({1, 4, 4, 1}, {1, 3, 3, 1}, Padding_SAME, 1, 1);
+  m.PopulateTensor<int>(m.output_shape(), {1, 4, 4, 1});
+  m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  m.PopulateTensor<float>(
+      m.input(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({29, 62, 83, 75, 99, 192, 237, 198, 207, 372,
+                                417, 330, 263, 446, 485, 365}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+// Test case:
+// filter = tf.constant(np.arange(1, 19),
+//                      shape=[ 3, 3, 1, 2 ],
+//                      dtype=tf.float32)
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 4, 4, 1 ]),
+//     filter,
+//     tf.constant(np.arange(1, 33), shape=[ 1, 4, 4, 2 ], dtype=tf.float32),
+//     [1, 1, 1, 1 ],
+//     "SAME")
+// And filter value is derived by:
+// filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[18, 1])
+TEST(TransposeConvOpModelTest, TwoFiltersTest) {
+  TransposeConvOpModel m({1, 4, 4, 2}, {2, 3, 3, 1}, Padding_SAME, 1, 1);
+  m.PopulateTensor<int>(m.output_shape(), {1, 4, 4, 1});
+  m.PopulateTensor<float>(m.filter(), {1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6,
+                                       8, 10, 12, 14, 16, 18});
+  m.PopulateTensor<float>(
+      m.input(),
+      {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({184, 412, 568, 528, 678, 1347, 1689, 1434, 1494,
+                                2715, 3057, 2442, 1968, 3352, 3652, 2760}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+// Test case:
+// filter = tf.constant(np.arange(1, 19),
+//                      shape=[ 3, 3, 1, 2 ],
+//                      dtype=tf.float32)
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 6, 6, 1 ]),
+//     filter,
+//     tf.constant(np.arange(1, 33), shape=[ 1, 4, 4, 2 ], dtype=tf.float32),
+//     [1, 1, 1, 1 ],
+//     "VALID")
+// And filter value is derived by:
+// filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[1, 18])
+TEST(TransposeConvOpModelTest, PaddingValidTest) {
+  TransposeConvOpModel m({1, 4, 4, 2}, {2, 3, 3, 1}, Padding_VALID, 1, 1);
+  m.PopulateTensor<int>(m.output_shape(), {1, 6, 6, 1});
+  m.PopulateTensor<float>(m.filter(), {1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6,
+                                       8, 10, 12, 14, 16, 18});
+  m.PopulateTensor<float>(
+      m.input(),
+      {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({5,    22,   59,   101,  114,  83,   52,   184,  412,
+                        568,  528,  344,  237,  678,  1347, 1689, 1434, 879,
+                        597,  1494, 2715, 3057, 2442, 1431, 856,  1968, 3352,
+                        3652, 2760, 1548, 689,  1534, 2543, 2729, 2010, 1103}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 6, 6, 1}));
+}
+
+// Test case:
+// filter = tf.constant(np.arange(1, 10),
+//                      shape=[ 3, 3, 1, 1 ],
+//                      dtype=tf.float32)
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 5, 5, 1 ]),
+//     filter,
+//     tf.constant(np.arange(1, 5), shape=[ 1, 2, 2, 1 ], dtype=tf.float32),
+//     [1, 2, 2, 1 ],
+//     "VALID")
+TEST(TransposeConvOpModelTest, StrideValidTest) {
+  TransposeConvOpModel m({1, 2, 2, 1}, {1, 3, 3, 1}, Padding_VALID, 2, 2);
+  m.PopulateTensor<int>(m.output_shape(), {1, 5, 5, 1});
+  m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  m.PopulateTensor<float>(m.input(), {1, 2, 3, 4});
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({1,  2,  5,  4,  6,  4,  5,  14, 10, 12, 10, 14, 36,
+                        24, 30, 12, 15, 34, 20, 24, 21, 24, 55, 32, 36}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 5, 5, 1}));
+}
+
+// Test case:
+// filter = tf.constant(np.arange(1, 19),
+//                      shape=[ 3, 3, 2, 1 ],
+//                      dtype=tf.float32)
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 5, 5, 2 ]),
+//     filter,
+//     tf.constant(np.arange(1, 5), shape=[ 1, 2, 2, 1 ], dtype=tf.float32),
+//     [1, 2, 2, 1 ],
+//     "VALID")
+TEST(TransposeConvOpModelTest, MultiChannelTest) {
+  TransposeConvOpModel m({1, 2, 2, 1}, {1, 3, 3, 2}, Padding_VALID, 2, 2);
+  m.PopulateTensor<int>(m.output_shape(), {1, 5, 5, 2});
+  m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+                                       13, 14, 15, 16, 17, 18});
+  m.PopulateTensor<float>(m.input(), {1, 2, 3, 4});
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({1,  2,  3,  4,  7,  10,  6,   8,  10, 12, 7,  8,  9,
+                        10, 25, 28, 18, 20, 22,  24,  16, 20, 24, 28, 62, 72,
+                        42, 48, 54, 60, 21, 24,  27,  30, 61, 68, 36, 40, 44,
+                        48, 39, 42, 45, 48, 103, 110, 60, 64, 68, 72}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 5, 5, 2}));
+}
+
+// Test case:
+// filter = tf.constant(np.random.randint(1, 10, size=9),
+//                      shape=[ 3, 3, 1, 1 ],
+//                      dtype=tf.float32)
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 3, 4, 1 ]),
+//     filter,
+//     tf.constant([323, 521], shape=[ 1, 1, 2, 1], dtype=tf.float32),
+//     [1, 3, 3, 1 ],
+//     "SAME")
+// And filter value is derived by:
+// filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[-1])
+TEST(TransposeConvOpModelTest, AccuracyTest) {
+  TransposeConvOpModel m({1, 1, 2, 1}, {1, 3, 3, 1}, Padding_SAME, 3, 3);
+  m.PopulateTensor<int>(m.output_shape(), {1, 3, 4, 1});
+  m.PopulateTensor<float>(m.filter(), {9, 5, 6, 9, 8, 5, 3, 1, 4});
+  m.PopulateTensor<float>(m.input(), {323, 521});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {1615., 1938., 4689., 2605., 2584., 1615.,
+                                  4689., 4168., 323., 1292., 1563., 521.})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 4, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
index 5987bf68b5a73eaf39567bd65a0508839475505f..1c28123a24edd9886476bf8e9ea3ba4c692baa2b 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
@@ -92,7 +92,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   TF_LITE_ENSURE(context, params->cell_clip >= 0);
   TF_LITE_ENSURE(context, params->proj_clip >= 0);
 
-  TfLiteTensor* input_to_input_weights =
+  const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
   if (input_to_input_weights) {
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
@@ -100,19 +100,19 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
   }
 
-  TfLiteTensor* input_to_forget_weights =
+  const TfLiteTensor* input_to_forget_weights =
       GetInput(context, node, kInputToForgetWeightsTensor);
   TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
   TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
 
-  TfLiteTensor* input_to_cell_weights =
+  const TfLiteTensor* input_to_cell_weights =
       GetInput(context, node, kInputToCellWeightsTensor);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
   TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
 
-  TfLiteTensor* recurrent_to_input_weights =
+  const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
   if (recurrent_to_input_weights) {
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
@@ -122,7 +122,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
                       n_output);
   }
 
-  TfLiteTensor* recurrent_to_forget_weights =
+  const TfLiteTensor* recurrent_to_forget_weights =
       GetInput(context, node, kRecurrentToForgetWeightsTensor);
   TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
@@ -130,7 +130,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
                     n_output);
 
-  TfLiteTensor* recurrent_to_cell_weights =
+  const TfLiteTensor* recurrent_to_cell_weights =
       GetInput(context, node, kRecurrentToCellWeightsTensor);
   TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
@@ -146,21 +146,21 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
        (recurrent_to_input_weights == nullptr));
   TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
 
-  TfLiteTensor* cell_to_input_weights =
+  const TfLiteTensor* cell_to_input_weights =
       GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
   if (cell_to_input_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
   }
 
-  TfLiteTensor* cell_to_forget_weights =
+  const TfLiteTensor* cell_to_forget_weights =
       GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
   if (cell_to_forget_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
   }
 
-  TfLiteTensor* cell_to_output_weights =
+  const TfLiteTensor* cell_to_output_weights =
       GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
   if (cell_to_output_weights) {
     TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
@@ -179,7 +179,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
 
   // Make sure the input gate bias is present only when not a CIFG-LSTM.
-  TfLiteTensor* input_gate_bias =
+  const TfLiteTensor* input_gate_bias =
       GetOptionalInputTensor(context, node, kInputGateBiasTensor);
   if (use_cifg) {
     TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
@@ -188,21 +188,21 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
     TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
   }
 
-  TfLiteTensor* forget_gate_bias =
+  const TfLiteTensor* forget_gate_bias =
       GetInput(context, node, kForgetGateBiasTensor);
   TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
 
-  TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
   TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
 
-  TfLiteTensor* output_gate_bias =
+  const TfLiteTensor* output_gate_bias =
       GetInput(context, node, kOutputGateBiasTensor);
   TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
   TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
 
-  TfLiteTensor* projection_weights =
+  const TfLiteTensor* projection_weights =
       GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
   if (projection_weights) {
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
@@ -210,7 +210,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
   }
 
-  TfLiteTensor* projection_bias =
+  const TfLiteTensor* projection_bias =
       GetOptionalInputTensor(context, node, kProjectionBiasTensor);
   if (projection_bias) {
     TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
@@ -241,19 +241,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // Inferring batch size, number of outputs and sequence length and
   // number of cells from the input tensors.
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TF_LITE_ENSURE(context, input->dims->size > 1);
   const int max_time = input->dims->data[0];
   const int n_batch = input->dims->data[1];
   const int n_input = input->dims->data[2];
 
-  TfLiteTensor* input_to_output_weights =
+  const TfLiteTensor* input_to_output_weights =
       GetInput(context, node, kInputToOutputWeightsTensor);
   const int n_cell = input_to_output_weights->dims->data[0];
   TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
 
-  TfLiteTensor* recurrent_to_output_weights =
+  const TfLiteTensor* recurrent_to_output_weights =
       GetInput(context, node, kRecurrentToOutputWeightsTensor);
   TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
   TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
@@ -300,7 +300,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   output_state->allocation_type = kTfLiteArenaRwPersistent;
   cell_state->allocation_type = kTfLiteArenaRwPersistent;
 
-  TfLiteTensor* input_to_input_weights =
+  const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
   const bool use_cifg = (input_to_input_weights == nullptr);
   if (use_cifg) {
@@ -324,44 +324,44 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 // The LSTM Op engine.
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
-  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
 
-  TfLiteTensor* input_to_input_weights =
+  const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  TfLiteTensor* input_to_forget_weights =
+  const TfLiteTensor* input_to_forget_weights =
       GetInput(context, node, kInputToForgetWeightsTensor);
-  TfLiteTensor* input_to_cell_weights =
+  const TfLiteTensor* input_to_cell_weights =
       GetInput(context, node, kInputToCellWeightsTensor);
-  TfLiteTensor* input_to_output_weights =
+  const TfLiteTensor* input_to_output_weights =
       GetInput(context, node, kInputToOutputWeightsTensor);
 
-  TfLiteTensor* recurrent_to_input_weights =
+  const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  TfLiteTensor* recurrent_to_forget_weights =
+  const TfLiteTensor* recurrent_to_forget_weights =
       GetInput(context, node, kRecurrentToForgetWeightsTensor);
-  TfLiteTensor* recurrent_to_cell_weights =
+  const TfLiteTensor* recurrent_to_cell_weights =
       GetInput(context, node, kRecurrentToCellWeightsTensor);
-  TfLiteTensor* recurrent_to_output_weights =
+  const TfLiteTensor* recurrent_to_output_weights =
       GetInput(context, node, kRecurrentToOutputWeightsTensor);
 
-  TfLiteTensor* cell_to_input_weights =
+  const TfLiteTensor* cell_to_input_weights =
       GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  TfLiteTensor* cell_to_forget_weights =
+  const TfLiteTensor* cell_to_forget_weights =
       GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  TfLiteTensor* cell_to_output_weights =
+  const TfLiteTensor* cell_to_output_weights =
       GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
 
-  TfLiteTensor* input_gate_bias =
+  const TfLiteTensor* input_gate_bias =
       GetOptionalInputTensor(context, node, kInputGateBiasTensor);
-  TfLiteTensor* forget_gate_bias =
+  const TfLiteTensor* forget_gate_bias =
       GetInput(context, node, kForgetGateBiasTensor);
-  TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  TfLiteTensor* output_gate_bias =
+  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  const TfLiteTensor* output_gate_bias =
       GetInput(context, node, kOutputGateBiasTensor);
 
-  TfLiteTensor* projection_weights =
+  const TfLiteTensor* projection_weights =
       GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  TfLiteTensor* projection_bias =
+  const TfLiteTensor* projection_bias =
       GetOptionalInputTensor(context, node, kProjectionBiasTensor);
 
   TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
index ac00c37b67dcbe77023a2495a698967ca555b1d5..8429dba54bd1806125aadc2119ca59c1bd42ce89 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
 #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 namespace tflite {
@@ -38,17 +39,26 @@ constexpr int kBiasTensor = 3;
 constexpr int kHiddenStateTensor = 0;
 constexpr int kOutputTensor = 1;
 
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* scratch_tensor_index = new int;
+  context->AddTensors(context, /*tensors_to_add=*/2, scratch_tensor_index);
+  return scratch_tensor_index;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<int*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Check we have all the inputs and outputs we need.
   TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
   TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
 
-  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
-  TfLiteTensor* input_weights =
-      &context->tensors[node->inputs->data[kWeightsTensor]];
-  TfLiteTensor* recurrent_weights =
-      &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
-  TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input_weights = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* recurrent_weights =
+      GetInput(context, node, kRecurrentWeightsTensor);
+  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
 
   // Check all the parameters of tensor match within themselves and match the
   // input configuration.
@@ -63,10 +73,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ASSERT_EQ(input_weights->dims->data[0], bias->dims->data[0]);
   TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[0], bias->dims->data[0]);
   TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[1], bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, input_weights->type, recurrent_weights->type);
 
-  TfLiteTensor* hidden_state =
-      &context->tensors[node->outputs->data[kHiddenStateTensor]];
-  TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+  TfLiteTensor* hidden_state = GetOutput(context, node, kHiddenStateTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   // Resize state.
   TfLiteIntArray* hidden_state_size_array = TfLiteIntArrayCreate(2);
@@ -86,22 +97,44 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context,
                     context->ResizeTensor(context, output, output_size_array));
 
+  // Allocate temporary tensors to store quantized values of input and
+  // hidden_state tensors.
+  if (input->type == kTfLiteFloat32 && input_weights->type == kTfLiteUInt8) {
+    int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+    TfLiteIntArrayFree(node->temporaries);
+    node->temporaries = TfLiteIntArrayCreate(2);
+    node->temporaries->data[0] = *scratch_tensor_index;
+    TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
+    input_quantized->type = kTfLiteUInt8;
+    input_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
+      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
+                                                       input_quantized_size));
+    }
+    node->temporaries->data[1] = *scratch_tensor_index + 1;
+    TfLiteTensor* hidden_state_quantized =
+        GetTemporary(context, node, /*index=*/1);
+    hidden_state_quantized->type = kTfLiteUInt8;
+    hidden_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(hidden_state_quantized->dims,
+                             hidden_state->dims)) {
+      TfLiteIntArray* hidden_state_quantized_size =
+          TfLiteIntArrayCopy(hidden_state->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, hidden_state_quantized,
+                                              hidden_state_quantized_size));
+    }
+  }
   return kTfLiteOk;
 }
 
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteSequenceRNNParams*>(node->builtin_data);
-
-  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
-  TfLiteTensor* input_weights =
-      &context->tensors[node->inputs->data[kWeightsTensor]];
-  TfLiteTensor* recurrent_weights =
-      &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
-  TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
-  TfLiteTensor* hidden_state =
-      &context->tensors[node->outputs->data[kHiddenStateTensor]];
-  TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
-
+TfLiteStatus EvalFloat(const TfLiteTensor* input,
+                       const TfLiteTensor* input_weights,
+                       const TfLiteTensor* recurrent_weights,
+                       const TfLiteTensor* bias,
+                       const TfLiteSequenceRNNParams* params,
+                       TfLiteTensor* hidden_state, TfLiteTensor* output) {
   // Initialize the pointer bias.
   const float* bias_ptr = bias->data.f;
 
@@ -120,7 +153,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   if (time_major) {
     // Initialize the pointer to hidden state.
     float* hidden_state_ptr_batch = hidden_state->data.f;
-    // Unroll the sequence and use batch batch operations for efficiency.
+    // Unroll the sequence and use batch operations for efficiency.
     for (int s = 0; s < max_time; s++) {
       // Initialize the pointer to input and output.
       const float* input_ptr_batch =
@@ -154,12 +187,115 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
+TfLiteStatus EvalQuantized(const TfLiteTensor* input,
+                           const TfLiteTensor* input_weights,
+                           const TfLiteTensor* recurrent_weights,
+                           const TfLiteTensor* bias,
+                           const TfLiteSequenceRNNParams* params,
+                           TfLiteTensor* input_scratch,
+                           TfLiteTensor* hidden_state_scratch,
+                           TfLiteTensor* hidden_state, TfLiteTensor* output) {
+  const bool time_major = params->time_major;
+  const int batch_size =
+      (time_major) ? input->dims->data[1] : input->dims->data[0];
+  const int max_time =
+      (time_major) ? input->dims->data[0] : input->dims->data[1];
+  const int num_units = input_weights->dims->data[0];
+  const int input_size = input->dims->data[2];
+
+  // Initialize the pointer bias.
+  const float* bias_ptr = bias->data.f;
+  // Initialize input_weights and recurrent_weights.
+  const int8_t* input_weights_ptr =
+      reinterpret_cast<const int8_t*>(input_weights->data.uint8);
+  const int8_t* recurrent_weights_ptr =
+      reinterpret_cast<const int8_t*>(recurrent_weights->data.uint8);
+  // Get the scale of the quantized weights.
+  float input_weights_scale = input_weights->params.scale;
+  float recurrent_weights_scale = recurrent_weights->params.scale;
+  // Initialize temporary storage for quantized values.
+  int8_t* quantized_input_ptr =
+      reinterpret_cast<int8_t*>(input_scratch->data.uint8);
+  int8_t* quantized_hidden_state_ptr =
+      reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8);
+
+  if (time_major) {
+    // Initialize the pointer to hidden state.
+    float* hidden_state_ptr_batch = hidden_state->data.f;
+    // Unroll the sequence and use batch operations for efficiency.
+    for (int s = 0; s < max_time; s++) {
+      // Initialize the pointer to input and output.
+      const float* input_ptr_batch =
+          input->data.f + s * input_size * batch_size;
+      float* output_ptr_batch = output->data.f + s * num_units * batch_size;
+
+      kernel_utils::RnnBatchStep(
+          input_ptr_batch, input_weights_ptr, input_weights_scale,
+          recurrent_weights_ptr, recurrent_weights_scale, bias_ptr, input_size,
+          num_units, batch_size, params->activation, quantized_input_ptr,
+          quantized_hidden_state_ptr, hidden_state_ptr_batch, output_ptr_batch);
+    }
+  } else {
+    // For each batch
+    for (int b = 0; b < batch_size; b++) {
+      // Initialize the pointer to hidden state.
+      float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units;
+      for (int s = 0; s < max_time; s++) {
+        // Initialize the pointer to input and output.
+        const float* input_ptr_batch =
+            input->data.f + b * input_size * max_time + s * input_size;
+        float* output_ptr_batch =
+            output->data.f + b * num_units * max_time + s * num_units;
+
+        kernel_utils::RnnBatchStep(
+            input_ptr_batch, input_weights_ptr, input_weights_scale,
+            recurrent_weights_ptr, recurrent_weights_scale, bias_ptr,
+            input_size, num_units, /*batch_size=*/1, params->activation,
+            quantized_input_ptr, quantized_hidden_state_ptr,
+            hidden_state_ptr_batch, output_ptr_batch);
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSequenceRNNParams*>(node->builtin_data);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* input_weights = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* recurrent_weights =
+      GetInput(context, node, kRecurrentWeightsTensor);
+  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
+  TfLiteTensor* hidden_state = GetOutput(context, node, kHiddenStateTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (input_weights->type) {
+    case kTfLiteFloat32:
+      return EvalFloat(input, input_weights, recurrent_weights, bias, params,
+                       hidden_state, output);
+    case kTfLiteUInt8: {
+      // TODO(mirkov): implement eval with quantized inputs as well.
+      TfLiteTensor* input_quantized = GetTemporary(context, node, 0);
+      TfLiteTensor* hidden_state_quantized = GetTemporary(context, node, 1);
+      return EvalQuantized(input, input_weights, recurrent_weights, bias,
+                           params, input_quantized, hidden_state_quantized,
+                           hidden_state, output);
+    }
+    default:
+      context->ReportError(context, "Type %d not currently supported.",
+                           input_weights->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
 }  // namespace unidirectional_sequence_rnn
 
 TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_RNN() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-                                 unidirectional_sequence_rnn::Prepare,
-                                 unidirectional_sequence_rnn::Eval};
+  static TfLiteRegistration r = {
+      unidirectional_sequence_rnn::Init, unidirectional_sequence_rnn::Free,
+      unidirectional_sequence_rnn::Prepare, unidirectional_sequence_rnn::Eval};
   return &r;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
index 7e32969763b59620dc3534708f965750680002d2..0adab837b07a6d3bd5d7edd267916cd8e1bb75b2 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
@@ -122,17 +122,66 @@ static float rnn_golden_output[] = {
     0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
     0.628881,   3.58099,    1.49974,   0};
 
+static std::initializer_list<float> rnn_weights = {
+    0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
+    0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
+    0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
+    -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
+    -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
+    -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
+    -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
+    0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
+    0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
+    0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
+    -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
+    0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
+    -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
+    -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
+    0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
+    0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
+    0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
+    -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
+    0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
+    0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
+    -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
+    0.277308,    0.415818};
+
+static std::initializer_list<float> rnn_recurrent_weights = {
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1};
+
+static std::initializer_list<float> rnn_bias = {
+    0.065691948, -0.69055247, 0.1107955,  -0.97084129, -0.23957068, -0.23566568,
+    -0.389184,   0.47481549,  -0.4791103, 0.29931796,  0.10463274,  0.83918178,
+    0.37197268,  0.61957061,  0.3956964,  -0.37609905};
+
 class UnidirectionalRNNOpModel : public SingleOpModel {
  public:
-  UnidirectionalRNNOpModel(int batches, int sequence_len, int units, int size,
-                           bool time_major)
+  UnidirectionalRNNOpModel(
+      int batches, int sequence_len, int units, int size, bool time_major,
+      const TensorType& weights = TensorType_FLOAT32,
+      const TensorType& recurrent_weights = TensorType_FLOAT32)
       : batches_(batches),
         sequence_len_(sequence_len),
         units_(units),
         input_size_(size) {
     input_ = AddInput(TensorType_FLOAT32);
-    weights_ = AddInput(TensorType_FLOAT32);
-    recurrent_weights_ = AddInput(TensorType_FLOAT32);
+    weights_ = AddInput(weights);
+    recurrent_weights_ = AddInput(recurrent_weights);
     bias_ = AddInput(TensorType_FLOAT32);
     hidden_state_ = AddOutput(TensorType_FLOAT32);
     output_ = AddOutput(TensorType_FLOAT32);
@@ -187,7 +236,7 @@ class UnidirectionalRNNOpModel : public SingleOpModel {
   int num_batches() { return batches_; }
   int sequence_len() { return sequence_len_; }
 
- private:
+ protected:
   int input_;
   int weights_;
   int recurrent_weights_;
@@ -201,58 +250,31 @@ class UnidirectionalRNNOpModel : public SingleOpModel {
   int input_size_;
 };
 
-// TODO(mirkov): add another test which directly compares to TF once TOCO
-// supports the conversion from dynamic_rnn with BasicRNNCell.
-TEST(FullyConnectedOpTest, BlackBoxTest) {
+// The hybrid model has quantized weights and recurrent_weights.
+class HybridUnidirectionalRNNOpModel : public UnidirectionalRNNOpModel {
+ public:
+  HybridUnidirectionalRNNOpModel(int batches, int sequence_len, int units,
+                                 int size, bool time_major)
+      : UnidirectionalRNNOpModel(batches, sequence_len, units, size, time_major,
+                                 TensorType_UINT8, TensorType_UINT8) {}
+
+  void SetWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(weights_, f);
+  }
+
+  void SetRecurrentWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(recurrent_weights_, f);
+  }
+};
+
+TEST(UnidirectionalRNNOpTest, BlackBoxTest) {
   UnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                                /*units=*/16, /*size=*/8, /*time_major=*/false);
-  rnn.SetWeights(
-      {0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
-       0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
-       0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
-       -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
-       -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
-       -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
-       -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
-       0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
-       0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
-       0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
-       -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
-       0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
-       -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
-       -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
-       0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
-       0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
-       0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
-       -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
-       0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
-       0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
-       -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
-       0.277308,    0.415818});
-
-  rnn.SetBias({0.065691948, -0.69055247, 0.1107955, -0.97084129, -0.23957068,
-               -0.23566568, -0.389184, 0.47481549, -0.4791103, 0.29931796,
-               0.10463274, 0.83918178, 0.37197268, 0.61957061, 0.3956964,
-               -0.37609905});
-
-  rnn.SetRecurrentWeights({0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1});
-
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
   rnn.ResetHiddenState();
+
   const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
   float* batch_start = rnn_input;
   float* batch_end = batch_start + input_sequence_size;
@@ -270,56 +292,42 @@ TEST(FullyConnectedOpTest, BlackBoxTest) {
   EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
 }
 
-TEST(FullyConnectedOpTest, TimeMajorBlackBoxTest) {
-  UnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
-                               /*units=*/16, /*size=*/8, /*time_major=*/true);
-  rnn.SetWeights(
-      {0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
-       0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
-       0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
-       -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
-       -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
-       -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
-       -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
-       0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
-       0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
-       0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
-       -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
-       0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
-       -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
-       -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
-       0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
-       0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
-       0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
-       -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
-       0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
-       0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
-       -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
-       0.277308,    0.415818});
-
-  rnn.SetBias({0.065691948, -0.69055247, 0.1107955, -0.97084129, -0.23957068,
-               -0.23566568, -0.389184, 0.47481549, -0.4791103, 0.29931796,
-               0.10463274, 0.83918178, 0.37197268, 0.61957061, 0.3956964,
-               -0.37609905});
-
-  rnn.SetRecurrentWeights({0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                           0.1});
+TEST(HybridUnidirectionalRNNOpModelOpTest, BlackBoxTest) {
+  HybridUnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                                     /*units=*/16, /*size=*/8,
+                                     /*time_major=*/false);
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
+  rnn.ResetHiddenState();
+
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  float* batch_start = rnn_input;
+  float* batch_end = batch_start + input_sequence_size;
+  rnn.SetInput(0, batch_start, batch_end);
+  rnn.SetInput(input_sequence_size, batch_start, batch_end);
+
+  rnn.Invoke();
+
+  float* golden_start = rnn_golden_output;
+  float* golden_end = golden_start + rnn.num_units() * rnn.sequence_len();
+  std::vector<float> expected;
+  expected.insert(expected.end(), golden_start, golden_end);
+  expected.insert(expected.end(), golden_start, golden_end);
+
+  EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                   expected, /*max_abs_error=*/0.013)));
+}
 
+TEST(UnidirectionalRNNOpTest, TimeMajorBlackBoxTest) {
+  UnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                               /*units=*/16, /*size=*/8,
+                               /*time_major=*/true);
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
   rnn.ResetHiddenState();
+
   for (int i = 0; i < rnn.sequence_len(); i++) {
     float* batch_start = rnn_input + i * rnn.input_size();
     float* batch_end = batch_start + rnn.input_size();
@@ -341,6 +349,37 @@ TEST(FullyConnectedOpTest, TimeMajorBlackBoxTest) {
   EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
 }
 
+TEST(HybridUnidirectionalRNNOpModelOpTest, TimeMajorBlackBoxTest) {
+  HybridUnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                                     /*units=*/16, /*size=*/8,
+                                     /*time_major=*/true);
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
+  rnn.ResetHiddenState();
+
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_batch_start = rnn_golden_output + i * rnn.num_units();
+    float* golden_batch_end = golden_batch_start + rnn.num_units();
+    expected.insert(expected.end(), golden_batch_start, golden_batch_end);
+    expected.insert(expected.end(), golden_batch_start, golden_batch_end);
+  }
+
+  EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                   expected, /*max_abs_error=*/0.013)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 590f042e21670912c20da58771902c97effdae64..80fcb28bc7f6c09c7b979fcefcbc25deef583a00 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -184,8 +184,10 @@ TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() {
   TfLiteStatus status = kTfLiteOk;
   auto opcodes = model_->operator_codes();
   for (const OperatorCode* opcode : *opcodes) {
-    TfLiteRegistration* registration = nullptr;
+    const TfLiteRegistration* registration = nullptr;
     auto builtin_code = opcode->builtin_code();
+    int version = opcode->version();
+
     if (builtin_code > BuiltinOperator_MAX ||
         builtin_code < BuiltinOperator_MIN) {
       error_reporter_->Report(
@@ -194,8 +196,7 @@ TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() {
           builtin_code);
       status = kTfLiteError;
     } else if (builtin_code != BuiltinOperator_CUSTOM) {
-      flatbuffer_op_index_to_registration_types_.push_back(builtin_code);
-      registration = op_resolver_.FindOp(builtin_code);
+      registration = op_resolver_.FindOp(builtin_code, version);
       if (registration == nullptr) {
         error_reporter_->Report("Didn't find op for builtin opcode '%s'\n",
                                 EnumNameBuiltinOperator(builtin_code));
@@ -207,11 +208,13 @@ TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() {
       status = kTfLiteError;
     } else {
       const char* name = opcode->custom_code()->c_str();
-      registration = op_resolver_.FindOp(name);
+      registration = op_resolver_.FindOp(name, version);
       flatbuffer_op_index_to_registration_types_.push_back(
           BuiltinOperator_CUSTOM);
       if (registration == nullptr) {
-        error_reporter_->Report("Didn't find custom op for name '%s'\n", name);
+        error_reporter_->Report(
+            "Didn't find custom op for name '%s' with version %d\n", name,
+            version);
         status = kTfLiteError;
       }
     }
@@ -333,6 +336,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->stride_height = conv_params->stride_h();
         params->activation =
             parse_activation(conv_params->fused_activation_function());
+
         params->dilation_width_factor = conv_params->dilation_w_factor();
         params->dilation_height_factor = conv_params->dilation_h_factor();
       }
@@ -352,6 +356,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_PRELU:
     case BuiltinOperator_FLOOR:
     case BuiltinOperator_NEG:
+    case BuiltinOperator_SIN:
       break;
     case BuiltinOperator_CAST: {
       TfLiteCastParams* params = MallocPOD<TfLiteCastParams>();
@@ -569,6 +574,9 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_PAD: {
       break;
     }
+    case BuiltinOperator_PADV2: {
+      break;
+    }
     case BuiltinOperator_RESHAPE: {
       auto* params = MallocPOD<TfLiteReshapeParams>();
       if (auto* schema_params = op->builtin_options_as_ReshapeOptions()) {
@@ -669,7 +677,26 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_LESS: {
+    case BuiltinOperator_GREATER:
+    case BuiltinOperator_GREATER_EQUAL:
+    case BuiltinOperator_LESS:
+    case BuiltinOperator_LESS_EQUAL:
+    case BuiltinOperator_SELECT: {
+      break;
+    }
+    case BuiltinOperator_SLICE: {
+      break;
+    }
+    case BuiltinOperator_TRANSPOSE_CONV: {
+      TfLiteTransposeConvParams* params =
+          MallocPOD<TfLiteTransposeConvParams>();
+      if (auto* transpose_conv_params =
+              op->builtin_options_as_TransposeConvOptions()) {
+        params->padding = parse_padding(transpose_conv_params->padding());
+        params->stride_width = transpose_conv_params->stride_w();
+        params->stride_height = transpose_conv_params->stride_h();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_DELEGATE: {
@@ -696,27 +723,30 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
       status = kTfLiteError;
       continue;
     }
-    const TfLiteRegistration* reg =
+
+    const TfLiteRegistration* registration =
         flatbuffer_op_index_to_registration_[op->opcode_index()];
-    if (reg == nullptr) {
+    if (registration == nullptr) {
       error_reporter_->Report("Skipping op for opcode_index %d\n", index);
       status = kTfLiteError;
       continue;
     }
 
-    auto op_type =
-        flatbuffer_op_index_to_registration_types_[op->opcode_index()];
+    BuiltinOperator op_type =
+        static_cast<BuiltinOperator>(registration->builtin_code);
+
     if (op_type != BuiltinOperator_CUSTOM && op->custom_options()) {
       error_reporter_->Report(
           "Found builtin operator %s with custom options.\n",
           EnumNameBuiltinOperator(op_type));
     }
+
     if (op->custom_options()) {
       interpreter->AddNodeWithParameters(
           FlatBufferIntArrayToVector(op->inputs()),
           FlatBufferIntArrayToVector(op->outputs()),
           reinterpret_cast<const char*>(op->custom_options()->data()),
-          op->custom_options()->size(), nullptr, reg);
+          op->custom_options()->size(), nullptr, registration);
     } else {
       void* builtin_data = nullptr;
       TF_LITE_ENSURE_STATUS(
@@ -724,7 +754,7 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
       interpreter->AddNodeWithParameters(
           FlatBufferIntArrayToVector(op->inputs()),
           FlatBufferIntArrayToVector(op->outputs()), nullptr, 0, builtin_data,
-          reg);
+          registration);
     }
   }
 
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
index 5a55b031a8c28085e02782608eb820a3cfe78dde..3946b490417104f620ecb55bb22d4ef99fd33bb7 100644
--- a/tensorflow/contrib/lite/model.h
+++ b/tensorflow/contrib/lite/model.h
@@ -37,6 +37,7 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 
 namespace tflite {
@@ -131,18 +132,6 @@ class FlatBufferModel {
   Allocation* allocation_ = nullptr;
 };
 
-// Abstract interface that returns TfLiteRegistrations given op codes or custom
-// op names. This is the mechanism that ops being referenced in the flatbuffer
-// model are mapped to executable function pointers (TfLiteRegistrations).
-class OpResolver {
- public:
-  // Finds the op registration for a builtin operator by enum code.
-  virtual TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const = 0;
-  // Finds the op registration of a custom operator by op name.
-  virtual TfLiteRegistration* FindOp(const char* op) const = 0;
-  virtual ~OpResolver() {}
-};
-
 // Build an interpreter capable of interpreting `model`.
 //
 // model: a scoped model whose lifetime must be at least as long as
@@ -187,7 +176,7 @@ class InterpreterBuilder {
   const OpResolver& op_resolver_;
   ErrorReporter* error_reporter_;
 
-  std::vector<TfLiteRegistration*> flatbuffer_op_index_to_registration_;
+  std::vector<const TfLiteRegistration*> flatbuffer_op_index_to_registration_;
   std::vector<BuiltinOperator> flatbuffer_op_index_to_registration_types_;
   const Allocation* allocation_ = nullptr;
 };
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
index ae6c1ece18963f11f48a6f07bea4065ce39687e0..15bae21a411c1241cf71ab4d3f0e0289eaac8ef3 100644
--- a/tensorflow/contrib/lite/model_test.cc
+++ b/tensorflow/contrib/lite/model_test.cc
@@ -55,11 +55,12 @@ class TrivialResolver : public OpResolver {
   explicit TrivialResolver(TfLiteRegistration* constant_return = nullptr)
       : constant_return_(constant_return) {}
   // Find the op registration of a custom operator by op name.
-  TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override {
+  const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
+                                   int version) const override {
     return constant_return_;
   }
   // Find the op registration of a custom operator by op name.
-  TfLiteRegistration* FindOp(const char* op) const override {
+  const TfLiteRegistration* FindOp(const char* op, int version) const override {
     return constant_return_;
   }
 
diff --git a/tensorflow/contrib/lite/models/smartreply/BUILD b/tensorflow/contrib/lite/models/smartreply/BUILD
index a82d1f2eb673b9b7211581f5a9f9febc140d4d1e..8b5fa240ac31d9ee61879c42aee3c5d449ae60db 100644
--- a/tensorflow/contrib/lite/models/smartreply/BUILD
+++ b/tensorflow/contrib/lite/models/smartreply/BUILD
@@ -22,7 +22,6 @@ cc_library(
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/tools:mutable_op_resolver",
         "@com_google_absl//absl/strings",
         "@com_googlesource_code_re2//:re2",
         "@farmhash_archive//:farmhash",
@@ -39,7 +38,6 @@ cc_library(
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/tools:mutable_op_resolver",
         "@com_google_absl//absl/strings",
         "@com_googlesource_code_re2//:re2",
     ],
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc b/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
index f97a6486d6c11cf0184622f515fe5b1e096c6257..29c8ad2286d705ea60fcd258e7283f6e1c3b70b8 100644
--- a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
+++ b/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
@@ -61,7 +61,7 @@ bool IsValidNgram(const tflite::StringRef& strref) {
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteIntArray* outputSize1 = TfLiteIntArrayCreate(1);
   TfLiteIntArray* outputSize2 = TfLiteIntArrayCreate(1);
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   int dim = input->dims->data[0];
   if (dim == 0) {
     // TFLite non-string output should have size greater than 0.
@@ -76,7 +76,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* input = GetInput(context, node, 0);
+  const TfLiteTensor* input = GetInput(context, node, 0);
   int num_strings = tflite::GetStringCount(input);
   TfLiteTensor* label = GetOutput(context, node, 0);
   TfLiteTensor* weight = GetOutput(context, node, 1);
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor.cc b/tensorflow/contrib/lite/models/smartreply/predictor.cc
index 6da5cc8eecc0920850f666b0992c4d9598c55b6c..5d6c47dce8d90192d35a3a51fe6d0beb11f3b23f 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor.cc
+++ b/tensorflow/contrib/lite/models/smartreply/predictor.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
 #include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
 
 void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
 
@@ -104,11 +104,11 @@ void GetSegmentPredictions(
             });
 
   // Add backoff response.
-  for (const string& backoff : config.backoff_responses) {
+  for (const auto& backoff : config.backoff_responses) {
     if (predictor_responses->size() >= config.num_response) {
       break;
     }
-    predictor_responses->push_back({backoff, config.backoff_confidence});
+    predictor_responses->emplace_back(backoff, config.backoff_confidence);
   }
 }
 
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 4a648e42837fbf6b7326c315be202ae0a80a47ca..becd1f615f04a806cba9c494323285c004ec41df 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -65,7 +65,8 @@ inline bool NNAPIExists() {
   return nnapi_is_available;
 }
 
-// nn api types
+// NN api types based on NNAPI header file
+// https://developer.android.com/ndk/reference/group/neural-networks
 
 /**
  * Operand types.
@@ -77,31 +78,11 @@ inline bool NNAPIExists() {
  * ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, and ANEURALNETWORKS_INT32.
  */
 enum {
-  /** The following entries are used to declare scalars. */
-
-  /** A 32 bit floating point scalar value. */
   ANEURALNETWORKS_FLOAT32 = 0,
-  /** A signed 32 bit integer scalar value. */
   ANEURALNETWORKS_INT32 = 1,
-  /** An unsigned 32 bit integer scalar value. */
   ANEURALNETWORKS_UINT32 = 2,
-
-  /** The following entries are used to declare tensors. */
-
-  /** A tensor of 32 bit floating point values. */
   ANEURALNETWORKS_TENSOR_FLOAT32 = 3,
-  /** A tensor of 32 bit integer values. */
   ANEURALNETWORKS_TENSOR_INT32 = 4,
-  /** A tensor of 8 bit integers that represent real numbers.
-   *
-   * Attached to this tensor are two numbers that can be used to convert
-   * the 8 bit integer to the real value and vice versa.  These two numbers are:
-   * - scale: a 32 bit floating point value
-   * - zero_value: an 32 bit integer
-   *
-   * The formula is:
-   * real_value = (integer_value - zero_value) * scale.
-   */
   ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5,
 };
 
@@ -111,968 +92,44 @@ enum {
  * The type of operations that can be added to a model.
  */
 enum {
-  /** Adds two tensors, element-wise.
-   *
-   * Takes two input tensors of identical type and compatible dimensions. The
-   * output is the sum of both input tensors, optionally modified by an
-   * activation function.
-   *
-   * Two dimensions are compatible when:
-   *     1. they are equal, or
-   *     2. one of them is 1
-   *
-   * The size of the output is the maximum size along each dimension of the
-   * input operands. It starts with the trailing dimensions, and works its way
-   * forward.
-   *
-   * Example:
-   *
-   *     input1.dimension = {4, 1, 2}
-   *     input2.dimension = {5, 4, 3, 1}
-   *     output.dimension = {5, 4, 3, 2}
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   *
-   * Supported tensor rank: up to 4
-   *
-   * Inputs:
-   * * 0: A tensor.
-   * * 1: A tensor of the same type, and compatible dimensions as input0.
-   * * 2: An INT32 value, and has to be one of the {@link FuseCode} values.
-   *      Specifies the activation to invoke on the result of each addition.
-   *
-   * Outputs:
-   * * 0: The sum, a tensor of the same type as input0.
-   */
   ANEURALNETWORKS_ADD = 0,
-  /** Performs a 2-D average pooling operation.
-   *
-   * The output dimensions are functions of the filter dimensions, stride, and
-   * padding.
-   *
-   * The values in the output tensor are computed as:
-   *
-   *     output[batch, row, col, channel] =
-   *         sum_{i, j}(input[batch, row + i, col + j, channel]) / sum(1)
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: 4, with "NHWC" data layout.
-   *
-   * Inputs:
-   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
-   * input.
-   * * 1: An INT32 value, specifying the padding on the left, in the ‘width’
-   * dimension.
-   * * 2: An INT32 value, specifying the padding on the right,in the ‘width’
-   * dimension.
-   * * 3: An INT32 value, specifying the padding on the top, in the ‘height’
-   * dimension.
-   * * 4: An INT32 value, specifying the padding on the bottom, in the ‘height’
-   * dimension.
-   * * 5: An INT32 value, specifying the output stride in the ‘width’ dimension.
-   * * 6: An INT32 value, specifying the output stride in the ‘height’
-   * dimension.
-   * * 7: An INT32 value, specifying the filter width.
-   * * 8: An INT32 value, specifying the filter height.
-   * * 9: An INT32 value, and has to be one of the {@link FuseCode} values.
-   *      Specifies the activation to invoke on the result of each addition.
-   *
-   * Outputs:
-   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
-   * depth].
-   */
   ANEURALNETWORKS_AVERAGE_POOL_2D = 1,
-  /** Concatenates the input tensors along the given dimension.
-   *
-   * The input tensors must have identical type and the same dimensions except
-   * the dimension along the concatenation axis.
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: up to 4
-   *
-   * Inputs:
-   * 0 ~ n: The list on n input tensors, of shape [D0, D1, ..., Daxis(i), ...,
-   * Dm] n+1: An INT32 value, specifying the concatenation axis. n+2: An INT32
-   * value, and has to be one of the {@link FuseCode} values. Specifies the
-   * activation to invoke on the result of each addition.
-   *
-   * Outputs:
-   * * 0: The output, a tensor of the same type as the input tensors.
-   *      The output shape is [D0, D1, ..., sum(Daxis(i)), ..., Dm].
-   */
   ANEURALNETWORKS_CONCATENATION = 2,
-  /** Performs an 2-D convolution operation.
-   *
-   * The CONV_2D op sweeps a 2-D filter that can mix channels together over a
-   * batch of images, applying the filter to each window of each image of the
-   * appropriate size.
-   *
-   * The output dimensions are functions of the filter dimensions, stride, and
-   * padding.
-   *
-   * The values in the output tensor are computed as:
-   *
-   *     output[batch, row, col, channel] =
-   *         sum_{i, j} (
-   *             input[batch, row + i, col + j, k] *
-   *             filter[channel, row + i, col + j, k] +
-   *             bias[channel]
-   *         )
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: 4, with "NHWC" data layout.
-   *
-   * Inputs:
-   * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], specifying
-   * the input.
-   * * 1: A 4-D tensor, of shape [depth_out, filter_height, filter_width,
-   * depth_in], specifying the filter.
-   * * 2: A 1-D tensor, of shape [depth_out], specifying the bias.
-   *      For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32} type, the
-   * bias should also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}. For input
-   * tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} type, the bias should
-   * be of {@link ANEURALNETWORKS_TENSOR_INT32}.
-   * * 3: An INT32 value, specifying the padding on the left, in the ‘width’
-   * dimension.
-   * * 4: An INT32 value, specifying the padding on the right,in the ‘width’
-   * dimension.
-   * * 5: An INT32 value, specifying the padding on the top, in the ‘height’
-   * dimension.
-   * * 6: An INT32 value, specifying the padding on the bottom, in the ‘height’
-   * dimension.
-   * * 7: An INT32 value, specifying the output stride in the ‘width’ dimension.
-   * * 8: An INT32 value, specifying the output stride in the ‘height’
-   * dimension.
-   * * 9: An INT32 value, and has to be one of the {@link FuseCode} values.
-   *      Specifies the activation to invoke on the result of each addition.
-   *
-   * Outputs:
-   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
-   * depth_out].
-   */
   ANEURALNETWORKS_CONV_2D = 3,
-  /** Performs a depthwise 2-D convolution operation.
-   *
-   * Given an input tensor of shape [batches, height, width, depth_in] and a
-   * filter tensor of shape [depth_out, filter_height, filter_width, depth_in]
-   * containing in_channels convolutional filters of depth 1, DEPTHWISE_CONV
-   * applies a different filter to each input channel (expanding from 1 channel
-   * to channel_multiplier channels for each), then concatenates the results
-   * together.
-   *
-   * The output has depth_out = depth_in * depth_multiplier channels.
-   * The output dimensions are functions of the filter dimensions, stride, and
-   * padding.
-   *
-   * The values in the output tensor are computed as:
-   *
-   *     output[b, i, j, k * channel_multiplier + q] =
-   *         sum_{di, dj} (
-   *             input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-   *             filter[di, dj, k, q]
-   *         )
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: 4, with "NHWC" data layout.
-   *
-   * Inputs:
-   * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], specifying
-   * the input.
-   * * 1: A 4-D tensor, of shape [depth_out, filter_height, filter_width,
-   * depth_in], specifying the filter.
-   * * 2: A 1-D tensor, of shape [depth_out], specifying the bias.
-   *      For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32} type, the
-   * bias should also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}. For input
-   * tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} type, the bias should
-   * be of {@link ANEURALNETWORKS_TENSOR_INT32}.
-   * * 3: An INT32 value, specifying the padding on the left, in the ‘width’
-   * dimension.
-   * * 4: An INT32 value, specifying the padding on the right,in the ‘width’
-   * dimension.
-   * * 5: An INT32 value, specifying the padding on the top, in the ‘height’
-   * dimension.
-   * * 6: An INT32 value, specifying the padding on the bottom, in the ‘height’
-   * dimension.
-   * * 7: An INT32 value, specifying the output stride in the ‘width’ dimension.
-   * * 8: An INT32 value, specifying the output stride in the ‘height’
-   * dimension.
-   * * 9: An INT32 value, specifying the depthwise multiplier.
-   * * 10: An INT32 value, and has to be one of the {@link FuseCode} values.
-   *       Specifies the activation to invoke on the result of each addition.
-   *
-   * Outputs:
-   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
-   * depth_out].
-   */
   ANEURALNETWORKS_DEPTHWISE_CONV_2D = 4,
-  /** Rearranges data from depth into blocks of spatial data.
-   *
-   * More specifically, this op outputs a copy of the input tensor where values
-   * from the depth dimension are moved in spatial blocks to the height and
-   * width dimensions. The value block_size indicates the input block size and
-   * how the data is moved.
-   *
-   * Chunks of data of size block_size * block_size from depth are rearranged
-   * into non-overlapping blocks of size block_size x block_size.
-   *
-   * The width of the output tensor is input_depth * block_size, whereas the
-   * height is input_height * block_size. The depth of the input tensor must be
-   * divisible by block_size * block_size
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: 4, with "NHWC" data layout.
-   *
-   * Inputs:
-   * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], specifying
-   * the input.
-   * * 1: An INT32 value, specifying the block_size. block_size must be >=1 and
-   *      block_size * block_size must be a divisor of the input depth.
-   *
-   * Outputs:
-   * * 0: The output 4-D tensor, of shape [batch, height*block_size,
-   * width*block_size, depth/(block_size*block_size)].
-   */
   ANEURALNETWORKS_DEPTH_TO_SPACE = 5,
-  /** Dequantizes the input tensor.
-   *
-   * The formula is:
-   *
-   *     output = (input - zero_value) * scale.
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: up to 4
-   *
-   * Inputs:
-   * * 0: A tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}.
-   *
-   * Outputs:
-   * * 0: The output tensor of same shape as input0, but with type
-   *      {@link ANEURALNETWORKS_TENSOR_FLOAT32}.
-   */
   ANEURALNETWORKS_DEQUANTIZE = 6,
-
-  /**
-   * Looks up items from a given tensor.
-   *
-   * Each item in the output is a raw copy of the corresponding item in
-   * the input “values”. If the given “lookup” indices are out of bounds,
-   * the op will fail and an error will be reported.
-   *
-   * Inputs:
-   * * 0: Values. An n-D tensor of any type X (where n >= 2). E.g., if n is 2,
-   *      then the shape would be [lookup_dimension, values_dimension], where
-   *      “lookup_dimension” corresponds to the indexing dimension in the lookup
-   *      table, and “values_dimension” to the contents.
-   * * 1: Lookups. An 1-D tensor of type T, of shape [lookup_size], where
-   *      “lookup_size” is the number of elements to look for, and each entry
-   *      corresponds to the first dimension of the “values” tensor.
-   *
-   * Output:
-   * * 0: A n-D tensor of type X and the same rank and shape as the “values”
-   *      tensor, except for the first dimension which has size “lookup_size”.
-   */
   ANEURALNETWORKS_EMBEDDING_LOOKUP = 7,
-
-  /** Computes element-wise floor() on the input tensor.
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   *
-   * Supported tensor rank: up to 4
-   *
-   * Inputs:
-   * * 0: A tensor.
-   *
-   * Outputs:
-   * * 0: The output, a tensor of the same type and dimensions as input0.
-   */
   ANEURALNETWORKS_FLOOR = 8,
-  /** Denotes a fully (densely) connected layer, which connects all elements in
-   * the input tensor with each element in the output tensor.
-   *
-   * This layer implements the operation:
-   *
-   *     outputs = activation(inputs * weights’ + bias)
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: up to 4.
-   *
-   * Inputs:
-   * * 0: A tensor, specifying the input. If rank is greater than 2, then it
-   * gets flattened to a 2-D Tensor. The 2-D Tensor is handled as if dimensions
-   * corresponded to shape [batch_size, input_size], where “batch_size”
-   * corresponds to the batching dimension, and “input_size” is the size of the
-   * input.
-   * * 1: A 2-D tensor, specifying the weights, of shape [num_units,
-   * input_size], where "num_units" corresponds to the number of output nodes.
-   * * 2: A 1-D tensor, of shape [num_units], specifying the bias.
-   *      For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32} type, the
-   * bias should also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}. For input
-   * tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} type, the bias should
-   * be of {@link ANEURALNETWORKS_TENSOR_INT32}.
-   * * 3: An INT32 value, and has to be one of the {@link FuseCode} values.
-   *      Specifies the activation to invoke on the result of each addition.
-   *
-   * Outputs:
-   * * 0: The output tensor, of shape [batch_size, num_units].
-   */
   ANEURALNETWORKS_FULLY_CONNECTED = 9,
-
-  /**
-   * Looks up values of a hash table with given keys.
-   *
-   * Inputs:
-   * * 0: Lookups. A 1-D int32 tensor with shape [ k ].
-   * * 1: Keys. A 1-D int32 tensor with shape [ n ], *MUST* be sorted in
-   *      ascending order.
-   * * 2: Values. A tensor with shape [ n … ].
-   *
-   * Outputs:
-   * * 0: Output. A tensor with shape [ k …].
-   * * 1: Hits. A uint8 tensor with shape [ k ] indicates whether the lookup
-   *      hits or not.
-   */
   ANEURALNETWORKS_HASHTABLE_LOOKUP = 10,
-
-  /** Applies L2 normalization along the depth dimension.
-   *
-   * The values in the output tensor are computed as:
-   *
-   *     output[batch, row, col, channel] =
-   *         input[batch, row, col, channel] /
-   *         sqrt(sum_{c} pow(input[batch, row, col, c], 2))
-   *
-   * For x with more dimensions, independently normalizes each 1-D slice along
-   * dimension dim.
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   *
-   * Supported tensor rank: 4, with "NHWC" data layout.
-   *
-   * Inputs:
-   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
-   * input.
-   *
-   * Outputs:
-   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
-   * depth].
-   */
   ANEURALNETWORKS_L2_NORMALIZATION = 11,
-
-  /** Performs an 2-D L2 pooling operation.
-   *
-   * The output dimensions are functions of the filter dimensions, stride, and
-   * padding.
-   *
-   * The values in the output tensor are computed as:
-   *
-   *     output[batch, row, col, channel] =
-   *         sqrt(sum_{i, j} pow(input[batch, row + i, col + j, channel], 2) /
-   * sum(1))
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   *
-   * Supported tensor rank: 4, with "NHWC" data layout.
-   *
-   * Inputs:
-   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
-   * input.
-   * * 1: An INT32 value, specifying the padding on the left, in the ‘width’
-   * dimension.
-   * * 2: An INT32 value, specifying the padding on the right,in the ‘width’
-   * dimension.
-   * * 3: An INT32 value, specifying the padding on the top, in the ‘height’
-   * dimension.
-   * * 4: An INT32 value, specifying the padding on the bottom, in the ‘height’
-   * dimension.
-   * * 5: An INT32 value, specifying the output stride in the ‘width’ dimension.
-   * * 6: An INT32 value, specifying the output stride in the ‘height’
-   * dimension.
-   * * 7: An INT32 value, specifying the filter width.
-   * * 8: An INT32 value, specifying the filter height.
-   * * 9: An INT32 value, and has to be one of the {@link FuseCode} values.
-   *      Specifies the activation to invoke on the result of each addition.
-   *
-   * Outputs:
-   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
-   * depth].
-   */
   ANEURALNETWORKS_L2_POOL_2D = 12,
-  /** Applies Local Response Normalization along the depth dimension.
-   *
-   * The 4-D input tensor is treated as a 3-D array of 1-D vectors (along the
-   * last dimension), and each vector is normalized independently. Within a
-   * given vector, each component is divided by the weighted, squared sum of
-   * inputs within depth_radius.
-   *
-   * The output is calculated using this formula:
-   *
-   *     sqr_sum[a, b, c, d] =
-   *         sum(pow(input[a, b, c, d - depth_radius : d + depth_radius + 1], 2)
-   *     output = input / pow((bias + alpha * sqr_sum), beta)
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   *
-   * Supported tensor rank: 4, with "NHWC" data layout.
-   *
-   * Inputs:
-   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
-   * input.
-   * * 1: An INT32 value, specifying the radius of the normalization window.
-   * * 2: A FLOAT32 value, specifying the bias, must not be zero.
-   * * 3: A FLOAT32 value, specifying the scale factor, alpha.
-   * * 4: A FLOAT32 value, specifying the exponent, beta.
-   *
-   * Outputs:
-   * * 0: The output tensor of same shape as input0.
-   */
   ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION = 13,
-  /** Computes sigmoid activation on the input tensor element-wise.
-   *
-   * The output is calculated using this formula:
-   *
-   *     output = 1 / (1 + exp(-input))
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: up to 4.
-   *
-   * Inputs:
-   * * 0: A tensor, specifying the input.
-   *
-   * Outputs:
-   * * 0: The output tensor of same shape as input0.
-   */
   ANEURALNETWORKS_LOGISTIC = 14,
-
-  /**
-   * Projects an input to a bit vector via locality sensitive hashing.
-   *
-   * Inputs:
-   * * 0: Hash functions. Dim.size == 2, DataType: Float.
-   *            Tensor[0].Dim[0]: Number of hash functions.
-   *            Tensor[0].Dim[1]: Number of seeds per hash functions.
-   *            Tensor[0].Dim[1] <= 32 in sparse case.
-   *
-   * * 1: Input. Dim.size >= 1, no restriction on DataType.
-   * * 2: Weight. Optional. Dim.size == 1, DataType: Float.
-   *     If not set, each input element is considered to have the same weight of
-   *     1.0.
-   *     Tensor[1].Dim[0] == Tensor[2].Dim[0]
-   * * 3: Type:
-   *        Sparse: Value LSHProjectionType_SPARSE(=1).
-   *          Computed bit vector is considered to be sparse.
-   *          Each output element is an int32 made up of multiple bits computed
-   * from hash functions.
-   *
-   *        Dense: Value LSHProjectionType_DENSE(=2).
-   *          Computed bit vector is considered to be dense. Each output element
-   *          represents a bit and can take the value of either 0 or 1.
-   *
-   * Outputs:
-   * * 0: If the projection type is sparse:
-   *        Output.Dim == { Tensor[0].Dim[0] }
-   *        A tensor of int32 that represents hash signatures.
-   *      If the projection type is Dense:
-   *        Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] }
-   *        A flattened tensor that represents projected bit vectors.
-   */
   ANEURALNETWORKS_LSH_PROJECTION = 15,
-
-  /**
-   * Long short-term memory unit (LSTM) recurrent network layer.
-   *
-   * The default non-peephole implementation is based on:
-   * http://www.bioinf.jku.at/publications/older/2604.pdf
-   * S. Hochreiter and J. Schmidhuber. "Long Short-Term Memory". Neural
-   * Computation, 9(8):1735-1780, 1997.
-   *
-   * The peephole implementation is based on:
-   * https://research.google.com/pubs/archive/43905.pdf
-   * Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory
-   * recurrent neural network architectures for large scale acoustic modeling."
-   * INTERSPEECH, 2014.
-   *
-   * The coupling of input and forget gate (CIFG) is based on:
-   * http://arxiv.org/pdf/1503.04069.pdf
-   * Greff et al. "LSTM: A Search Space Odyssey"
-   *
-   * The class has the following independently optional inputs:
-   * * If input gate (if CIFG): “input_to_forget_weights”,
-   *   “recurrent_to_input_weights”, “cell_to_input_weights”, “input_gate_bias”.
-   * * If no peephole connections: “cell_to_input_weights”,
-   *   “cell_to_forget_weights”, “cell_to_output_weights”.
-   * * If no projection layer: “projection_weights” and “projection_bias”.
-   * * If no projection bias: “projection_bias”.
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   *
-   * Inputs:
-   * * 0: Input.
-   *      A 2-D tensor of type T, of shape [batch_size, input_size], where
-   *      “batch_size” corresponds to the batching dimension, and “input_size”
-   *      is the size of the input.
-   * * 1: input_to_input_weights.
-   *      A 2-D tensor of type T, of shape [num_units, input_size], where
-   *      “num_units” corresponds to the number of cell units.
-   * * 2: input_to_forget_weights.
-   *      A 2-D tensor of type T, of shape [num_units, input_size].
-   * * 3: input_to_cell_weights.
-   *      A 2-D tensor of type T, of shape [num_units, input_size].
-   * * 4: input_to_output_weights.
-   *      A 2-D tensor of type T, of shape [num_units, input_size].
-   * * 5: recurrent_to_input_weights.
-   *      A 2-D tensor of type T, of shape [num_units, output_size], where
-   *      “output_size” corresponds to either the number of cell units (i.e.,
-   *      “num_units”), or the second dimension of the “projection_weights”, if
-   *      defined.
-   * * 6: recurrent_to_forget_weights.
-   *      A 2-D tensor of type T, of shape [num_units, output_size].
-   * * 7: recurrent_to_cell_weights.
-   *      A 2-D tensor of type T, of shape [num_units, output_size].
-   * * 8: recurrent_to_output_weights.
-   *      A 2-D tensor of type T, of shape [num_units, output_size].
-   * * 9: cell_to_input_weights.
-   *      A 1-D tensor of type T, of shape [num_units].
-   * * 10:cell_to_forget_weights.
-   *      A 1-D tensor of type T, of shape [num_units].
-   * * 11:cell_to_output_weights.
-   *      A 1-D tensor of type T, of shape [num_units].
-   * * 12:input_gate_bias.
-   *      A 1-D tensor of type T, of shape [num_units].
-   * * 13:forget_gate_bias.
-   *      A 1-D tensor of type T, of shape [num_units].
-   * * 14:cell_bias.
-   *      A 1-D tensor of type T, of shape [num_units].
-   * * 15:output_gate_bias.
-   *      A 1-D tensor of type T, of shape [num_units].
-   * * 16:projection_weights.
-   *      A 2-D tensor of type T, of shape [output_size, num_units].
-   * * 17:projection_bias.
-   *      A 1-D tensor of type T, of shape [output_size].
-   *
-   * Parameters:
-   * * 18:fused_activation_function.
-   *      An (optional) ActivationFunctionType indicating the activation
-   *      function.
-   *      If “NONE” is specified then it results in a linear activation.
-   * * 19:cell_clip.
-   *      A clipping threshold for the cell state, such that values are bound
-   *      within [-cell_clip, cell_clip]. If set to 0.0 then clipping is
-   *      disabled.
-   * * 20:proj_clip.
-   *      A clipping threshold for the output from the projection layer, such
-   *      that values are bound within [-proj_clip, proj_clip]. If set to 0.0
-   *      then clipping is disabled.
-   *
-   * Outputs:
-   * * 0: scratch_buffer.
-   *      A 3-D tensor of type T, of shape [batch_size, num_cell, 4].
-   * * 1: output_state.
-   *      A 2-D tensor of type T, of shape [batch_size, output_size].
-   * * 2: cell_state.
-   *      A 2-D tensor of type T, of shape [batch_size, num_units].
-   * * 3: output.
-   *      A 2-D tensor of type T, of shape [batch_size, output_size]. This is
-   *      effectively the same as the current “output_state” value.
-   */
   ANEURALNETWORKS_LSTM = 16,
-
-  /** Performs an 2-D max pooling operation.
-   *
-   * The output dimensions are functions of the filter dimensions, stride, and
-   * padding.
-   *
-   * The values in the output tensor are computed as:
-   *
-   *     output[batch, row, col, channel] =
-   *         max_{i, j} (input[batch, row + i, col + j, channel])
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: 4, with "NHWC" data layout.
-   *
-   * Inputs:
-   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
-   * input.
-   * * 1: An INT32 value, specifying the padding on the left, in the ‘width’
-   * dimension.
-   * * 2: An INT32 value, specifying the padding on the right,in the ‘width’
-   * dimension.
-   * * 3: An INT32 value, specifying the padding on the top, in the ‘height’
-   * dimension.
-   * * 4: An INT32 value, specifying the padding on the bottom, in the ‘height’
-   * dimension.
-   * * 5: An INT32 value, specifying the output stride in the ‘width’ dimension.
-   * * 6: An INT32 value, specifying the output stride in the ‘height’
-   * dimension.
-   * * 7: An INT32 value, specifying the filter width.
-   * * 8: An INT32 value, specifying the filter height.
-   * * 9: An INT32 value, and has to be one of the {@link FuseCode} values.
-   *      Specifies the activation to invoke on the result of each addition.
-   *
-   * Outputs:
-   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
-   * depth].
-   */
   ANEURALNETWORKS_MAX_POOL_2D = 17,
-
-  /** Multiplies two tensors, element-wise.
-   *
-   * Takes two input tensors of identical type and compatible dimensions. The
-   * output is the product of both input tensors, optionally modified by an
-   * activation function.
-   *
-   * Two dimensions are compatible when:
-   *     1. they are equal, or
-   *     2. one of them is 1
-   *
-   * The size of the resulting output is the maximum size along each dimension
-   * of the input operands. It starts with the trailing dimensions, and works
-   * its way forward.
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   *
-   * Supported tensor rank: up to 4
-   *
-   * Inputs:
-   * * 0: A tensor.
-   * * 1: A tensor of the same type, and compatible dimensions as input0.
-   * * 2: An INT32 value, and has to be one of the {@link FuseCode} values.
-   *      Specifies the activation to invoke on the result of each addition.
-   *
-   * Outputs:
-   * * 0: The product, a tensor of the same type as input0.
-   */
   ANEURALNETWORKS_MUL = 18,
-  /** Computes rectified linear activation on the input tensor element-wise.
-   *
-   * The output is calculated using this formula:
-   *
-   *     output = max(0, input)
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: up to 4.
-   *
-   * Inputs:
-   * * 0: A tensor, specifying the input.
-   *
-   * Outputs:
-   * * 0: The output tensor of same shape as input0.
-   */
   ANEURALNETWORKS_RELU = 19,
-  /** Computes rectified linear 1 activation on the input tensor element-wise.
-   *
-   * The output is calculated using this formula:
-   *
-   *     output = min(1.f, max(-1.f, input))
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: up to 4.
-   *
-   * Inputs:
-   * * 0: A tensor, specifying the input.
-   *
-   * Outputs:
-   * * 0: The output tensor of same shape as input0.
-   */
   ANEURALNETWORKS_RELU1 = 20,
-  /** Computes rectified linear 6 activation on the input tensor element-wise.
-   *
-   * The output is calculated using this formula:
-   *
-   *     output = min(6, max(0, input))
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: up to 4.
-   *
-   * Inputs:
-   * * 0: A tensor, specifying the input.
-   *
-   * Outputs:
-   * * 0: The output tensor of same shape as input0.
-   */
   ANEURALNETWORKS_RELU6 = 21,
-  /** Reshapes a tensor.
-   *
-   * Given tensor, this operation returns a tensor that has the same values as
-   * tensor, but with a newly specified shape.
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: up to 4.
-   *
-   * Inputs:
-   * * 0: A tensor, specifying the tensor to be reshaped.
-   * * 1: A 1-D tensor of type {@link ANEURALNETWORKS_TENSOR_INT32}, defining
-   * the shape of the output tensor. The number of elements implied by shape
-   * must be the same as the number of elements in the input tensor.
-   *
-   * Outputs:
-   * * 0: The output tensor, of shape specified by the input shape.
-   */
   ANEURALNETWORKS_RESHAPE = 22,
-  /** Resizes images to given size using the bilinear interpretation.
-   *
-   * Resized images will be distorted if their original aspect ratio is not the
-   * same as input.
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   *
-   * Supported tensor rank: 4, with "NHWC" data layout.
-   *
-   * Inputs:
-   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
-   * input.
-   * * 1: An INT32 value, specifying the output width of the output tensor.
-   * * 2: An INT32 value, specifying the output height of the output tensor.
-   *
-   * Outputs:
-   * * 0: The output 4-D tensor, of shape [batches, new_height, new_width,
-   * depth].
-   */
   ANEURALNETWORKS_RESIZE_BILINEAR = 23,
-
-  /**
-   * A basic recurrent neural network layer.
-   *
-   * This layer implements the operation:
-   * outputs = state = activation(inputs * input_weights + state *
-   * recurrent_weights + bias)
-   *
-   * Where:
-   * * “input_weights” is a weight matrix that multiplies the inputs;
-   * * “recurrent_weights” is a weight matrix that multiplies the current
-   *    “state” which itself is the output from the previous time step
-   *    computation;
-   * * “bias” is a bias vector (added to each output vector in the batch);
-   * * “activation” is the function passed as the “fused_activation_function”
-   *   argument (if not “NONE”).
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   *
-   * Inputs:
-   * * 0: input.
-   *      A 2-D tensor of type T, of shape [batch_size, input_size], where
-   *      “batch_size” corresponds to the batching dimension, and “input_size”
-   * is the size of the input.
-   * * 1: weights.
-   *      A 2-D tensor of type T, of shape [num_units, input_size], where
-   *      “num_units” corresponds to the number of units.
-   * * 2: recurrent_weights.
-   *      A 2-D tensor of type T, of shape [num_units, num_units], with columns
-   *      corresponding to the weights from each unit.
-   * * 3: bias.
-   *      A 1-D tensor of type T, of shape [num_units].
-   *
-   *    For FLOAT32 input tensor, bias must also be FLOAT32.
-   *    For UINT8 input tensor, bias must be INT32.
-   *
-   * Parameters
-   * * 4: fused_activation_function.
-   *      An (optional) ActivationFunctionType indicating the activation
-   *      function. If “NONE” is specified then it results in a linear
-   *      activation.
-   *
-   * * 5: Hidden state.
-   *      A 2-D tensor of type T, of shape [batch_size, num_units].
-   *
-   * Outputs:
-   * * 0: output.
-   *      A 2-D tensor of type T, of shape [batch_size, num_units]. This is
-   *      effectively the same as the current state value.
-   */
   ANEURALNETWORKS_RNN = 24,
-
-  /** Computes the softmax activation on the input tensor element-wise, per
-   * batch, by normalizing the input vector so the maximum coefficient is zero.
-   *
-   * The output is calculated using this formula:
-   *
-   *     output[batch, i] =
-   *         exp((input[batch, i] - max(input[batch, :])) * beta) /
-   *         sum_{k}{exp((input[batch, k] - max(input[batch, :])) * beta)}
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: 2 or 4.
-   *
-   * Inputs:
-   * * 0: A 2-D or 4-D tensor, specifying the tensor to be reshaped.
-   * * 1: A FLOAT32 value, specifying the scaling factor for the exponent, beta.
-   *
-   * Outputs:
-   * * 0: The output tensor of same shape as input0.
-   */
   ANEURALNETWORKS_SOFTMAX = 25,
-
-  /** Rearranges blocks of spatial data, into depth.
-   *
-   * More specifically, this op outputs a copy of the input tensor where values
-   * from the height and width dimensions are moved to the depth dimension. The
-   * value block_size indicates the input block size and how the data is moved.
-   *
-   * Chunks of data of size block_size * block_size from depth are rearranged
-   * into non-overlapping blocks of size block_size x block_size.
-   *
-   * The depth of the output tensor is input_depth * block_size * block_size.
-   * The input tensor's height and width must be divisible by block_size.
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
-   *
-   * Supported tensor rank: 4, with "NHWC" data layout.
-   *
-   * Inputs:
-   * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], specifying
-   * the input.
-   * * 1: An INT32 value, specifying the block_size. block_size must be >=1 and
-   *      block_size must be a divisor of both the input height and width.
-   *
-   * Outputs:
-   * * 0: The output 4-D tensor, of shape [batch, height/block_size,
-   * width/block_size, depth*block_size*block_size].
-   */
   ANEURALNETWORKS_SPACE_TO_DEPTH = 26,
-
-  /**
-   * SVDF op is a kind of stateful layer derived from the notion that a
-   * densely connected layer that's processing a sequence of input frames can
-   * be approximated by using a singular value decomposition of each of its
-   * nodes. The implementation is based on:
-   *
-   * https://research.google.com/pubs/archive/43813.pdf
-   *
-   * P. Nakkiran, R. Alvarez, R. Prabhavalkar, C. Parada.
-   * “Compressing Deep Neural Networks using a Rank-Constrained Topology”.
-   * INTERSPEECH, 2015.
-   *
-   * It processes the incoming input using a 2-stage filtering mechanism:
-   * * stage 1 performs filtering on the "features" dimension, whose outputs get
-   *   pushed into a memory of fixed-size memory_size.
-   * * stage 2 performs filtering on the "time" dimension of the memory_size
-   *   memoized outputs of stage 1.
-   *
-   * Specifically, for rank 1, this layer implements the operation:
-   *
-   *    memory = push(conv1d(inputs, weights_feature, feature_dim, "VALID"));
-   *    outputs = activation(memory * weights_time + bias);
-   *
-   * Where:
-   * * “weights_feature” is a weights matrix that processes the inputs (by
-   *   convolving the input with every “feature filter”), and whose outputs get
-   *   pushed, stacked in order, into the fixed-size “memory” (the oldest entry
-   *   gets dropped);
-   * * “weights_time” is a weights matrix that processes the “memory” (by a
-   *   batched matrix multiplication on the num_units);
-   * * “bias” is an optional bias vector (added to each output vector in the
-   *   batch); and
-   * * “activation” is the function passed as the “fused_activation_function”
-   *   argument (if not “NONE”).
-   *
-   * Each rank adds a dimension to the weights matrices by means of stacking
-   * the filters.
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   *
-   * Inputs:
-   * * 0: input.
-   *      A 2-D tensor of type T, of shape [batch_size, input_size], where
-   *      “batch_size” corresponds to the batching dimension, and “input_size”
-   * is the size of the input.
-   * * 1: weights_feature.
-   *      A 2-D tensor of type T, of shape [num_units, input_size], where
-   *      “num_units” corresponds to the number of units.
-   * * 2: weights_time.
-   *      A 2-D tensor of type T, of shape [num_units, memory_size], where
-   *      “memory_size” corresponds to the fixed-size of the memory.
-   * * 3: bias.
-   *      A optional 1-D tensor of type T, of shape [num_units].
-   *
-   *    For FLOAT32 input tensor, bias must also be FLOAT32.
-   *    For UINT8 input tensor, bias must be INT32.
-   *
-   * Parameters:
-   * * 4: rank.
-   *      The rank of the SVD approximation.
-   * * 5: fused_activation_function.
-   *      An (optional) ActivationFunctionType indicating the activation
-   * function. If “NONE” is specified then it results in a linear activation.
-   *
-   * Outputs:
-   * * 0: state.
-   *      A 2-D tensor of type T, of shape [batch_size, (memory_size - 1) *
-   * num_units * rank].
-   * * 1: output.
-   *      A 2-D tensor of type T, of shape [batch_size, num_units].
-   */
   ANEURALNETWORKS_SVDF = 27,
-
-  /** Computes hyperbolic tangent of input tensor element-wise.
-   *
-   * The output is calculated using this formula:
-   *
-   *     output = tanh(input)
-   *
-   * Supported tensor types:
-   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
-   *
-   * Supported tensor rank: up to 4.
-   *
-   * Inputs:
-   * * 0: A tensor, specifying the input.
-   *
-   * Outputs:
-   * * 0: The output tensor of same shape as input0.
-   */
   ANEURALNETWORKS_TANH = 28,
+  ANEURALNETWORKS_BATCH_TO_SPACE_ND = 29,
+  ANEURALNETWORKS_DIV = 30,
+  ANEURALNETWORKS_MEAN = 31,
+  ANEURALNETWORKS_PAD = 32,
+  ANEURALNETWORKS_SPACE_TO_BATCH_ND = 33,
+  ANEURALNETWORKS_SQUEEZE = 34,
+  ANEURALNETWORKS_STRIDED_SLICE = 35,
+  ANEURALNETWORKS_SUB = 36,
+  ANEURALNETWORKS_TRANSPOSE = 37,
 };
 
 /**
@@ -1080,13 +137,9 @@ enum {
  *
  */
 enum {
-  /** NO fused activation function. */
   ANEURALNETWORKS_FUSED_NONE = 0,
-  /** Fused ReLU activation function. */
   ANEURALNETWORKS_FUSED_RELU = 1,
-  /** Fused ReLU1 activation function. */
   ANEURALNETWORKS_FUSED_RELU1 = 2,
-  /** Fused ReLU6 activation function. */
   ANEURALNETWORKS_FUSED_RELU6 = 3,
 };
 
@@ -1094,20 +147,8 @@ enum {
  * Execution preferences.
  */
 enum {
-  /**
-   * Prefer executing in a way that minimizes battery drain.
-   * This is desirable for compilations that will be executed often.
-   */
   ANEURALNETWORKS_PREFER_LOW_POWER = 0,
-  /**
-   * Prefer returning a single answer as fast as possible, even if this causes
-   * more power consumption.
-   */
   ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER = 1,
-  /**
-   * Prefer maximizing the throughput of successive frames, for example when
-   * processing successive frames coming from the camera.
-   */
   ANEURALNETWORKS_PREFER_SUSTAINED_SPEED = 2,
 };
 
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 4049400d4c4024bea0809b7599ef8e74399a4117..21f1df0be320ba433d623641dc6abdf7d1f3ab69 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -24,6 +24,10 @@ limitations under the License.
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
 
+#ifdef __ANDROID__
+#include <sys/system_properties.h>
+#endif
+
 namespace tflite {
 
 // TODO(aselle): FATAL leaves resources hanging.
@@ -47,6 +51,32 @@ void FATAL(const char* format, ...) {
     FATAL("Aborting since tflite returned failure."); \
   }
 
+namespace {
+
+int32_t GetAndroidSdkVersion() {
+#ifdef __ANDROID__
+  const char* sdkProp = "ro.build.version.sdk";
+  char sdkVersion[PROP_VALUE_MAX];
+  int length = __system_property_get(sdkProp, sdkVersion);
+  if (length != 0) {
+    for (int i = 0; i < length; ++i) {
+      int digit = sdkVersion[i] - '0';
+      if (digit < 0 || digit > 9) {
+        // Non-numeric SDK version, assume it's higher then expected;
+        return 0xFFFF;
+      }
+    }
+    return atoi(sdkVersion);
+  }
+  FATAL("No %s prop", sdkProp);
+#endif  // __ANDROID__
+  return 0;
+}
+
+static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion();
+
+}  // namespace
+
 NNAPIAllocation::NNAPIAllocation(const char* filename,
                                  ErrorReporter* error_reporter)
     : MMAPAllocation(filename, error_reporter) {
@@ -62,6 +92,10 @@ NNAPIAllocation::~NNAPIAllocation() {
 }
 
 NNAPIDelegate::~NNAPIDelegate() {
+  if (nn_compiled_model_) {
+    ANeuralNetworksCompilation_free(nn_compiled_model_);
+    nn_compiled_model_ = nullptr;
+  }
   if (nn_model_) {
     ANeuralNetworksModel_free(nn_model_);
     nn_model_ = nullptr;
@@ -245,6 +279,11 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       add_scalar_float32(builtin->proj_clip);
     };
 
+    auto add_mean_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteMeanParams*>(data);
+      add_scalar_int32(builtin->keep_dims);
+    };
+
 #if 0
     auto add_reshape_params = [&](void* data) {
       auto builtin = reinterpret_cast<TfLiteReshapeParams*>(data);
@@ -262,8 +301,9 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       augmented_inputs.push_back(next_id++);
     };
 #endif
-
+    int nnapi_version = 10;
     ANeuralNetworksOperationType nn_op_type;
+
     switch (builtin) {
       case tflite::BuiltinOperator_ADD:
         nn_op_type = ANEURALNETWORKS_ADD;
@@ -337,6 +377,23 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
         nn_op_type = ANEURALNETWORKS_LSTM;
         break;
       }
+      case tflite::BuiltinOperator_PAD:
+        nnapi_version = 11;  // require NNAPI 1.1
+        nn_op_type = ANEURALNETWORKS_PAD;
+        break;
+      case tflite::BuiltinOperator_MEAN:
+        nnapi_version = 11;  // require NNAPI 1.1
+        add_mean_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_MEAN;
+        break;
+      case tflite::BuiltinOperator_DIV:
+        nnapi_version = 11;  // require NNAPI 1.1
+        nn_op_type = ANEURALNETWORKS_DIV;
+        break;
+      case tflite::BuiltinOperator_SUB:
+        nnapi_version = 11;  // require NNAPI 1.1
+        nn_op_type = ANEURALNETWORKS_SUB;
+        break;
       case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
       case tflite::BuiltinOperator_LSH_PROJECTION:
       case tflite::BuiltinOperator_SVDF:
@@ -350,7 +407,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
       case tflite::BuiltinOperator_L2_NORMALIZATION:
       case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
-      case tflite::BuiltinOperator_PAD:
+      case tflite::BuiltinOperator_PADV2:
       case tflite::BuiltinOperator_RESIZE_BILINEAR:
       case tflite::BuiltinOperator_CALL:
       case tflite::BuiltinOperator_SKIP_GRAM:
@@ -360,9 +417,6 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_BATCH_TO_SPACE_ND:
       case tflite::BuiltinOperator_TOPK_V2:
       case tflite::BuiltinOperator_TRANSPOSE:
-      case tflite::BuiltinOperator_MEAN:
-      case tflite::BuiltinOperator_DIV:
-      case tflite::BuiltinOperator_SUB:
       case tflite::BuiltinOperator_SPLIT:
       case tflite::BuiltinOperator_SQUEEZE:
       case tflite::BuiltinOperator_STRIDED_SLICE:
@@ -375,8 +429,15 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_MAXIMUM:
       case tflite::BuiltinOperator_MINIMUM:
       case tflite::BuiltinOperator_ARG_MAX:
+      case tflite::BuiltinOperator_GREATER:
+      case tflite::BuiltinOperator_GREATER_EQUAL:
       case tflite::BuiltinOperator_LESS:
+      case tflite::BuiltinOperator_LESS_EQUAL:
       case tflite::BuiltinOperator_NEG:
+      case tflite::BuiltinOperator_SELECT:
+      case tflite::BuiltinOperator_SLICE:
+      case tflite::BuiltinOperator_SIN:
+      case tflite::BuiltinOperator_TRANSPOSE_CONV:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
@@ -386,6 +447,10 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
         break;
     }
 
+    if (nnapi_version == 11 && kAndroidSdkVersion < 28) {
+      FATAL("Op %d needs NNAPI1.1", builtin);
+    }
+
     // Add the operation.
     CHECK_NN(ANeuralNetworksModel_addOperation(
         nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
diff --git a/tensorflow/contrib/lite/op_resolver.cc b/tensorflow/contrib/lite/op_resolver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f6e435e982443218b5e6328e48e1ce0d2393224c
--- /dev/null
+++ b/tensorflow/contrib/lite/op_resolver.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/op_resolver.h"
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+
+const TfLiteRegistration* MutableOpResolver::FindOp(tflite::BuiltinOperator op,
+                                                    int version) const {
+  auto it = builtins_.find(std::make_pair(op, version));
+  return it != builtins_.end() ? &it->second : nullptr;
+}
+
+const TfLiteRegistration* MutableOpResolver::FindOp(const char* op,
+                                                    int version) const {
+  auto it = custom_ops_.find(std::make_pair(op, version));
+  return it != custom_ops_.end() ? &it->second : nullptr;
+}
+
+void MutableOpResolver::AddBuiltin(tflite::BuiltinOperator op,
+                                   TfLiteRegistration* registration,
+                                   int min_version, int max_version) {
+  for (int version = min_version; version <= max_version; ++version) {
+    TfLiteRegistration new_registration = *registration;
+    new_registration.builtin_code = op;
+    new_registration.version = version;
+    auto op_key = std::make_pair(op, version);
+    builtins_[op_key] = new_registration;
+  }
+}
+
+void MutableOpResolver::AddCustom(const char* name,
+                                  TfLiteRegistration* registration,
+                                  int min_version, int max_version) {
+  for (int version = min_version; version <= max_version; ++version) {
+    TfLiteRegistration new_registration = *registration;
+    new_registration.builtin_code = BuiltinOperator_CUSTOM;
+    new_registration.version = version;
+    auto op_key = std::make_pair(name, version);
+    custom_ops_[op_key] = new_registration;
+  }
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/op_resolver.h b/tensorflow/contrib/lite/op_resolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..38a27069421586f28a5fbe4c7880a28f80548b98
--- /dev/null
+++ b/tensorflow/contrib/lite/op_resolver.h
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_OP_RESOLVER_H_
+#define TENSORFLOW_CONTRIB_LITE_OP_RESOLVER_H_
+
+#include <unordered_map>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+
+namespace tflite {
+
+// Abstract interface that returns TfLiteRegistrations given op codes or custom
+// op names. This is the mechanism that ops being referenced in the flatbuffer
+// model are mapped to executable function pointers (TfLiteRegistrations).
+class OpResolver {
+ public:
+  // Finds the op registration for a builtin operator by enum code.
+  virtual const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
+                                           int version) const = 0;
+  // Finds the op registration of a custom operator by op name.
+  virtual const TfLiteRegistration* FindOp(const char* op,
+                                           int version) const = 0;
+  virtual ~OpResolver() {}
+};
+
+// Some versions of gcc doesn't support partial specialization in class scope,
+// so these are defined in a namescope.
+namespace op_resolver_hasher {
+template <typename V>
+struct ValueHasher {
+  size_t operator()(const V& v) const { return std::hash<V>()(v); }
+};
+
+template <>
+struct ValueHasher<tflite::BuiltinOperator> {
+  size_t operator()(const tflite::BuiltinOperator& v) const {
+    return std::hash<int>()(static_cast<int>(v));
+  }
+};
+
+template <typename T>
+struct OperatorKeyHasher {
+  size_t operator()(const T& x) const {
+    size_t a = ValueHasher<typename T::first_type>()(x.first);
+    size_t b = ValueHasher<typename T::second_type>()(x.second);
+    // Hash combinator used by TensorFlow core.
+    return a ^ (b + 0x9e3779b97f4a7800ULL + (a << 10) + (a >> 4));
+  }
+};
+}  // namespace op_resolver_hasher
+
+// An OpResolver that is mutable, also used as the op in gen_op_registration.
+// A typical usage:
+//   MutableOpResolver resolver;
+//   resolver.AddBuiltin(BuiltinOperator_ADD, Register_ADD());
+//   resolver.AddCustom("CustomOp", Register_CUSTOM_OP());
+//   InterpreterBuilder(model, resolver)(&interpreter);
+class MutableOpResolver : public OpResolver {
+ public:
+  const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
+                                   int version) const override;
+  const TfLiteRegistration* FindOp(const char* op, int version) const override;
+  void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration,
+                  int min_version = 1, int max_version = 1);
+  void AddCustom(const char* name, TfLiteRegistration* registration,
+                 int min_version = 1, int max_version = 1);
+
+ private:
+  typedef std::pair<tflite::BuiltinOperator, int> BuiltinOperatorKey;
+  typedef std::pair<std::string, int> CustomOperatorKey;
+
+  std::unordered_map<BuiltinOperatorKey, TfLiteRegistration,
+                     op_resolver_hasher::OperatorKeyHasher<BuiltinOperatorKey> >
+      builtins_;
+  std::unordered_map<CustomOperatorKey, TfLiteRegistration,
+                     op_resolver_hasher::OperatorKeyHasher<CustomOperatorKey> >
+      custom_ops_;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/op_resolver_test.cc b/tensorflow/contrib/lite/op_resolver_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..10b7e319722faabd5c1c7442d1c820649affd1ca
--- /dev/null
+++ b/tensorflow/contrib/lite/op_resolver_test.cc
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/op_resolver.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace {
+
+// We need some dummy functions to identify the registrations.
+TfLiteStatus DummyInvoke(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteRegistration* GetDummyRegistration() {
+  static TfLiteRegistration registration = {
+      .init = nullptr,
+      .free = nullptr,
+      .prepare = nullptr,
+      .invoke = DummyInvoke,
+  };
+  return &registration;
+}
+
+TEST(MutableOpResolverTest, FinOp) {
+  MutableOpResolver resolver;
+  resolver.AddBuiltin(BuiltinOperator_ADD, GetDummyRegistration());
+
+  const TfLiteRegistration* found_registration =
+      resolver.FindOp(BuiltinOperator_ADD, 1);
+  ASSERT_NE(found_registration, nullptr);
+  EXPECT_TRUE(found_registration->invoke == DummyInvoke);
+  EXPECT_EQ(found_registration->builtin_code, BuiltinOperator_ADD);
+  EXPECT_EQ(found_registration->version, 1);
+}
+
+TEST(MutableOpResolverTest, FindMissingOp) {
+  MutableOpResolver resolver;
+  resolver.AddBuiltin(BuiltinOperator_ADD, GetDummyRegistration());
+
+  const TfLiteRegistration* found_registration =
+      resolver.FindOp(BuiltinOperator_CONV_2D, 1);
+  EXPECT_EQ(found_registration, nullptr);
+}
+
+TEST(MutableOpResolverTest, RegisterOpWithMultipleVersions) {
+  MutableOpResolver resolver;
+  // The kernel supports version 2 and 3
+  resolver.AddBuiltin(BuiltinOperator_ADD, GetDummyRegistration(), 2, 3);
+
+  const TfLiteRegistration* found_registration;
+
+  found_registration = resolver.FindOp(BuiltinOperator_ADD, 2);
+  ASSERT_NE(found_registration, nullptr);
+  EXPECT_TRUE(found_registration->invoke == DummyInvoke);
+  EXPECT_EQ(found_registration->version, 2);
+
+  found_registration = resolver.FindOp(BuiltinOperator_ADD, 3);
+  ASSERT_NE(found_registration, nullptr);
+  EXPECT_TRUE(found_registration->invoke == DummyInvoke);
+  EXPECT_EQ(found_registration->version, 3);
+}
+
+TEST(MutableOpResolverTest, FindOpWithUnsupportedVersions) {
+  MutableOpResolver resolver;
+  // The kernel supports version 2 and 3
+  resolver.AddBuiltin(BuiltinOperator_ADD, GetDummyRegistration(), 2, 3);
+
+  const TfLiteRegistration* found_registration;
+
+  found_registration = resolver.FindOp(BuiltinOperator_ADD, 1);
+  EXPECT_EQ(found_registration, nullptr);
+
+  found_registration = resolver.FindOp(BuiltinOperator_ADD, 4);
+  EXPECT_EQ(found_registration, nullptr);
+}
+
+TEST(MutableOpResolverTest, FindCustomOp) {
+  MutableOpResolver resolver;
+  resolver.AddCustom("AWESOME", GetDummyRegistration());
+
+  const TfLiteRegistration* found_registration = resolver.FindOp("AWESOME", 1);
+  ASSERT_NE(found_registration, nullptr);
+  EXPECT_EQ(found_registration->builtin_code, BuiltinOperator_CUSTOM);
+  EXPECT_TRUE(found_registration->invoke == DummyInvoke);
+  EXPECT_EQ(found_registration->version, 1);
+  // TODO(ycling): The `custom_name` in TfLiteRegistration isn't properly
+  // filled yet. Fix this and add tests.
+}
+
+TEST(MutableOpResolverTest, FindMissingCustomOp) {
+  MutableOpResolver resolver;
+  resolver.AddCustom("AWESOME", GetDummyRegistration());
+
+  const TfLiteRegistration* found_registration =
+      resolver.FindOp("EXCELLENT", 1);
+  EXPECT_EQ(found_registration, nullptr);
+}
+
+TEST(MutableOpResolverTest, FindCustomOpWithUnsupportedVersion) {
+  MutableOpResolver resolver;
+  resolver.AddCustom("AWESOME", GetDummyRegistration());
+
+  const TfLiteRegistration* found_registration = resolver.FindOp("AWESOME", 2);
+  EXPECT_EQ(found_registration, nullptr);
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index c4200c879ba0e17b3bd183f4004eb75ebdd2f5ee..c0926d2f33c0bbc5111e6df90dbd759172021f95 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -118,7 +118,8 @@ def toco_convert(input_data,
                  input_format=lite_constants.TENSORFLOW_GRAPHDEF,
                  output_format=lite_constants.TFLITE,
                  quantized_input_stats=None,
-                 drop_control_dependency=True):
+                 drop_control_dependency=True,
+                 allow_custom_ops=False):
   """Convert a model using TOCO from `input_format` to `output_format`.
 
   Typically this is to convert from TensorFlow GraphDef to TFLite, in which
@@ -154,6 +155,7 @@ def toco_convert(input_data,
   toco.drop_control_dependency = drop_control_dependency
   model = _model_flags_pb2.ModelFlags()
   toco.inference_type = inference_type
+  toco.allow_custom_ops = allow_custom_ops
   for idx, input_tensor in enumerate(input_tensors):
     if input_tensor.dtype == _dtypes.float32:
       tflite_input_type = lite_constants.FLOAT
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 5d89f7be6263936e09c8034df0155745373737b3..8bdeb035f5a778fa3b0d85da36d6b8d6721445ea 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -137,6 +137,14 @@ enum BuiltinOperator : byte {
   MINIMUM = 57,
   LESS = 58,
   NEG = 59,
+  PADV2 = 60,
+  GREATER = 61,
+  GREATER_EQUAL = 62,
+  LESS_EQUAL = 63,
+  SELECT = 64,
+  SLICE = 65,
+  SIN = 66,
+  TRANSPOSE_CONV = 67,
 }
 
 // Options for the builtin operators.
@@ -183,6 +191,13 @@ union BuiltinOptions {
   ArgMaxOptions,
   LessOptions,
   NegOptions,
+  PadV2Options,
+  GreaterOptions,
+  GreaterEqualOptions,
+  LessEqualOptions,
+  SelectOptions,
+  SliceOptions,
+  TransposeConvOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -316,6 +331,9 @@ table CallOptions {
 table PadOptions {
 }
 
+table PadV2Options {
+}
+
 table ReshapeOptions {
   new_shape:[int];
 }
@@ -405,17 +423,42 @@ table ArgMaxOptions {
   output_type : TensorType;
 }
 
+table GreaterOptions {
+}
+
+table GreaterEqualOptions {
+}
+
 table LessOptions {
 }
 
+table LessEqualOptions {
+}
+
 table NegOptions {
 }
 
+table SelectOptions {
+}
+
+table SliceOptions {
+}
+
+table TransposeConvOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
   builtin_code:BuiltinOperator;
   custom_code:string;
+
+  // The version of the operator. The version need to be bumped whenever new
+  // parameters are introduced into an op.
+  version:int = 1;
 }
 
 enum CustomOptionsFormat : byte {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index c172f77aa99327843e67d421d19518b2d64a1d53..35c34f53a6bf9716941f623b43f238c681252747 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -88,6 +88,9 @@ struct CallOptionsT;
 struct PadOptions;
 struct PadOptionsT;
 
+struct PadV2Options;
+struct PadV2OptionsT;
+
 struct ReshapeOptions;
 struct ReshapeOptionsT;
 
@@ -151,12 +154,30 @@ struct MaximumMinimumOptionsT;
 struct ArgMaxOptions;
 struct ArgMaxOptionsT;
 
+struct GreaterOptions;
+struct GreaterOptionsT;
+
+struct GreaterEqualOptions;
+struct GreaterEqualOptionsT;
+
 struct LessOptions;
 struct LessOptionsT;
 
+struct LessEqualOptions;
+struct LessEqualOptionsT;
+
 struct NegOptions;
 struct NegOptionsT;
 
+struct SelectOptions;
+struct SelectOptionsT;
+
+struct SliceOptions;
+struct SliceOptionsT;
+
+struct TransposeConvOptions;
+struct TransposeConvOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -276,11 +297,19 @@ enum BuiltinOperator {
   BuiltinOperator_MINIMUM = 57,
   BuiltinOperator_LESS = 58,
   BuiltinOperator_NEG = 59,
+  BuiltinOperator_PADV2 = 60,
+  BuiltinOperator_GREATER = 61,
+  BuiltinOperator_GREATER_EQUAL = 62,
+  BuiltinOperator_LESS_EQUAL = 63,
+  BuiltinOperator_SELECT = 64,
+  BuiltinOperator_SLICE = 65,
+  BuiltinOperator_SIN = 66,
+  BuiltinOperator_TRANSPOSE_CONV = 67,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_NEG
+  BuiltinOperator_MAX = BuiltinOperator_TRANSPOSE_CONV
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[59] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[67] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -340,7 +369,15 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[59] {
     BuiltinOperator_ARG_MAX,
     BuiltinOperator_MINIMUM,
     BuiltinOperator_LESS,
-    BuiltinOperator_NEG
+    BuiltinOperator_NEG,
+    BuiltinOperator_PADV2,
+    BuiltinOperator_GREATER,
+    BuiltinOperator_GREATER_EQUAL,
+    BuiltinOperator_LESS_EQUAL,
+    BuiltinOperator_SELECT,
+    BuiltinOperator_SLICE,
+    BuiltinOperator_SIN,
+    BuiltinOperator_TRANSPOSE_CONV
   };
   return values;
 }
@@ -407,6 +444,14 @@ inline const char **EnumNamesBuiltinOperator() {
     "MINIMUM",
     "LESS",
     "NEG",
+    "PADV2",
+    "GREATER",
+    "GREATER_EQUAL",
+    "LESS_EQUAL",
+    "SELECT",
+    "SLICE",
+    "SIN",
+    "TRANSPOSE_CONV",
     nullptr
   };
   return names;
@@ -461,11 +506,18 @@ enum BuiltinOptions {
   BuiltinOptions_ArgMaxOptions = 40,
   BuiltinOptions_LessOptions = 41,
   BuiltinOptions_NegOptions = 42,
+  BuiltinOptions_PadV2Options = 43,
+  BuiltinOptions_GreaterOptions = 44,
+  BuiltinOptions_GreaterEqualOptions = 45,
+  BuiltinOptions_LessEqualOptions = 46,
+  BuiltinOptions_SelectOptions = 47,
+  BuiltinOptions_SliceOptions = 48,
+  BuiltinOptions_TransposeConvOptions = 49,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_NegOptions
+  BuiltinOptions_MAX = BuiltinOptions_TransposeConvOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[43] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[50] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -509,7 +561,14 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[43] {
     BuiltinOptions_MaximumMinimumOptions,
     BuiltinOptions_ArgMaxOptions,
     BuiltinOptions_LessOptions,
-    BuiltinOptions_NegOptions
+    BuiltinOptions_NegOptions,
+    BuiltinOptions_PadV2Options,
+    BuiltinOptions_GreaterOptions,
+    BuiltinOptions_GreaterEqualOptions,
+    BuiltinOptions_LessEqualOptions,
+    BuiltinOptions_SelectOptions,
+    BuiltinOptions_SliceOptions,
+    BuiltinOptions_TransposeConvOptions
   };
   return values;
 }
@@ -559,6 +618,13 @@ inline const char **EnumNamesBuiltinOptions() {
     "ArgMaxOptions",
     "LessOptions",
     "NegOptions",
+    "PadV2Options",
+    "GreaterOptions",
+    "GreaterEqualOptions",
+    "LessEqualOptions",
+    "SelectOptions",
+    "SliceOptions",
+    "TransposeConvOptions",
     nullptr
   };
   return names;
@@ -741,6 +807,34 @@ template<> struct BuiltinOptionsTraits<NegOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_NegOptions;
 };
 
+template<> struct BuiltinOptionsTraits<PadV2Options> {
+  static const BuiltinOptions enum_value = BuiltinOptions_PadV2Options;
+};
+
+template<> struct BuiltinOptionsTraits<GreaterOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_GreaterOptions;
+};
+
+template<> struct BuiltinOptionsTraits<GreaterEqualOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_GreaterEqualOptions;
+};
+
+template<> struct BuiltinOptionsTraits<LessEqualOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_LessEqualOptions;
+};
+
+template<> struct BuiltinOptionsTraits<SelectOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SelectOptions;
+};
+
+template<> struct BuiltinOptionsTraits<SliceOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SliceOptions;
+};
+
+template<> struct BuiltinOptionsTraits<TransposeConvOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_TransposeConvOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1108,6 +1202,62 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_NegOptions ?
       reinterpret_cast<const NegOptionsT *>(value) : nullptr;
   }
+  PadV2OptionsT *AsPadV2Options() {
+    return type == BuiltinOptions_PadV2Options ?
+      reinterpret_cast<PadV2OptionsT *>(value) : nullptr;
+  }
+  const PadV2OptionsT *AsPadV2Options() const {
+    return type == BuiltinOptions_PadV2Options ?
+      reinterpret_cast<const PadV2OptionsT *>(value) : nullptr;
+  }
+  GreaterOptionsT *AsGreaterOptions() {
+    return type == BuiltinOptions_GreaterOptions ?
+      reinterpret_cast<GreaterOptionsT *>(value) : nullptr;
+  }
+  const GreaterOptionsT *AsGreaterOptions() const {
+    return type == BuiltinOptions_GreaterOptions ?
+      reinterpret_cast<const GreaterOptionsT *>(value) : nullptr;
+  }
+  GreaterEqualOptionsT *AsGreaterEqualOptions() {
+    return type == BuiltinOptions_GreaterEqualOptions ?
+      reinterpret_cast<GreaterEqualOptionsT *>(value) : nullptr;
+  }
+  const GreaterEqualOptionsT *AsGreaterEqualOptions() const {
+    return type == BuiltinOptions_GreaterEqualOptions ?
+      reinterpret_cast<const GreaterEqualOptionsT *>(value) : nullptr;
+  }
+  LessEqualOptionsT *AsLessEqualOptions() {
+    return type == BuiltinOptions_LessEqualOptions ?
+      reinterpret_cast<LessEqualOptionsT *>(value) : nullptr;
+  }
+  const LessEqualOptionsT *AsLessEqualOptions() const {
+    return type == BuiltinOptions_LessEqualOptions ?
+      reinterpret_cast<const LessEqualOptionsT *>(value) : nullptr;
+  }
+  SelectOptionsT *AsSelectOptions() {
+    return type == BuiltinOptions_SelectOptions ?
+      reinterpret_cast<SelectOptionsT *>(value) : nullptr;
+  }
+  const SelectOptionsT *AsSelectOptions() const {
+    return type == BuiltinOptions_SelectOptions ?
+      reinterpret_cast<const SelectOptionsT *>(value) : nullptr;
+  }
+  SliceOptionsT *AsSliceOptions() {
+    return type == BuiltinOptions_SliceOptions ?
+      reinterpret_cast<SliceOptionsT *>(value) : nullptr;
+  }
+  const SliceOptionsT *AsSliceOptions() const {
+    return type == BuiltinOptions_SliceOptions ?
+      reinterpret_cast<const SliceOptionsT *>(value) : nullptr;
+  }
+  TransposeConvOptionsT *AsTransposeConvOptions() {
+    return type == BuiltinOptions_TransposeConvOptions ?
+      reinterpret_cast<TransposeConvOptionsT *>(value) : nullptr;
+  }
+  const TransposeConvOptionsT *AsTransposeConvOptions() const {
+    return type == BuiltinOptions_TransposeConvOptions ?
+      reinterpret_cast<const TransposeConvOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -2873,6 +3023,46 @@ inline flatbuffers::Offset<PadOptions> CreatePadOptions(
 
 flatbuffers::Offset<PadOptions> CreatePadOptions(flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct PadV2OptionsT : public flatbuffers::NativeTable {
+  typedef PadV2Options TableType;
+  PadV2OptionsT() {
+  }
+};
+
+struct PadV2Options FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef PadV2OptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  PadV2OptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(PadV2OptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<PadV2Options> Pack(flatbuffers::FlatBufferBuilder &_fbb, const PadV2OptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct PadV2OptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit PadV2OptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  PadV2OptionsBuilder &operator=(const PadV2OptionsBuilder &);
+  flatbuffers::Offset<PadV2Options> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<PadV2Options>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<PadV2Options> CreatePadV2Options(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  PadV2OptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<PadV2Options> CreatePadV2Options(flatbuffers::FlatBufferBuilder &_fbb, const PadV2OptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct ReshapeOptionsT : public flatbuffers::NativeTable {
   typedef ReshapeOptions TableType;
   std::vector<int32_t> new_shape;
@@ -3995,6 +4185,86 @@ inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(
 
 flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct GreaterOptionsT : public flatbuffers::NativeTable {
+  typedef GreaterOptions TableType;
+  GreaterOptionsT() {
+  }
+};
+
+struct GreaterOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef GreaterOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  GreaterOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(GreaterOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<GreaterOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const GreaterOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct GreaterOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit GreaterOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  GreaterOptionsBuilder &operator=(const GreaterOptionsBuilder &);
+  flatbuffers::Offset<GreaterOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<GreaterOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<GreaterOptions> CreateGreaterOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  GreaterOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<GreaterOptions> CreateGreaterOptions(flatbuffers::FlatBufferBuilder &_fbb, const GreaterOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct GreaterEqualOptionsT : public flatbuffers::NativeTable {
+  typedef GreaterEqualOptions TableType;
+  GreaterEqualOptionsT() {
+  }
+};
+
+struct GreaterEqualOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef GreaterEqualOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  GreaterEqualOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(GreaterEqualOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<GreaterEqualOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const GreaterEqualOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct GreaterEqualOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit GreaterEqualOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  GreaterEqualOptionsBuilder &operator=(const GreaterEqualOptionsBuilder &);
+  flatbuffers::Offset<GreaterEqualOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<GreaterEqualOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<GreaterEqualOptions> CreateGreaterEqualOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  GreaterEqualOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<GreaterEqualOptions> CreateGreaterEqualOptions(flatbuffers::FlatBufferBuilder &_fbb, const GreaterEqualOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct LessOptionsT : public flatbuffers::NativeTable {
   typedef LessOptions TableType;
   LessOptionsT() {
@@ -4035,6 +4305,46 @@ inline flatbuffers::Offset<LessOptions> CreateLessOptions(
 
 flatbuffers::Offset<LessOptions> CreateLessOptions(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct LessEqualOptionsT : public flatbuffers::NativeTable {
+  typedef LessEqualOptions TableType;
+  LessEqualOptionsT() {
+  }
+};
+
+struct LessEqualOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef LessEqualOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  LessEqualOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(LessEqualOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LessEqualOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LessEqualOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LessEqualOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit LessEqualOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LessEqualOptionsBuilder &operator=(const LessEqualOptionsBuilder &);
+  flatbuffers::Offset<LessEqualOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LessEqualOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LessEqualOptions> CreateLessEqualOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  LessEqualOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LessEqualOptions> CreateLessEqualOptions(flatbuffers::FlatBufferBuilder &_fbb, const LessEqualOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct NegOptionsT : public flatbuffers::NativeTable {
   typedef NegOptions TableType;
   NegOptionsT() {
@@ -4075,12 +4385,172 @@ inline flatbuffers::Offset<NegOptions> CreateNegOptions(
 
 flatbuffers::Offset<NegOptions> CreateNegOptions(flatbuffers::FlatBufferBuilder &_fbb, const NegOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct SelectOptionsT : public flatbuffers::NativeTable {
+  typedef SelectOptions TableType;
+  SelectOptionsT() {
+  }
+};
+
+struct SelectOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SelectOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  SelectOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(SelectOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SelectOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SelectOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SelectOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit SelectOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SelectOptionsBuilder &operator=(const SelectOptionsBuilder &);
+  flatbuffers::Offset<SelectOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SelectOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SelectOptions> CreateSelectOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  SelectOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SelectOptions> CreateSelectOptions(flatbuffers::FlatBufferBuilder &_fbb, const SelectOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SliceOptionsT : public flatbuffers::NativeTable {
+  typedef SliceOptions TableType;
+  SliceOptionsT() {
+  }
+};
+
+struct SliceOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SliceOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  SliceOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(SliceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SliceOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SliceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SliceOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit SliceOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SliceOptionsBuilder &operator=(const SliceOptionsBuilder &);
+  flatbuffers::Offset<SliceOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SliceOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SliceOptions> CreateSliceOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  SliceOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SliceOptions> CreateSliceOptions(flatbuffers::FlatBufferBuilder &_fbb, const SliceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct TransposeConvOptionsT : public flatbuffers::NativeTable {
+  typedef TransposeConvOptions TableType;
+  Padding padding;
+  int32_t stride_w;
+  int32_t stride_h;
+  TransposeConvOptionsT()
+      : padding(Padding_SAME),
+        stride_w(0),
+        stride_h(0) {
+  }
+};
+
+struct TransposeConvOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef TransposeConvOptionsT NativeTableType;
+  enum {
+    VT_PADDING = 4,
+    VT_STRIDE_W = 6,
+    VT_STRIDE_H = 8
+  };
+  Padding padding() const {
+    return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
+  }
+  int32_t stride_w() const {
+    return GetField<int32_t>(VT_STRIDE_W, 0);
+  }
+  int32_t stride_h() const {
+    return GetField<int32_t>(VT_STRIDE_H, 0);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_PADDING) &&
+           VerifyField<int32_t>(verifier, VT_STRIDE_W) &&
+           VerifyField<int32_t>(verifier, VT_STRIDE_H) &&
+           verifier.EndTable();
+  }
+  TransposeConvOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(TransposeConvOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<TransposeConvOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const TransposeConvOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct TransposeConvOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_padding(Padding padding) {
+    fbb_.AddElement<int8_t>(TransposeConvOptions::VT_PADDING, static_cast<int8_t>(padding), 0);
+  }
+  void add_stride_w(int32_t stride_w) {
+    fbb_.AddElement<int32_t>(TransposeConvOptions::VT_STRIDE_W, stride_w, 0);
+  }
+  void add_stride_h(int32_t stride_h) {
+    fbb_.AddElement<int32_t>(TransposeConvOptions::VT_STRIDE_H, stride_h, 0);
+  }
+  explicit TransposeConvOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  TransposeConvOptionsBuilder &operator=(const TransposeConvOptionsBuilder &);
+  flatbuffers::Offset<TransposeConvOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<TransposeConvOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<TransposeConvOptions> CreateTransposeConvOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    Padding padding = Padding_SAME,
+    int32_t stride_w = 0,
+    int32_t stride_h = 0) {
+  TransposeConvOptionsBuilder builder_(_fbb);
+  builder_.add_stride_h(stride_h);
+  builder_.add_stride_w(stride_w);
+  builder_.add_padding(padding);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<TransposeConvOptions> CreateTransposeConvOptions(flatbuffers::FlatBufferBuilder &_fbb, const TransposeConvOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
   std::string custom_code;
+  int32_t version;
   OperatorCodeT()
-      : builtin_code(BuiltinOperator_ADD) {
+      : builtin_code(BuiltinOperator_ADD),
+        version(1) {
   }
 };
 
@@ -4088,7 +4558,8 @@ struct OperatorCode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef OperatorCodeT NativeTableType;
   enum {
     VT_BUILTIN_CODE = 4,
-    VT_CUSTOM_CODE = 6
+    VT_CUSTOM_CODE = 6,
+    VT_VERSION = 8
   };
   BuiltinOperator builtin_code() const {
     return static_cast<BuiltinOperator>(GetField<int8_t>(VT_BUILTIN_CODE, 0));
@@ -4096,11 +4567,15 @@ struct OperatorCode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const flatbuffers::String *custom_code() const {
     return GetPointer<const flatbuffers::String *>(VT_CUSTOM_CODE);
   }
+  int32_t version() const {
+    return GetField<int32_t>(VT_VERSION, 1);
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_BUILTIN_CODE) &&
            VerifyOffset(verifier, VT_CUSTOM_CODE) &&
            verifier.Verify(custom_code()) &&
+           VerifyField<int32_t>(verifier, VT_VERSION) &&
            verifier.EndTable();
   }
   OperatorCodeT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -4117,6 +4592,9 @@ struct OperatorCodeBuilder {
   void add_custom_code(flatbuffers::Offset<flatbuffers::String> custom_code) {
     fbb_.AddOffset(OperatorCode::VT_CUSTOM_CODE, custom_code);
   }
+  void add_version(int32_t version) {
+    fbb_.AddElement<int32_t>(OperatorCode::VT_VERSION, version, 1);
+  }
   explicit OperatorCodeBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -4132,8 +4610,10 @@ struct OperatorCodeBuilder {
 inline flatbuffers::Offset<OperatorCode> CreateOperatorCode(
     flatbuffers::FlatBufferBuilder &_fbb,
     BuiltinOperator builtin_code = BuiltinOperator_ADD,
-    flatbuffers::Offset<flatbuffers::String> custom_code = 0) {
+    flatbuffers::Offset<flatbuffers::String> custom_code = 0,
+    int32_t version = 1) {
   OperatorCodeBuilder builder_(_fbb);
+  builder_.add_version(version);
   builder_.add_custom_code(custom_code);
   builder_.add_builtin_code(builtin_code);
   return builder_.Finish();
@@ -4142,11 +4622,13 @@ inline flatbuffers::Offset<OperatorCode> CreateOperatorCode(
 inline flatbuffers::Offset<OperatorCode> CreateOperatorCodeDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     BuiltinOperator builtin_code = BuiltinOperator_ADD,
-    const char *custom_code = nullptr) {
+    const char *custom_code = nullptr,
+    int32_t version = 1) {
   return tflite::CreateOperatorCode(
       _fbb,
       builtin_code,
-      custom_code ? _fbb.CreateString(custom_code) : 0);
+      custom_code ? _fbb.CreateString(custom_code) : 0,
+      version);
 }
 
 flatbuffers::Offset<OperatorCode> CreateOperatorCode(flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -4318,6 +4800,27 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const NegOptions *builtin_options_as_NegOptions() const {
     return builtin_options_type() == BuiltinOptions_NegOptions ? static_cast<const NegOptions *>(builtin_options()) : nullptr;
   }
+  const PadV2Options *builtin_options_as_PadV2Options() const {
+    return builtin_options_type() == BuiltinOptions_PadV2Options ? static_cast<const PadV2Options *>(builtin_options()) : nullptr;
+  }
+  const GreaterOptions *builtin_options_as_GreaterOptions() const {
+    return builtin_options_type() == BuiltinOptions_GreaterOptions ? static_cast<const GreaterOptions *>(builtin_options()) : nullptr;
+  }
+  const GreaterEqualOptions *builtin_options_as_GreaterEqualOptions() const {
+    return builtin_options_type() == BuiltinOptions_GreaterEqualOptions ? static_cast<const GreaterEqualOptions *>(builtin_options()) : nullptr;
+  }
+  const LessEqualOptions *builtin_options_as_LessEqualOptions() const {
+    return builtin_options_type() == BuiltinOptions_LessEqualOptions ? static_cast<const LessEqualOptions *>(builtin_options()) : nullptr;
+  }
+  const SelectOptions *builtin_options_as_SelectOptions() const {
+    return builtin_options_type() == BuiltinOptions_SelectOptions ? static_cast<const SelectOptions *>(builtin_options()) : nullptr;
+  }
+  const SliceOptions *builtin_options_as_SliceOptions() const {
+    return builtin_options_type() == BuiltinOptions_SliceOptions ? static_cast<const SliceOptions *>(builtin_options()) : nullptr;
+  }
+  const TransposeConvOptions *builtin_options_as_TransposeConvOptions() const {
+    return builtin_options_type() == BuiltinOptions_TransposeConvOptions ? static_cast<const TransposeConvOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -4512,6 +5015,34 @@ template<> inline const NegOptions *Operator::builtin_options_as<NegOptions>() c
   return builtin_options_as_NegOptions();
 }
 
+template<> inline const PadV2Options *Operator::builtin_options_as<PadV2Options>() const {
+  return builtin_options_as_PadV2Options();
+}
+
+template<> inline const GreaterOptions *Operator::builtin_options_as<GreaterOptions>() const {
+  return builtin_options_as_GreaterOptions();
+}
+
+template<> inline const GreaterEqualOptions *Operator::builtin_options_as<GreaterEqualOptions>() const {
+  return builtin_options_as_GreaterEqualOptions();
+}
+
+template<> inline const LessEqualOptions *Operator::builtin_options_as<LessEqualOptions>() const {
+  return builtin_options_as_LessEqualOptions();
+}
+
+template<> inline const SelectOptions *Operator::builtin_options_as<SelectOptions>() const {
+  return builtin_options_as_SelectOptions();
+}
+
+template<> inline const SliceOptions *Operator::builtin_options_as<SliceOptions>() const {
+  return builtin_options_as_SliceOptions();
+}
+
+template<> inline const TransposeConvOptions *Operator::builtin_options_as<TransposeConvOptions>() const {
+  return builtin_options_as_TransposeConvOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -5572,6 +6103,29 @@ inline flatbuffers::Offset<PadOptions> CreatePadOptions(flatbuffers::FlatBufferB
       _fbb);
 }
 
+inline PadV2OptionsT *PadV2Options::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new PadV2OptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void PadV2Options::UnPackTo(PadV2OptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<PadV2Options> PadV2Options::Pack(flatbuffers::FlatBufferBuilder &_fbb, const PadV2OptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreatePadV2Options(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<PadV2Options> CreatePadV2Options(flatbuffers::FlatBufferBuilder &_fbb, const PadV2OptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const PadV2OptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreatePadV2Options(
+      _fbb);
+}
+
 inline ReshapeOptionsT *ReshapeOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ReshapeOptionsT();
   UnPackTo(_o, _resolver);
@@ -6115,6 +6669,52 @@ inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatB
       _output_type);
 }
 
+inline GreaterOptionsT *GreaterOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new GreaterOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void GreaterOptions::UnPackTo(GreaterOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<GreaterOptions> GreaterOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const GreaterOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateGreaterOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<GreaterOptions> CreateGreaterOptions(flatbuffers::FlatBufferBuilder &_fbb, const GreaterOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const GreaterOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateGreaterOptions(
+      _fbb);
+}
+
+inline GreaterEqualOptionsT *GreaterEqualOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new GreaterEqualOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void GreaterEqualOptions::UnPackTo(GreaterEqualOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<GreaterEqualOptions> GreaterEqualOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const GreaterEqualOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateGreaterEqualOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<GreaterEqualOptions> CreateGreaterEqualOptions(flatbuffers::FlatBufferBuilder &_fbb, const GreaterEqualOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const GreaterEqualOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateGreaterEqualOptions(
+      _fbb);
+}
+
 inline LessOptionsT *LessOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new LessOptionsT();
   UnPackTo(_o, _resolver);
@@ -6138,6 +6738,29 @@ inline flatbuffers::Offset<LessOptions> CreateLessOptions(flatbuffers::FlatBuffe
       _fbb);
 }
 
+inline LessEqualOptionsT *LessEqualOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LessEqualOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LessEqualOptions::UnPackTo(LessEqualOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<LessEqualOptions> LessEqualOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LessEqualOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLessEqualOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LessEqualOptions> CreateLessEqualOptions(flatbuffers::FlatBufferBuilder &_fbb, const LessEqualOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LessEqualOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateLessEqualOptions(
+      _fbb);
+}
+
 inline NegOptionsT *NegOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new NegOptionsT();
   UnPackTo(_o, _resolver);
@@ -6161,6 +6784,84 @@ inline flatbuffers::Offset<NegOptions> CreateNegOptions(flatbuffers::FlatBufferB
       _fbb);
 }
 
+inline SelectOptionsT *SelectOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SelectOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SelectOptions::UnPackTo(SelectOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<SelectOptions> SelectOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SelectOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSelectOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SelectOptions> CreateSelectOptions(flatbuffers::FlatBufferBuilder &_fbb, const SelectOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SelectOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateSelectOptions(
+      _fbb);
+}
+
+inline SliceOptionsT *SliceOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SliceOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SliceOptions::UnPackTo(SliceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<SliceOptions> SliceOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SliceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSliceOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SliceOptions> CreateSliceOptions(flatbuffers::FlatBufferBuilder &_fbb, const SliceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SliceOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateSliceOptions(
+      _fbb);
+}
+
+inline TransposeConvOptionsT *TransposeConvOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new TransposeConvOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void TransposeConvOptions::UnPackTo(TransposeConvOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = padding(); _o->padding = _e; };
+  { auto _e = stride_w(); _o->stride_w = _e; };
+  { auto _e = stride_h(); _o->stride_h = _e; };
+}
+
+inline flatbuffers::Offset<TransposeConvOptions> TransposeConvOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TransposeConvOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateTransposeConvOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<TransposeConvOptions> CreateTransposeConvOptions(flatbuffers::FlatBufferBuilder &_fbb, const TransposeConvOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TransposeConvOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _padding = _o->padding;
+  auto _stride_w = _o->stride_w;
+  auto _stride_h = _o->stride_h;
+  return tflite::CreateTransposeConvOptions(
+      _fbb,
+      _padding,
+      _stride_w,
+      _stride_h);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -6172,6 +6873,7 @@ inline void OperatorCode::UnPackTo(OperatorCodeT *_o, const flatbuffers::resolve
   (void)_resolver;
   { auto _e = builtin_code(); _o->builtin_code = _e; };
   { auto _e = custom_code(); if (_e) _o->custom_code = _e->str(); };
+  { auto _e = version(); _o->version = _e; };
 }
 
 inline flatbuffers::Offset<OperatorCode> OperatorCode::Pack(flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -6184,10 +6886,12 @@ inline flatbuffers::Offset<OperatorCode> CreateOperatorCode(flatbuffers::FlatBuf
   struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const OperatorCodeT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
   auto _builtin_code = _o->builtin_code;
   auto _custom_code = _o->custom_code.empty() ? 0 : _fbb.CreateString(_o->custom_code);
+  auto _version = _o->version;
   return tflite::CreateOperatorCode(
       _fbb,
       _builtin_code,
-      _custom_code);
+      _custom_code,
+      _version);
 }
 
 inline OperatorT *Operator::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -6512,6 +7216,34 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const NegOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_PadV2Options: {
+      auto ptr = reinterpret_cast<const PadV2Options *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_GreaterOptions: {
+      auto ptr = reinterpret_cast<const GreaterOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_GreaterEqualOptions: {
+      auto ptr = reinterpret_cast<const GreaterEqualOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_LessEqualOptions: {
+      auto ptr = reinterpret_cast<const LessEqualOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SelectOptions: {
+      auto ptr = reinterpret_cast<const SelectOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SliceOptions: {
+      auto ptr = reinterpret_cast<const SliceOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_TransposeConvOptions: {
+      auto ptr = reinterpret_cast<const TransposeConvOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -6698,6 +7430,34 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const NegOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_PadV2Options: {
+      auto ptr = reinterpret_cast<const PadV2Options *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_GreaterOptions: {
+      auto ptr = reinterpret_cast<const GreaterOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_GreaterEqualOptions: {
+      auto ptr = reinterpret_cast<const GreaterEqualOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_LessEqualOptions: {
+      auto ptr = reinterpret_cast<const LessEqualOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SelectOptions: {
+      auto ptr = reinterpret_cast<const SelectOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SliceOptions: {
+      auto ptr = reinterpret_cast<const SliceOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_TransposeConvOptions: {
+      auto ptr = reinterpret_cast<const TransposeConvOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -6872,6 +7632,34 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const NegOptionsT *>(value);
       return CreateNegOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_PadV2Options: {
+      auto ptr = reinterpret_cast<const PadV2OptionsT *>(value);
+      return CreatePadV2Options(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_GreaterOptions: {
+      auto ptr = reinterpret_cast<const GreaterOptionsT *>(value);
+      return CreateGreaterOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_GreaterEqualOptions: {
+      auto ptr = reinterpret_cast<const GreaterEqualOptionsT *>(value);
+      return CreateGreaterEqualOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_LessEqualOptions: {
+      auto ptr = reinterpret_cast<const LessEqualOptionsT *>(value);
+      return CreateLessEqualOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SelectOptions: {
+      auto ptr = reinterpret_cast<const SelectOptionsT *>(value);
+      return CreateSelectOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SliceOptions: {
+      auto ptr = reinterpret_cast<const SliceOptionsT *>(value);
+      return CreateSliceOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_TransposeConvOptions: {
+      auto ptr = reinterpret_cast<const TransposeConvOptionsT *>(value);
+      return CreateTransposeConvOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -7046,6 +7834,34 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new NegOptionsT(*reinterpret_cast<NegOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_PadV2Options: {
+      value = new PadV2OptionsT(*reinterpret_cast<PadV2OptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_GreaterOptions: {
+      value = new GreaterOptionsT(*reinterpret_cast<GreaterOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_GreaterEqualOptions: {
+      value = new GreaterEqualOptionsT(*reinterpret_cast<GreaterEqualOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_LessEqualOptions: {
+      value = new LessEqualOptionsT(*reinterpret_cast<LessEqualOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SelectOptions: {
+      value = new SelectOptionsT(*reinterpret_cast<SelectOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SliceOptions: {
+      value = new SliceOptionsT(*reinterpret_cast<SliceOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_TransposeConvOptions: {
+      value = new TransposeConvOptionsT(*reinterpret_cast<TransposeConvOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -7263,6 +8079,41 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_PadV2Options: {
+      auto ptr = reinterpret_cast<PadV2OptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_GreaterOptions: {
+      auto ptr = reinterpret_cast<GreaterOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_GreaterEqualOptions: {
+      auto ptr = reinterpret_cast<GreaterEqualOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_LessEqualOptions: {
+      auto ptr = reinterpret_cast<LessEqualOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SelectOptions: {
+      auto ptr = reinterpret_cast<SelectOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SliceOptions: {
+      auto ptr = reinterpret_cast<SliceOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_TransposeConvOptions: {
+      auto ptr = reinterpret_cast<TransposeConvOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 211de63d58d09374333f0e3fc87f1b0d0e0b7ec7..74fc32a12b12ec3bca81590a74b81bc3caff0d96 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -6,7 +6,8 @@ licenses(["notice"])  # Apache 2.0
 
 load(
     "//tensorflow/contrib/lite:build_def.bzl",
-    "gen_zipped_test_files",
+    "gen_zip_test",
+    "generated_test_models",
 )
 load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
 load(
@@ -14,52 +15,52 @@ load(
     "tf_cc_test",
 )
 
-gen_zipped_test_files(
-    name = "optest",
-    files = [
-        "add.zip",
-        "arg_max.zip",
-        "avg_pool.zip",
-        "batch_to_space_nd.zip",
-        "concat.zip",
-        "constant.zip",
-        "control_dep.zip",
-        "conv.zip",
-        "depthwiseconv.zip",
-        "div.zip",
-        "exp.zip",
-        "floor.zip",
-        "fully_connected.zip",
-        "fused_batch_norm.zip",
-        "gather.zip",
-        "global_batch_norm.zip",
-        "l2_pool.zip",
-        "l2norm.zip",
-        "less.zip",
-        "local_response_norm.zip",
-        "log_softmax.zip",
-        "max_pool.zip",
-        "maximum.zip",
-        "mean.zip",
-        "minimum.zip",
-        "mul.zip",
-        "neg.zip",
-        "pad.zip",
-        "relu.zip",
-        "relu1.zip",
-        "relu6.zip",
-        "reshape.zip",
-        "resize_bilinear.zip",
-        "sigmoid.zip",
-        "softmax.zip",
-        "space_to_batch_nd.zip",
-        "space_to_depth.zip",
-        "split.zip",
-        "squeeze.zip",
-        "strided_slice.zip",
-        "sub.zip",
-        "topk.zip",
-        "transpose.zip",
+[gen_zip_test(
+    name = "zip_test_%s" % test_name,
+    size = "large",
+    srcs = ["generated_examples_zip_test.cc"],
+    args = [
+        "--zip_file_path=$(location :zip_%s)" % test_name,
+        # TODO(angerson) We may be able to add an external unzip binary instead
+        # of relying on an existing one for OSS builds.
+        "--unzip_binary_path=/usr/bin/unzip",
+    ],
+    data = [
+        ":zip_%s" % test_name,
+    ],
+    shard_count = 20,
+    tags = [
+        "gen_zip_test",
+        "no_oss",
+        "tflite_not_portable",
+    ],
+    test_name = test_name,
+    deps = [
+        ":parse_testdata_lib",
+        ":tflite_driver",
+        ":util",
+        "@com_google_googletest//:gtest",
+        "@com_googlesource_code_re2//:re2",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+    ] + select({
+        "//conditions:default": [
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:test",
+        ],
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+            "//tensorflow/core:android_tensorflow_test_lib",
+        ],
+    }),
+) for test_name in generated_test_models()]
+
+test_suite(
+    name = "generated_zip_tests",
+    tags = [
+        "gen_zip_test",
     ],
 )
 
@@ -285,13 +286,9 @@ cc_library(
     deps = [
         ":generate_testspec",
         ":parse_testdata_lib",
-        ":split",
         ":tflite_driver",
-        ":util",
-        "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
     ],
 )
 
@@ -348,42 +345,4 @@ cc_binary(
     ],
 )
 
-tf_cc_test(
-    name = "generated_examples_zip_test",
-    size = "large",
-    srcs = ["generated_examples_zip_test.cc"],
-    args = [
-        "--zip_files_dir=tensorflow/contrib/lite/testing/optest",
-        # TODO(angerson) We may be able to add an external unzip binary instead
-        # of relying on an existing one for OSS builds.
-        "--unzip_binary_path=/usr/bin/unzip",
-    ],
-    data = [":optest"],
-    shard_count = 20,
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
-    deps = [
-        ":parse_testdata_lib",
-        ":tflite_driver",
-        ":util",
-        "@com_google_googletest//:gtest",
-        "@com_googlesource_code_re2//:re2",
-        "//tensorflow/contrib/lite:builtin_op_data",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ] + select({
-        "//conditions:default": [
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-            "//tensorflow/core:test",
-        ],
-        "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib",
-            "//tensorflow/core:android_tensorflow_test_lib",
-        ],
-    }),
-)
-
 tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 926bb3f12182f85c9345d601069d4ab246cf11e7..0e036bda92e4c42056f3f5df9fd8bcddcd932c13 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -20,14 +20,21 @@ Usage:
 generate_examples <output directory>
 
 bazel run //tensorflow/contrib/lite/testing:generate_examples
+
+To more easily debug failures use (or override) the --save_graphdefs flag to
+place text proto graphdefs into the generated zip files.
 """
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import argparse
+import functools
 import itertools
+import operator
 import os
+import random
 import re
 import sys
 import tempfile
@@ -90,21 +97,14 @@ KNOWN_BUGS = {
     r"fully_connected.*transpose_.=True": "67586970",
     # Softmax graphs are too complex.
     r"softmax.*dim=0": "67749831",
-    r"softmax.*input_shape=\[1,3,4,3\]": "67749831",
     # SpaceToDepth only supports float32.
     r"space_to_depth.*(float16|int32|uint8|int64)": "68018134",
     # BatchToSpaceND only supports 4D tensors.
     r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
     # Div will use floordiv.
     r"div.*int32": "72051395",
-    # TOCO require matching dimensions in strided_slice.
-    r"strided_slice.*begin=\[0\].*end=\[1\].*": "73170889",
     # No support for SplitV
     r"split.*num_or_size_splits=\[2,2\]": "73377559",
-    # Needs support for dimensions other than the last one in argmax.
-    r"arg_max.*axis=0.*": "77546240",
-    r"arg_max.*axis=1.*": "77546240",
-    r"arg_max.*axis=2.*": "77546240",
 }
 
 
@@ -328,6 +328,11 @@ def normalize_output_name(output_name):
       ":0") else output_name
 
 
+# How many test cases we may have in a zip file. Too many test cases will
+# slow down the test data generation process.
+_MAX_TESTS_PER_ZIP = 500
+
+
 def make_zip_of_tests(zip_path,
                       test_parameters,
                       make_graph,
@@ -357,12 +362,26 @@ def make_zip_of_tests(zip_path,
   Raises:
     RuntimeError: if there are toco errors that can't be ignored.
   """
+  parameter_count = 0
+  for parameters in test_parameters:
+    parameter_count += functools.reduce(
+        operator.mul, [len(values) for values in parameters.values()])
+
+  if parameter_count > _MAX_TESTS_PER_ZIP:
+    raise RuntimeError(
+        "Too many parameter combinations for generating '%s'.\n"
+        "There are %d combinations while the upper limit is %d.\n"
+        "Having too many combinations will slow down the tests.\n"
+        "Please consider splitting the test into multiple functions.\n"
+        % (zip_path, parameter_count, _MAX_TESTS_PER_ZIP))
 
   # TODO(aselle): Make this allow multiple inputs outputs.
   archive = zipfile.PyZipFile(zip_path, "w")
   zip_manifest = []
   convert_report = []
   toco_errors = 0
+
+  processed_labels = set()
   for parameters in test_parameters:
     keys = parameters.keys()
     for curr in itertools.product(*parameters.values()):
@@ -370,6 +389,12 @@ def make_zip_of_tests(zip_path,
           "%s=%r" % z for z in sorted(zip(keys, curr))).replace(" ", ""))
       if label[0] == "/":
         label = label[1:]
+      if label in processed_labels:
+        # Do not populate data for the same label more than once. It will cause
+        # errors when unzipping.
+        continue
+      processed_labels.add(label)
+
       param_dict = dict(zip(keys, curr))
 
       def build_example(label, param_dict_real):
@@ -430,7 +455,7 @@ def make_zip_of_tests(zip_path,
         report["toco_log"] = toco_log
 
         if FLAGS.save_graphdefs:
-          archive.writestr(label + ".pb",
+          archive.writestr(label + ".pbtxt",
                            text_format.MessageToString(graph_def),
                            zipfile.ZIP_DEFLATED)
 
@@ -468,6 +493,7 @@ def make_zip_of_tests(zip_path,
                 report["toco_log"])
 
       convert_report.append((param_dict, report))
+
   report_io = StringIO()
   report_lib.make_report_table(report_io, zip_path, convert_report)
   archive.writestr("report.html", report_io.getvalue())
@@ -718,8 +744,8 @@ def make_mean_tests(zip_path):
       "const_axis": [True, False],
       "keepdims": [True, False],
   }, {
-      "input_dtype": [tf.float32, tf.int32, tf.int64],
-      "input_shape": [[1, 224, 224, 3]],
+      "input_dtype": [tf.float32],
+      "input_shape": [[1, 8, 8, 3]],
       "axis": [
           None, 0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2, 3],
           [3, 2, 1, 0], [3, 1, 0, 2], [2, 0], [3, 0], [3, 1], [1, 0], -1, -2,
@@ -1034,40 +1060,39 @@ def make_fused_batch_norm_tests(zip_path):
 def make_conv_tests(zip_path):
   """Make a set of tests to do convolution."""
 
-  test_parameters = [
-      {
-          "input_shape": [[1, 3, 4, 3]],
-          "filter_shape": [[1, 1, 3, 2]],
-          "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
-          "dilations": [[1, 1, 1, 1], [1, 3, 2, 1], [1, 2, 2, 1]],
-          "padding": ["SAME", "VALID"],
-          "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
-          "constant_filter": [True, False],
-      },
-      {
-          "input_shape": [[2, 14, 14, 2]],
-          "filter_shape": [[6, 6, 2, 2]],
-          "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
-          "dilations": [[1, 1, 1, 1], [1, 2, 2, 1]],
-          "padding": ["SAME", "VALID"],
-          "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
-          "constant_filter": [True, False],
-      }
-  ]
+  test_parameters = [{
+      "input_shape": [[1, 3, 4, 3], [4, 6, 6, 1]],
+      "filter_shape": [[1, 1], [2, 3], [3, 3]],
+      "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+      "dilations": [[1, 1, 1, 1], [1, 3, 2, 1], [1, 2, 2, 1]],
+      "padding": ["SAME", "VALID"],
+      "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
+      "constant_filter": [True, False],
+      "channel_multiplier": [1, 2],
+  }]
+
+  def get_tensor_shapes(parameters):
+    input_shape = parameters["input_shape"]
+    filter_size = parameters["filter_shape"]
+    filter_shape = filter_size + [
+        input_shape[3], parameters["channel_multiplier"]
+    ]
+    return [input_shape, filter_shape]
 
   def build_graph(parameters):
     """Build a conv graph given `parameters`."""
+    input_shape, filter_shape = get_tensor_shapes(parameters)
     input_tensor = tf.placeholder(
-        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+        dtype=tf.float32, name="input", shape=input_shape)
 
     # Get filter input either as a placeholder or constants. Also get a list of
     # the input tensors that are represented as placeholders.
     if parameters["constant_filter"]:
-      filter_input = create_tensor_data(np.float32, parameters["filter_shape"])
+      filter_input = create_tensor_data(np.float32, filter_shape)
       input_tensors = [input_tensor]
     else:
       filter_input = tf.placeholder(
-          dtype=tf.float32, name="filter", shape=parameters["filter_shape"])
+          dtype=tf.float32, name="filter", shape=filter_shape)
       input_tensors = [input_tensor, filter_input]
 
     out = tf.nn.conv2d(
@@ -1082,9 +1107,10 @@ def make_conv_tests(zip_path):
   def build_inputs(parameters, sess, inputs, outputs):
     # Build list of input values either containing 1 tensor (input) or 2 tensors
     # (input, filter) based on whether filter is constant or variable input.
-    values = [create_tensor_data(np.float32, parameters["input_shape"])]
+    input_shape, filter_shape = get_tensor_shapes(parameters)
+    values = [create_tensor_data(np.float32, input_shape)]
     if not parameters["constant_filter"]:
-      values.append(create_tensor_data(np.float32, parameters["filter_shape"]))
+      values.append(create_tensor_data(np.float32, filter_shape))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
@@ -1316,10 +1342,10 @@ def make_local_response_norm_tests(zip_path):
   # Chose a set of parameters
   test_parameters = [{
       "input_shape": [[1, 1, 1, 1], [1, 3, 4, 3], [3, 15, 14, 3]],
-      "depth_radius": [None, 0, 1, 3, 4, 5],
-      "bias": [None, 0.1, 0.3, -0.1],
-      "alpha": [None, 1, 2, -3],
-      "beta": [None, 0.5, 0.25, 2],
+      "depth_radius": [None, 0, 1, 3, 5],
+      "bias": [None, 0.3, -0.1],
+      "alpha": [None, 2, -3],
+      "beta": [None, 0.25, 2],
   }]
 
   def build_graph(parameters):
@@ -1391,6 +1417,60 @@ def make_pad_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_padv2_tests(zip_path):
+  """Make a set of tests to do padv2."""
+
+  # TODO(nupurgarg): Add test for tf.uint8.
+  test_parameters = [
+      {
+          "dtype": [tf.int32, tf.int64, tf.float32],
+          "input_shape": [[1, 1, 2, 1], [2, 1, 1, 1]],
+          "paddings": [[[0, 0], [0, 1], [2, 3], [0, 0]], [[0, 1], [0, 0],
+                                                          [0, 0], [2, 3]]],
+          "constant_paddings": [True, False],
+          "constant_values": [0, 2],
+      },
+      # Non-4D use case.
+      {
+          "dtype": [tf.int32, tf.int64, tf.float32],
+          "input_shape": [[1, 2], [0, 1, 2]],
+          "paddings": [[[0, 1], [2, 3]]],
+          "constant_paddings": [True, False],
+          "constant_values": [0, 2],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build a pad graph given `parameters`."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+
+    # Get paddings as either a placeholder or constants.
+    if parameters["constant_paddings"]:
+      paddings = parameters["paddings"]
+      input_tensors = [input_tensor]
+    else:
+      shape = [len(parameters["paddings"]), 2]
+      paddings = tf.placeholder(dtype=tf.int32, name="padding", shape=shape)
+      input_tensors = [input_tensor, paddings]
+
+    out = tf.pad(input_tensor, paddings=paddings,
+                 constant_values=parameters["constant_values"])
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_paddings"]:
+      values.append(np.array(parameters["paddings"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_reshape_tests(zip_path):
   """Make a set of tests to do reshape."""
 
@@ -1740,65 +1820,8 @@ def make_squeeze_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_strided_slice_tests(zip_path):
-  """Make a set of tests to do strided_slice."""
-
-  # TODO(soroosh): add test/support for uint8.
-  test_parameters = [
-      # 4-D
-      {
-          "dtype": [tf.float32, tf.int32, tf.int64],
-          "index_type": [tf.int32],
-          "input_shape": [[12, 2, 2, 5]],
-          "begin": [[0, 0, 0, 0], [1, 0, 1, 0]],
-          "end": [[8, 2, 2, 3], [12, 2, 2, 5]],
-          "strides": [None, [2, 1, 3, 1]],
-          "begin_mask": [None, 1, 8],
-          "end_mask": [None, 1, 8],
-          "shrink_axis_mask": [None, 1, 8, 11, 15, -1],
-          "constant_indices": [False, True],
-      },
-      # TODO(b/73170889) Restore test parameters removed in cl/191608113.
-      # 2-D
-      {
-          "dtype": [tf.float32, tf.int32, tf.int64],
-          "index_type": [tf.int32],
-          "input_shape": [[2, 3]],
-          "begin": [[0, 0], [1, 0]],
-          "end": [[2, 3], [2, 2]],
-          "strides": [None, [2, 2]],
-          "begin_mask": [None, 1, 2],
-          "end_mask": [None, 1, 2],
-          "shrink_axis_mask": [None, 1, 2, 3, -1],
-          "constant_indices": [False, True],
-      },
-      # 1-D Exhaustive
-      {
-          "dtype": [tf.float32],
-          "index_type": [tf.int32],
-          "input_shape": [[4]],
-          "begin": [[-100], [-3], [-2], [-1], [0], [1], [2], [3], [100]],
-          "end": [[-100], [-3], [-2], [-1], [0], [1], [2], [3], [100]],
-          "strides": [-2, -1, 1, 2],
-          "begin_mask": [0, 1],
-          "end_mask": [0, 1],
-          "shrink_axis_mask": [0],
-          "constant_indices": [False],
-      },
-      # Negative strides
-      {
-          "dtype": [tf.float32],
-          "index_type": [tf.int32],
-          "input_shape": [[2, 3]],
-          "begin": [[0, -1]],
-          "end": [[2, -3]],
-          "strides": [[1, -1]],
-          "begin_mask": [None, 1, 2],
-          "end_mask": [None, 1, 2],
-          "shrink_axis_mask": [None, 1, 2, 3, -1],
-          "constant_indices": [False],
-      },
-  ]
+def _make_strided_slice_tests(zip_path, test_parameters):
+  """Utility function to make strided_slice_tests based on parameters."""
 
   def build_graph(parameters):
     """Build graph for stride_slice test."""
@@ -1860,6 +1883,100 @@ def make_strided_slice_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_strided_slice_tests(zip_path):
+  """Make a set of tests to do strided_slice."""
+
+  # TODO(soroosh): add test/support for uint8.
+  test_parameters = [
+      # 4-D (basic cases with const/non-const indices).
+      {
+          "dtype": [tf.float32, tf.int32, tf.int64],
+          "index_type": [tf.int32],
+          "input_shape": [[12, 2, 2, 5]],
+          "strides": [None, [2, 1, 3, 1]],
+          "begin": [[0, 0, 0, 0]],
+          "end": [[12, 2, 2, 5]],
+          "begin_mask": [None],
+          "end_mask": [None],
+          "shrink_axis_mask": [None],
+          "constant_indices": [False, True],
+      },
+      # 4-D with non-trivial begin & end.
+      {
+          "dtype": [tf.float32],
+          "index_type": [tf.int32],
+          "input_shape": [[12, 2, 2, 5]],
+          "begin": [[0, 0, 0, 0], [1, 0, 1, 0]],
+          "end": [[8, 2, 2, 3], [12, 2, 2, 5]],
+          "strides": [None, [2, 1, 3, 1]],
+          "begin_mask": [None, 8],
+          "end_mask": [None, 3],
+          "shrink_axis_mask": [None, 15, -1],
+          "constant_indices": [True],
+      },
+      # Begin, end, strides dim are different from input shape
+      {
+          "dtype": [tf.float32],
+          "index_type": [tf.int32],
+          "input_shape": [[12, 2, 2, 5]],
+          "begin": [[0]],
+          "end": [[1]],
+          "strides": [None, [1]],
+          "begin_mask": [0],
+          "end_mask": [0],
+          "shrink_axis_mask": [1],
+          "constant_indices": [True],
+      },
+      # 2-D
+      {
+          "dtype": [tf.float32],
+          "index_type": [tf.int32],
+          "input_shape": [[2, 3]],
+          "begin": [[0, 0]],
+          "end": [[2, 2]],
+          "strides": [None, [2, 2]],
+          "begin_mask": [None, 1, 2],
+          "end_mask": [None, 1, 2],
+          "shrink_axis_mask": [None, 1, 2, 3, -1],
+          "constant_indices": [False, True],
+      },
+      # Negative strides
+      {
+          "dtype": [tf.float32],
+          "index_type": [tf.int32],
+          "input_shape": [[2, 3]],
+          "begin": [[0, -1]],
+          "end": [[2, -3]],
+          "strides": [[1, -1]],
+          "begin_mask": [None, 1, 2],
+          "end_mask": [None, 1, 2],
+          "shrink_axis_mask": [None, 1, 2, 3, -1],
+          "constant_indices": [False],
+      },
+  ]
+  _make_strided_slice_tests(zip_path, test_parameters)
+
+
+def make_strided_slice_1d_exhaustive_tests(zip_path):
+  """Make a set of exhaustive tests for 1D strided_slice."""
+  test_parameters = [
+      # 1-D Exhaustive
+      {
+          "dtype": [tf.float32],
+          "index_type": [tf.int32],
+          "input_shape": [[3]],
+          "begin": [[-2], [-1], [0], [1], [2]],
+          "end": [[-2], [-1], [0], [1], [2]],
+          "strides": [[-2], [-1], [1], [2]],
+          "begin_mask": [0, 1],
+          "end_mask": [0, 1],
+          "shrink_axis_mask": [0],
+          "constant_indices": [False],
+      },
+  ]
+  _make_strided_slice_tests(zip_path, test_parameters)
+
+
 def make_lstm_tests(zip_path):
   """Make a set of tests to do basic Lstm cell."""
 
@@ -1978,8 +2095,8 @@ def make_arg_max_tests(zip_path):
   test_parameters = [{
       "input_dtype": [tf.float32, tf.int32],
       "input_shape": [[1, 1, 1, 3], [2, 3, 4, 5], [2, 3, 3], [5, 5], [10]],
-      "axis": [0, 1, 2, 3],
       "output_type": [tf.int32, tf.int64],
+      "axis_is_last_dim": [True, False],
   }]
 
   def build_graph(parameters):
@@ -1988,7 +2105,10 @@ def make_arg_max_tests(zip_path):
         dtype=parameters["input_dtype"],
         name="input",
         shape=parameters["input_shape"])
-    axis = tf.constant(parameters["axis"], name="axis")
+    if parameters["axis_is_last_dim"]:
+      axis = len(parameters["input_shape"]) - 1
+    else:
+      axis = random.randint(0, max(len(parameters["input_shape"]) - 2, 0))
     out = tf.arg_max(input_value, axis, output_type=parameters["output_type"])
     return [input_value], [out]
 
@@ -2001,6 +2121,74 @@ def make_arg_max_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_greater_tests(zip_path):
+  """Make a set of tests to do greater."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32, tf.int64],
+      "input_shape_pair": [([1, 1, 1, 3], [1, 1, 1, 3]),
+                           ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]),
+                           ([5, 5], [1]), ([10], [2, 4, 10])],
+  }]
+
+  def build_graph(parameters):
+    """Build the greater op testing graph."""
+    input_value1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape_pair"][0])
+    input_value2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input2",
+        shape=parameters["input_shape_pair"][1])
+    out = tf.greater(input_value1, input_value2)
+    return [input_value1, input_value2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][0])
+    input_value2 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_greater_equal_tests(zip_path):
+  """Make a set of tests to do greater_equal."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32, tf.int64],
+      "input_shape_pair": [([1, 1, 1, 3], [1, 1, 1, 3]),
+                           ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]),
+                           ([5, 5], [1]), ([10], [2, 4, 10])],
+  }]
+
+  def build_graph(parameters):
+    """Build the greater_equal op testing graph."""
+    input_value1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape_pair"][0])
+    input_value2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input2",
+        shape=parameters["input_shape_pair"][1])
+    out = tf.greater_equal(input_value1, input_value2)
+    return [input_value1, input_value2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][0])
+    input_value2 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_less_tests(zip_path):
   """Make a set of tests to do less."""
 
@@ -2035,6 +2223,40 @@ def make_less_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_less_equal_tests(zip_path):
+  """Make a set of tests to do less_equal."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32, tf.int64],
+      "input_shape_pair": [([1, 1, 1, 3], [1, 1, 1, 3]),
+                           ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]),
+                           ([5, 5], [1]), ([10], [2, 4, 10])],
+  }]
+
+  def build_graph(parameters):
+    """Build the less_equal op testing graph."""
+    input_value1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape_pair"][0])
+    input_value2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input2",
+        shape=parameters["input_shape_pair"][1])
+    out = tf.less_equal(input_value1, input_value2)
+    return [input_value1, input_value2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][0])
+    input_value2 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_floor_tests(zip_path):
   """Make a set of tests to do floor."""
 
@@ -2086,6 +2308,183 @@ def make_neg_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_sin_tests(zip_path):
+  """Make a set of tests to do sin."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape": [[1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    """Build the sin op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape"])
+    out = tf.sin(input_value)
+    return [input_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict={inputs[0]: input_value})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_where_tests(zip_path):
+  """Make a set of tests to do where."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32],
+      "input_shape_set": [([1, 2, 3, 4], [1, 2, 3, 4]),],
+  }]
+
+  def build_graph(parameters):
+    """Build the where op testing graph."""
+    input_value1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input2",
+        shape=parameters["input_shape_set"][0])
+    input_value2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input3",
+        shape=parameters["input_shape_set"][1])
+    less = tf.less(input_value1, input_value2)
+    out = tf.where(less, input_value1, input_value2)
+    return [input_value1, input_value2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_set"][0])
+    input_value2 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_set"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_slice_tests(zip_path):
+  """Make a set of tests to do slice."""
+
+  # TODO(renjieliu): add test/support for uint8.
+  test_parameters = [
+      # 4-D
+      {
+          "dtype": [tf.float32, tf.int32, tf.int64],
+          "index_type": [tf.int32, tf.int64],
+          "input_shape": [[12, 2, 2, 5]],
+          "begin": [[0, 0, 0, 0], [1, 0, 1, 0]],
+          "size": [[8, 2, 2, 3], [11, 2, 1, 5]],
+      },
+      # 2-D
+      {
+          "dtype": [tf.float32, tf.int32, tf.int64],
+          "index_type": [tf.int32, tf.int64],
+          "input_shape": [[2, 3]],
+          "begin": [[0, 0], [1, 0]],
+          "size": [[2, 3], [2, 2]],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build graph for slice test."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    begin = tf.placeholder(
+        dtype=parameters["index_type"],
+        name="begin",
+        shape=[len(parameters["input_shape"])])
+    size = tf.placeholder(
+        dtype=parameters["index_type"],
+        name="size",
+        shape=[len(parameters["input_shape"])])
+    tensors = [input_tensor, begin, size]
+    out = tf.slice(input_tensor, begin, size)
+    return tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Build inputs for slice test."""
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    index_type = _TF_TYPE_INFO[parameters["index_type"]][0]
+
+    begin_values = np.array(parameters["begin"]).astype(index_type)
+    size_values = np.array(parameters["size"]).astype(index_type)
+    values = [input_values, begin_values, size_values]
+
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+# Since compute output_shape is fairly complicated for
+# tf.nn.conv2d_backprop_input input_sizes argument, so we here first perform a
+# "conv2d" operation to get the output, then we use the output to feed in
+# tf.nn.conv2d_backprop_input.
+# This test will depend on the "conv2d" operation's correctness.
+def make_transpose_conv_tests(zip_path):
+  """Make a set of tests to do transpose_conv."""
+
+  # Tensorflow only supports equal strides
+  test_parameters = [{
+      "input_shape": [[1, 3, 4, 1], [1, 10, 10, 3], [3, 20, 20, 1]],
+      "filter_size": [[1, 1], [1, 2], [3, 3]],
+      "strides": [[1, 1, 1, 1], [1, 3, 3, 1]],
+      "padding": ["SAME", "VALID"],
+      "data_format": ["NHWC"],
+      "channel_multiplier": [1, 2],
+  }]
+
+  def get_tensor_shapes(parameters):
+    input_shape = parameters["input_shape"]
+    filter_size = parameters["filter_size"]
+    filter_shape = filter_size + [
+        input_shape[3], parameters["channel_multiplier"]
+    ]
+    return [input_shape, filter_shape]
+
+  def build_graph(parameters):
+    """Build a transpose_conv graph given `parameters`."""
+    input_shape, filter_shape = get_tensor_shapes(parameters)
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=input_shape)
+
+    filter_input = tf.placeholder(
+        dtype=tf.float32, name="filter", shape=filter_shape)
+
+    conv_outputs = tf.nn.conv2d(
+        input_tensor,
+        filter_input,
+        strides=parameters["strides"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    out = tf.nn.conv2d_backprop_input(
+        input_shape,
+        filter_input,
+        conv_outputs,
+        strides=parameters["strides"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    input_tensors = [input_tensor, filter_input]
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_shape, filter_shape = get_tensor_shapes(parameters)
+    values = [
+        create_tensor_data(np.float32, input_shape),
+        create_tensor_data(np.float32, filter_shape)
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.cc b/tensorflow/contrib/lite/testing/generate_testspec.cc
index 6580845af42b3cdded19b578b41c682089aaf9ef..c0c861ff6da2fc144b9303dfdd48f19794cebeca 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.cc
+++ b/tensorflow/contrib/lite/testing/generate_testspec.cc
@@ -80,11 +80,30 @@ bool GenerateTestSpecFromTensorflowModel(
   // Invoke tensorflow model.
   TfDriver runner(input_layer, input_layer_type, input_layer_shape,
                   output_layer);
+  if (!runner.IsValid()) {
+    cerr << runner.GetErrorMessage() << endl;
+    return false;
+  }
+
   runner.LoadModel(tensorflow_model_path);
+  if (!runner.IsValid()) {
+    cerr << runner.GetErrorMessage() << endl;
+    return false;
+  }
+
   for (int i = 0; i < input_values.size(); i++) {
     runner.SetInput(i, input_values[i]);
+    if (!runner.IsValid()) {
+      cerr << runner.GetErrorMessage() << endl;
+      return false;
+    }
   }
+
   runner.Invoke();
+  if (!runner.IsValid()) {
+    cerr << runner.GetErrorMessage() << endl;
+    return false;
+  }
 
   // Write test spec.
   stream << "load_model: " << tflite_model_path << "\n";
@@ -99,6 +118,10 @@ bool GenerateTestSpecFromTensorflowModel(
   }
   for (int i = 0; i < output_layer.size(); i++) {
     stream << "  output: \"" << runner.ReadOutput(i) << "\"\n";
+    if (!runner.IsValid()) {
+      cerr << runner.GetErrorMessage() << endl;
+      return false;
+    }
   }
   stream << "}\n";
 
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 0673a3bb462d7ce4705e02d10bb66c40582b8c07..2f069ff8e79b4a08824121c49e9327619cfeb858 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -35,7 +35,7 @@ namespace {
 bool FLAGS_ignore_known_bugs = true;
 // TODO(b/71769302) zip_files_dir should have a more accurate default, if
 // possible
-string* FLAGS_zip_files_dir = new string("./");
+string* FLAGS_zip_file_path = new string("./");
 string* FLAGS_unzip_binary_path = new string("/usr/bin/unzip");
 }  // namespace
 
@@ -54,17 +54,15 @@ std::map<string, string> kBrokenTests = {
     {R"(^\/div.*int32)", "68808744"},
     {R"(^\/sub.*int32)", "68808744"},
 
-    // Pad only supports 4D tensors.
+    // Pad and PadV2 only supports 4D tensors.
     {R"(^\/pad.*,input_shape=\[.,.\],paddings=\[\[.,.\],\[.,.\]\])",
      "70527055"},
+    {R"(^\/padv2.*,input_shape=\[.,.\],paddings=\[\[.,.\],\[.,.\]\])",
+     "70527055"},
 
     // L2Norm only supports tensors with 4D or fewer.
     {R"(^\/l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"},
 
-    // BatchToSpaceND doesn't support cropping. This catches test cases with
-    // non-const tensors as crops.
-    {R"(^\/batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\])", "70594634"},
-
     // SpaceToBatchND only supports 4D tensors.
     {R"(^\/space_to_batch_nd.*input_shape=\[1,4,4,4,1,1\])", "70848787"},
 
@@ -96,9 +94,11 @@ std::map<string, string> kBrokenTests = {
     {R"(^\/gather.*axis=1)", "76910444"},
 
     // No support for arbitrary dimensions in ArgMax.
-    {R"(^\/arg_max.*axis=0)", "77546240"},
-    {R"(^\/arg_max.*axis=1)", "77546240"},
-    {R"(^\/arg_max.*axis=2)", "77546240"},
+    {R"(^\/arg_max.*axis_is_last_dim=False.*input_shape=\[.,.,.,.\])",
+     "77546240"},
+    {R"(^\/arg_max.*axis_is_last_dim=False.*input_shape=\[.,.,.\])",
+     "77546240"},
+    {R"(^\/arg_max.*axis_is_last_dim=False.*input_shape=\[.,.\])", "77546240"},
 };
 
 // Allows test data to be unzipped into a temporary directory and makes
@@ -137,7 +137,10 @@ class ZipEnvironment : public ::testing::Environment {
       *out_dir = dir;
       return tensorflow::Status::OK();
     } else {
-      return tensorflow::Status(tensorflow::error::UNKNOWN, "unzip failed");
+      return tensorflow::Status(tensorflow::error::UNKNOWN,
+                                "unzip failed. "
+                                "stdout:\n" +
+                                    out + "\nstderr:\n" + err);
     }
   }
 
@@ -191,8 +194,7 @@ tensorflow::Status ReadManifest(const string& original_file, const string& dir,
 }
 
 // Get a list of tests from a zip file `zip_file_name`.
-std::vector<string> UnarchiveZipAndFindTestNames(const string& zip_file_name) {
-  string zip_file = *FLAGS_zip_files_dir + "/" + zip_file_name;
+std::vector<string> UnarchiveZipAndFindTestNames(const string& zip_file) {
   string decompress_tmp_dir;
   TF_CHECK_OK(zip_environment()->UnZip(zip_file, &decompress_tmp_dir));
   std::vector<string> stuff;
@@ -202,7 +204,7 @@ std::vector<string> UnarchiveZipAndFindTestNames(const string& zip_file_name) {
 
 class OpsTest : public ::testing::TestWithParam<string> {};
 
-TEST_P(OpsTest, RunStuff) {
+TEST_P(OpsTest, RunZipTests) {
   string test_path = GetParam();
   string tflite_test_case = test_path + "_tests.txt";
   string tflite_dir = test_path.substr(0, test_path.find_last_of("/"));
@@ -225,7 +227,9 @@ TEST_P(OpsTest, RunStuff) {
     EXPECT_TRUE(result) << test_driver.GetErrorMessage();
   } else {
     if (FLAGS_ignore_known_bugs) {
-      EXPECT_FALSE(result);
+      EXPECT_FALSE(result) << "Test was expected to fail but is now passing; "
+                              "you can mark http://b/"
+                           << bug_number << " as fixed! Yay!";
     } else {
       EXPECT_TRUE(result) << test_driver.GetErrorMessage()
                           << ": Possibly due to http://b/" << bug_number;
@@ -233,56 +237,26 @@ TEST_P(OpsTest, RunStuff) {
   }
 }
 
-// Instantiate a test. This assumes `zip_base`.zip is a declared data file
-// of this test.
-#define INSTANTIATE_TESTS(zip_base) \
-  INSTANTIATE_TEST_CASE_P(          \
-      zip_base, OpsTest,            \
-      ::testing::ValuesIn(UnarchiveZipAndFindTestNames(#zip_base ".zip")));
-
-INSTANTIATE_TESTS(add)
-INSTANTIATE_TESTS(arg_max)
-INSTANTIATE_TESTS(avg_pool)
-INSTANTIATE_TESTS(batch_to_space_nd)
-INSTANTIATE_TESTS(concat)
-INSTANTIATE_TESTS(constant)
-INSTANTIATE_TESTS(control_dep)
-INSTANTIATE_TESTS(conv)
-INSTANTIATE_TESTS(depthwiseconv)
-INSTANTIATE_TESTS(div)
-INSTANTIATE_TESTS(exp)
-INSTANTIATE_TESTS(floor)
-INSTANTIATE_TESTS(fully_connected)
-INSTANTIATE_TESTS(fused_batch_norm)
-INSTANTIATE_TESTS(gather)
-INSTANTIATE_TESTS(global_batch_norm)
-INSTANTIATE_TESTS(l2_pool)
-INSTANTIATE_TESTS(l2norm)
-INSTANTIATE_TESTS(less)
-INSTANTIATE_TESTS(local_response_norm)
-INSTANTIATE_TESTS(log_softmax)
-INSTANTIATE_TESTS(max_pool)
-INSTANTIATE_TESTS(maximum)
-INSTANTIATE_TESTS(mean)
-INSTANTIATE_TESTS(minimum)
-INSTANTIATE_TESTS(mul)
-INSTANTIATE_TESTS(neg)
-INSTANTIATE_TESTS(pad)
-// INSTANTIATE_TESTS(prelu)
-INSTANTIATE_TESTS(relu)
-INSTANTIATE_TESTS(relu1)
-INSTANTIATE_TESTS(relu6)
-INSTANTIATE_TESTS(reshape)
-INSTANTIATE_TESTS(resize_bilinear)
-INSTANTIATE_TESTS(sigmoid)
-INSTANTIATE_TESTS(softmax)
-INSTANTIATE_TESTS(space_to_batch_nd)
-INSTANTIATE_TESTS(space_to_depth)
-INSTANTIATE_TESTS(split)
-INSTANTIATE_TESTS(squeeze)
-INSTANTIATE_TESTS(strided_slice)
-INSTANTIATE_TESTS(sub)
-INSTANTIATE_TESTS(transpose)
+struct ZipPathParamName {
+  template <class ParamType>
+  string operator()(const ::testing::TestParamInfo<ParamType>& info) const {
+    string param_name = info.param;
+    size_t last_slash = param_name.find_last_of("\\/");
+    if (last_slash != string::npos) {
+      param_name = param_name.substr(last_slash);
+    }
+    for (size_t index = 0; index < param_name.size(); ++index) {
+      if (!isalnum(param_name[index]) && param_name[index] != '_')
+        param_name[index] = '_';
+    }
+    return param_name;
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    tests, OpsTest,
+    ::testing::ValuesIn(UnarchiveZipAndFindTestNames(*FLAGS_zip_file_path)),
+    ZipPathParamName());
 
 }  // namespace testing
 }  // namespace tflite
@@ -295,8 +269,8 @@ int main(int argc, char** argv) {
           "ignore_known_bugs", &tflite::testing::FLAGS_ignore_known_bugs,
           "If a particular model is affected by a known bug, the "
           "corresponding test should expect the outputs to not match."),
-      tensorflow::Flag("zip_files_dir", tflite::testing::FLAGS_zip_files_dir,
-                       "Required: Location of the test zips."),
+      tensorflow::Flag("zip_file_path", tflite::testing::FLAGS_zip_file_path,
+                       "Required: Location of the test zip file."),
       tensorflow::Flag("unzip_binary_path",
                        tflite::testing::FLAGS_unzip_binary_path,
                        "Required: Location of a suitable unzip binary.")};
diff --git a/tensorflow/contrib/lite/testing/join.h b/tensorflow/contrib/lite/testing/join.h
index ce8c072a21c6e61e8ab8ae12ba52418e6144009a..1edee01cf97da3c53be1895e667b005551ac2991 100644
--- a/tensorflow/contrib/lite/testing/join.h
+++ b/tensorflow/contrib/lite/testing/join.h
@@ -22,7 +22,7 @@ limitations under the License.
 namespace tflite {
 namespace testing {
 
-// Join a list of data separated by delimieter.
+// Join a list of data separated by delimiter.
 template <typename T>
 string Join(T* data, size_t len, const string& delimiter) {
   if (len == 0 || data == nullptr) {
@@ -36,6 +36,22 @@ string Join(T* data, size_t len, const string& delimiter) {
   return result.str();
 }
 
+// Join a list of uint8 data separated by a delimiter. Cast data to int before
+// placing it in the string to prevent values from being treated like chars.
+template <>
+inline string Join<uint8_t>(uint8_t* data, size_t len,
+                            const string& delimiter) {
+  if (len == 0 || data == nullptr) {
+    return "";
+  }
+  std::stringstream result;
+  result << static_cast<int>(data[0]);
+  for (int i = 1; i < len; i++) {
+    result << delimiter << static_cast<int>(data[i]);
+  }
+  return result.str();
+}
+
 }  // namespace testing
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/testing/test_runner.h b/tensorflow/contrib/lite/testing/test_runner.h
index 05770beee23275ebe210606dbfd2b33eea17612d..96ab6be54e528334f9e4a8cc259e44f99878fefb 100644
--- a/tensorflow/contrib/lite/testing/test_runner.h
+++ b/tensorflow/contrib/lite/testing/test_runner.h
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
 #define TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
 
+#include <iostream>
 #include <memory>
 #include <string>
 #include <vector>
@@ -89,6 +90,7 @@ class TestRunner {
 
   // Invalidate the test runner, preventing it from executing any further.
   void Invalidate(const string& error_message) {
+    cerr << error_message << std::endl;
     error_message_ = error_message;
   }
   bool IsValid() const { return error_message_.empty(); }
diff --git a/tensorflow/contrib/lite/testing/tf_driver.cc b/tensorflow/contrib/lite/testing/tf_driver.cc
index 7b295875aab12bf48da2341ce05dd53442464cf0..3b27f6f3da92ce80c3830feb7c6af095e7c48e9c 100644
--- a/tensorflow/contrib/lite/testing/tf_driver.cc
+++ b/tensorflow/contrib/lite/testing/tf_driver.cc
@@ -103,7 +103,7 @@ void TfDriver::LoadModel(const string& bin_file_path) {
   session_.reset(tensorflow::NewSession(options));
   auto status = session_->Create(graphdef);
   if (!status.ok()) {
-    Invalidate("Failed to create session");
+    Invalidate("Failed to create session. " + status.error_message());
   }
 }
 
diff --git a/tensorflow/contrib/lite/tflite_static.bp b/tensorflow/contrib/lite/tflite_static.bp
index 2bce3d823bbd8edf34609e83af5129d5921a7400..d873ee234e802b62bed08a3bfa8dbca4069d9269 100644
--- a/tensorflow/contrib/lite/tflite_static.bp
+++ b/tensorflow/contrib/lite/tflite_static.bp
@@ -24,19 +24,23 @@ cc_library_static {
         "error_reporter.cc",
 	"graph_info.cc",
         "interpreter.cc",
-        "model.cc",
+        "model.cc",	
         "nnapi_delegate.cc",
         "optional_debug_tools.cc",
+        "op_resolver.cc",		
         "simple_memory_arena.cc",
         "string_util.cc",
         "util.cc",
+	"kernels/elementwise.cc",
+	"kernels/split.cc",
+	"kernels/topk_v2.cc",
         "kernels/activations.cc",
         "kernels/add.cc",
         "kernels/arg_max.cc",
         "kernels/basic_rnn.cc",
         "kernels/batch_to_space_nd.cc",
-        "kernels/bidirectional_sequence_rnn.cc",
         "kernels/bidirectional_sequence_lstm.cc",
+        "kernels/bidirectional_sequence_rnn.cc",
         "kernels/cast.cc",
         "kernels/comparisons.cc",
         "kernels/concatenation.cc",
@@ -45,9 +49,9 @@ cc_library_static {
         "kernels/dequantize.cc",
         "kernels/div.cc",
         "kernels/eigen_support.cc",
-        "kernels/exp.cc",
         "kernels/embedding_lookup.cc",
         "kernels/embedding_lookup_sparse.cc",
+        "kernels/exp.cc",
         "kernels/floor.cc",
         "kernels/fully_connected.cc",
         "kernels/gather.cc",
@@ -67,16 +71,17 @@ cc_library_static {
         "kernels/register.cc",
         "kernels/reshape.cc",
         "kernels/resize_bilinear.cc",
+        "kernels/select.cc",	
         "kernels/skip_gram.cc",
+        "kernels/slice.cc",
         "kernels/space_to_batch_nd.cc",
         "kernels/space_to_depth.cc",
-	"kernels/split.cc",
         "kernels/squeeze.cc",
         "kernels/strided_slice.cc",
         "kernels/sub.cc",
         "kernels/svdf.cc",
-	"kernels/topk_v2.cc",
         "kernels/transpose.cc",
+        "kernels/transpose_conv.cc",
         "kernels/unidirectional_sequence_lstm.cc",
         "kernels/unidirectional_sequence_rnn.cc",
         "kernels/internal/kernel_utils.cc",
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index ce0a74724a4008b6956bdda00e1f186e011e5ed2..b8acc9a8e0361a4c38fcbe2f16be172e637b95c6 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -273,6 +273,7 @@ cc_library(
         "graph_transformations/resolve_constant_range.cc",
         "graph_transformations/resolve_constant_reshape.cc",
         "graph_transformations/resolve_constant_shape_or_rank.cc",
+        "graph_transformations/resolve_constant_slice.cc",
         "graph_transformations/resolve_constant_stack.cc",
         "graph_transformations/resolve_constant_strided_slice.cc",
         "graph_transformations/resolve_constant_transpose.cc",
@@ -280,6 +281,7 @@ cc_library(
         "graph_transformations/resolve_mean_attributes.cc",
         "graph_transformations/resolve_multiply_by_zero.cc",
         "graph_transformations/resolve_pad_attributes.cc",
+        "graph_transformations/resolve_padv2_attributes.cc",
         "graph_transformations/resolve_reorder_axes.cc",
         "graph_transformations/resolve_reshape_attributes.cc",
         "graph_transformations/resolve_slice_attributes.cc",
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc
index 166ead918471ee1b06d9683b8dc7baf7bcbdc427..6e5927295fe2a9d237683155c9bca90048d478c4 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.cc
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc
@@ -91,10 +91,7 @@ Color GetColorForArray(const Model& model, const string& array_name) {
   // We use gray colors for them because they are the majority
   // of arrays so we want to highlight other arrays instead of them.
   // First, we use a bolder gray for input/output arrays:
-  const auto& dump_options = *GraphVizDumpOptions::singleton();
-  if (IsInputArray(model, array_name) ||
-      array_name == dump_options.graphviz_first_array ||
-      array_name == dump_options.graphviz_last_array) {
+  if (IsInputArray(model, array_name)) {
     return Color(0x9E, 0x9E, 0x9E);
   }
   if (IsOutputArray(model, array_name)) {
@@ -287,47 +284,6 @@ NodeProperties GetPropertiesForOperator(const Operator& op) {
   return node_properties;
 }
 
-std::vector<const Operator*> OperatorsToDump(const Model& model) {
-  const auto& dump_options = *GraphVizDumpOptions::singleton();
-  bool first_specified = !dump_options.graphviz_first_array.empty();
-  bool last_specified = !dump_options.graphviz_last_array.empty();
-  CHECK_EQ(first_specified, last_specified);
-  std::vector<const Operator*> ops_to_dump;
-  if (last_specified) {
-    // Return only the part of the graph between graphviz_first_array
-    // and graphviz_last_array.
-    CHECK(model.HasArray(dump_options.graphviz_first_array));
-    CHECK(model.HasArray(dump_options.graphviz_last_array));
-    std::unordered_set<string> arrays_already_produced;
-    std::vector<string> arrays_to_produce;
-    arrays_to_produce.push_back(dump_options.graphviz_last_array);
-    while (!arrays_to_produce.empty()) {
-      const string array = arrays_to_produce.back();
-      arrays_to_produce.pop_back();
-      CHECK(!arrays_already_produced.count(array));
-      arrays_already_produced.insert(array);
-      const Operator* op = GetOpWithOutput(model, array);
-      if (!op) {
-        continue;
-      }
-      ops_to_dump.push_back(op);
-      for (const string& input : op->inputs) {
-        if (arrays_already_produced.count(input) ||
-            input == dump_options.graphviz_first_array) {
-          continue;
-        }
-        arrays_to_produce.push_back(input);
-      }
-    }
-  } else {
-    // Return the whole graph.
-    for (const auto& op : model.operators) {
-      ops_to_dump.push_back(op.get());
-    }
-  }
-  return ops_to_dump;
-}
-
 }  // namespace
 
 void DumpGraphviz(const Model& model, string* output_file_contents) {
@@ -348,10 +304,9 @@ void DumpGraphviz(const Model& model, string* output_file_contents) {
   constexpr char kRNNBackEdgeFormat[] =
       "\t \"%s\" -> \"%s\" [color=\"#0F9D58\"];\n";
 
-  std::vector<const Operator*> ops_to_dump = OperatorsToDump(model);
   std::set<string> already_added_arrays;
-  for (int op_index = 0; op_index < ops_to_dump.size(); op_index++) {
-    const Operator& op = *ops_to_dump[op_index];
+  for (int op_index = 0; op_index < model.operators.size(); op_index++) {
+    const Operator& op = *model.operators[op_index];
     // Add node for operator.
     auto op_properties = GetPropertiesForOperator(op);
     string operator_id = StringF("op%05d", op_index);
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 99ccfaea648077b7b72af30b32dd53b42b85d3a2..f5157149afca17383a8625c489f15a23ce6dd224 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -1492,6 +1492,37 @@ void ConvertPadOperator(const Model& model, const PadOperator& src_op,
   shape->add_dim()->set_size(2);
 }
 
+void ConvertPadV2Operator(const Model& model, const PadV2Operator& src_op,
+                          GraphDef* tensorflow_graph) {
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("PadV2");
+  new_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *new_op->add_input() = src_op.inputs[0];
+  *new_op->add_input() = src_op.inputs[1];
+  *new_op->add_input() = src_op.inputs[2];
+
+  const auto params_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(params_type);
+
+  // Create the params tensor.
+  auto* params_op = tensorflow_graph->add_node();
+  params_op->set_op("Const");
+  params_op->set_name(src_op.inputs[1]);
+  (*params_op->mutable_attr())["dtype"].set_type(DT_INT32);
+  auto* tensor = (*params_op->mutable_attr())["value"].mutable_tensor();
+  tensor->set_dtype(DT_INT32);
+
+  CHECK_EQ(src_op.left_padding.size(), src_op.right_padding.size());
+  for (int i = 0; i < src_op.left_padding.size(); ++i) {
+    tensor->add_int_val(src_op.left_padding[i]);
+    tensor->add_int_val(src_op.right_padding[i]);
+  }
+  auto* shape = tensor->mutable_tensor_shape();
+  shape->add_dim()->set_size(src_op.left_padding.size());
+  shape->add_dim()->set_size(2);
+}
+
 void CreateSliceInput(const string& input_name, const std::vector<int>& values,
                       GraphDef* tensorflow_graph) {
   auto* params_op = tensorflow_graph->add_node();
@@ -1643,6 +1674,19 @@ void ConvertTensorFlowMaximumOperator(const Model& model,
   (*sub_op->mutable_attr())["T"].set_type(data_type);
 }
 
+void ConvertSelectOperator(const Model& model, const SelectOperator& src_op,
+                           GraphDef* tensorflow_graph) {
+  auto* sub_op = tensorflow_graph->add_node();
+  sub_op->set_op("Select");
+  sub_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 3);
+  *sub_op->add_input() = src_op.inputs[0];
+  *sub_op->add_input() = src_op.inputs[1];
+  *sub_op->add_input() = src_op.inputs[2];
+  const auto data_type = GetTensorFlowDataType(model, src_op.inputs[1]);
+  (*sub_op->mutable_attr())["T"].set_type(data_type);
+}
+
 void ConvertTopKV2Operator(const Model& model, const TopKV2Operator& src_op,
                            GraphDef* tensorflow_graph) {
   auto* topk_op = tensorflow_graph->add_node();
@@ -1671,6 +1715,19 @@ void ConvertRandomUniformOperator(const Model& model,
   (*new_op->mutable_attr())["seed2"].set_i(src_op.seed2);
 }
 
+void ConvertComparisonOperator(const Model& model, const Operator& src_op,
+                               const char* op_name,
+                               GraphDef* tensorflow_graph) {
+  auto* comparison_op = tensorflow_graph->add_node();
+  comparison_op->set_op(op_name);
+  comparison_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *comparison_op->add_input() = src_op.inputs[0];
+  *comparison_op->add_input() = src_op.inputs[1];
+  const auto data_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*comparison_op->mutable_attr())["T"].set_type(data_type);
+}
+
 void ConvertOperator(const Model& model, const Operator& src_op,
                      GraphDef* tensorflow_graph) {
   if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -1795,6 +1852,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kPad) {
     ConvertPadOperator(model, static_cast<const PadOperator&>(src_op),
                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kPadV2) {
+    ConvertPadV2Operator(model, static_cast<const PadV2Operator&>(src_op),
+                         tensorflow_graph);
   } else if (src_op.type == OperatorType::kStridedSlice) {
     ConvertStridedSliceOperator(
         model, static_cast<const StridedSliceOperator&>(src_op),
@@ -1859,6 +1919,17 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertRandomUniformOperator(
         model, static_cast<const RandomUniformOperator&>(src_op),
         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowGreater) {
+    ConvertComparisonOperator(model, src_op, "Greater", tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowGreaterEqual) {
+    ConvertComparisonOperator(model, src_op, "GreaterEqual", tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowLess) {
+    ConvertComparisonOperator(model, src_op, "Less", tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowLessEqual) {
+    ConvertComparisonOperator(model, src_op, "LessEqual", tensorflow_graph);
+  } else if (src_op.type == OperatorType::kSelect) {
+    ConvertSelectOperator(model, static_cast<const SelectOperator&>(src_op),
+                          tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
diff --git a/tensorflow/contrib/lite/toco/format_port.h b/tensorflow/contrib/lite/toco/format_port.h
index eb81e90faf20133ed722185928f86ef45ac4f8f6..44e668457152376fd8b2e2fa063301468090c3f0 100644
--- a/tensorflow/contrib/lite/toco/format_port.h
+++ b/tensorflow/contrib/lite/toco/format_port.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-// This file is used to provide equivalents of internal util::format::FormatF
-// and util::format::AppendF. Unfortunately, type safety is not as good as a
+// This file is used to provide equivalents of internal absl::FormatF
+// and absl::StrAppendFormat. Unfortunately, type safety is not as good as a
 // a full C++ example.
 // TODO(aselle): When absl adds support for StrFormat, use that instead.
 #ifndef TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 72ffd51db45d0808feab3d07436c13db2420a680..8da242aa9c2ca4917a681c95c3eded894664c046 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -174,6 +174,7 @@ DECLARE_GRAPH_TRANSFORMATION(UnrollBatchMatMul)
 DECLARE_GRAPH_TRANSFORMATION(ResolveSpaceToBatchNDAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveBatchToSpaceNDAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolvePadAttributes)
+DECLARE_GRAPH_TRANSFORMATION(ResolvePadV2Attributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveStridedSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveMeanAttributes)
@@ -181,6 +182,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveTransposeAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantRandomUniform)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantRange)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantShapeOrRank)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantSlice)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantStack)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantStridedSlice)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFill)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index 437e30a91803bfc847bf246875fa2924b7c0d3fe..d63ee7c9519d169a2f44ec1afe81125217db8976 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -188,6 +188,32 @@ bool HardcodeMinMaxFromFirstInput(Model* model, Operator* op) {
   return true;
 }
 
+bool HardcodeMinMaxForSelect(Model* model, Operator* op) {
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.minmax) {
+    return false;
+  }
+  const auto& input_array_1 = model->GetArray(op->inputs[1]);
+  if (!input_array_1.minmax) {
+    return false;
+  }
+  const auto& input_array_2 = model->GetArray(op->inputs[2]);
+  if (!input_array_2.minmax) {
+    return false;
+  }
+
+  const auto& input_minmax_1 = input_array_1.GetMinMax();
+  const auto& input_minmax_2 = input_array_2.GetMinMax();
+
+  CHECK_EQ(input_minmax_1.min, input_minmax_2.min);
+  CHECK_EQ(input_minmax_1.max, input_minmax_2.max);
+  CHECK(!output_array.minmax);
+  auto& output_minmax = output_array.GetOrCreateMinMax();
+  output_minmax.min = input_minmax_1.min;
+  output_minmax.max = input_minmax_1.max;
+  return true;
+}
+
 bool HardcodeMinMaxForOutput(Model* model, Operator* op, double min,
                              double max) {
   CHECK_EQ(op->outputs.size(), 1);
@@ -345,7 +371,9 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
     case OperatorType::kMean:
       changed = HardcodeMinMaxFromFirstInput(model, op);
       break;
-
+    case OperatorType::kSelect:
+      changed = HardcodeMinMaxForSelect(model, op);
+      break;
     case OperatorType::kLogistic:
       // We hardcode quantization_params to: zero_point=0, scale=1/256.
       // This choice of minmax is the one that is equivalent to that.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index c1cf79f62614c44606daaf9294d0822c50019f92..6342cf3e8af4d85ad869a5d60a63d62ca2b00588 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -152,6 +152,17 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
       // Yield on ExpandDim until it is converted to Reshape
       return false;
     }
+    case OperatorType::kSelect: {
+      // Select produces outputs with the same type as their 2nd input
+      CHECK_EQ(op->inputs.size(), 3);
+      const ArrayDataType data_type_x =
+          model->GetArray(op->inputs[1]).data_type;
+      const ArrayDataType data_type_y =
+          model->GetArray(op->inputs[2]).data_type;
+      CHECK(data_type_x == data_type_y);
+      SetDataTypeForAllOutputs(model, op, data_type_x);
+      break;
+    }
     default: {
       // These operators produce outputs with the same type as their 1st input
       CHECK_GT(op->inputs.size(), 0);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
index 0bce183c1897dfba6f2c393ffc0306c054366725..6d51fc8c31e6c86701c3dc1fd07a9a5479114738 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
@@ -102,6 +102,7 @@ bool DoesOpBlockBackwardPropagation(const Operator& op) {
       // Gathers need their parameters changed to the appropriate data type.
     case OperatorType::kTensorFlowReshape:
     case OperatorType::kTranspose:
+    case OperatorType::kSelect:
       // Reshapes and transposes don't change values.
       return false;
     default:
@@ -113,6 +114,8 @@ bool DoesOpBlockBackwardPropagation(const Operator& op) {
 // propagation.
 bool DoesOpInputBlockBackwardPropagation(const Operator& op, int input_index) {
   switch (op.type) {
+    case OperatorType::kSelect:
+      return input_index == 0;
     case OperatorType::kGather:
       // Ignore gather indices.
       return input_index != 0;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 4923f83d91defb77655728e97eddccd9595225f6..9d1d27f3ef01a572c2ae232b1f172a8e05374381 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -499,8 +499,8 @@ void ProcessTensorFlowReshapeOperator(Model* model,
       << op->outputs[0] << "\". Are your input shapes correct?";
 }
 
-void ProcessSimpleOperator(Model* model, Operator* op) {
-  const auto& input_array = model->GetArray(op->inputs[0]);
+void ProcessSimpleOperator(Model* model, Operator* op, int input_index) {
+  const auto& input_array = model->GetArray(op->inputs[input_index]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -529,6 +529,21 @@ void ProcessSimpleBinaryOperator(Model* model, Operator* op) {
                                   &output_array);
 }
 
+void ProcessSelectOperator(Model* model, SelectOperator* op) {
+  // Yield until all input dims have been resolved.
+  for (const auto& input : op->inputs) {
+    const auto& input_array = model->GetArray(input);
+    if (!input_array.has_shape()) {
+      return;
+    }
+  }
+
+  // Select's output matches the second and third output.
+  const auto& input1_array = model->GetArray(op->inputs[1]);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  output_array.copy_shape(input1_array.shape());
+}
+
 void ProcessAddNOperator(Model* model, Operator* op) {
   // Yield until all input dims have been resolved.
   //
@@ -670,8 +685,7 @@ void ProcessConcatenationOperator(Model* model, ConcatenationOperator* op) {
   const auto& first_input_array = model->GetArray(op->inputs[0]);
   output_array.copy_shape(first_input_array.shape());
   // Negative axis means the count starts at the back of the dims().
-  int axis = op->axis;
-  if (axis < 0) axis += first_input_array.shape().dims().size();
+  if (op->axis < 0) op->axis += first_input_array.shape().dims().size();
   // Determine the concat size, and enfore that all inputs have
   // the same dimensions count.
   int concat_size = 0;
@@ -684,14 +698,14 @@ void ProcessConcatenationOperator(Model* model, ConcatenationOperator* op) {
     CHECK_EQ(input_array.shape().dimensions_count(),
              output_array.shape().dimensions_count());
     const std::vector<int>& input_dims = input_array.shape().dims();
-    CHECK_LT(axis, input_dims.size());
-    concat_size += input_dims[axis];
+    CHECK_LT(op->axis, input_dims.size());
+    concat_size += input_dims[op->axis];
   }
   // Write out the concat_size on the output array shape.
   auto& output_shape = *output_array.mutable_shape();
   auto& output_dims = *output_shape.mutable_dims();
-  CHECK_LT(axis, output_shape.dimensions_count());
-  output_dims[axis] = concat_size;
+  CHECK_LT(op->axis, output_shape.dimensions_count());
+  output_dims[op->axis] = concat_size;
 }
 
 void ProcessRangeOperator(Model* model, RangeOperator* op) {
@@ -1147,6 +1161,32 @@ void ProcessPadOperator(Model* model, PadOperator* op) {
   output_array.copy_shape(output_shape);
 }
 
+void ProcessPadV2Operator(Model* model, PadV2Operator* op) {
+  CHECK_EQ(op->inputs.size(), 3);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  const auto& input_array = model->GetArray(op->inputs[0]);
+
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) return;
+
+  if (op->left_padding.empty()) return;
+  CHECK_EQ(op->left_padding.size(), op->right_padding.size());
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) return;
+
+  Shape output_shape = input_array.shape();
+  std::vector<int>& dims = *output_shape.mutable_dims();
+  CHECK_EQ(op->left_padding.size(), dims.size());
+
+  for (int i = 0; i < op->left_padding.size(); ++i) {
+    dims[i] += op->left_padding[i] + op->right_padding[i];
+  }
+
+  output_array.copy_shape(output_shape);
+}
+
 void ProcessRankOperator(Model* model, RankOperator* op) {
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
@@ -1474,7 +1514,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kCast:
     case OperatorType::kFloor:
     case OperatorType::kExp:
-      ProcessSimpleOperator(model, op);
+    case OperatorType::kSin:
+      ProcessSimpleOperator(model, op, 0);
       break;
     case OperatorType::kGather:
       ProcessGatherOperator(model, static_cast<GatherOperator*>(op));
@@ -1545,7 +1586,9 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kMean:
       ProcessTensorFlowReductionOperator(model, op);
       break;
-
+    case OperatorType::kSelect:
+      ProcessSelectOperator(model, static_cast<SelectOperator*>(op));
+      break;
     case OperatorType::kSlice:
       ProcessSliceOperator(model, static_cast<SliceOperator*>(op));
       break;
@@ -1629,6 +1672,9 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kPad:
       ProcessPadOperator(model, static_cast<PadOperator*>(op));
       break;
+    case OperatorType::kPadV2:
+      ProcessPadV2Operator(model, static_cast<PadV2Operator*>(op));
+      break;
     case OperatorType::kStridedSlice:
       ProcessStridedSliceOperator(model,
                                   static_cast<StridedSliceOperator*>(op));
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 347302c7a50b81a73b6a921c3d5c339d8da427b6..142841fcc460e8a5e9e4f2333496f4ece2557275 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -48,13 +48,19 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kLogSoftmax ||
          type == OperatorType::kTensorFlowSplit || type == OperatorType::kSub ||
          type == OperatorType::kSqueeze || type == OperatorType::kPad ||
+         type == OperatorType::kPadV2 ||
          type == OperatorType::kTensorFlowReshape ||
          type == OperatorType::kTanh || type == OperatorType::kMul ||
          type == OperatorType::kSpaceToDepth ||
          type == OperatorType::kStridedSlice ||
          type == OperatorType::kDepthToSpace ||
          type == OperatorType::kLstmCell || type == OperatorType::kGather ||
-         type == OperatorType::kTranspose || type == OperatorType::kMean;
+         type == OperatorType::kTranspose || type == OperatorType::kMean ||
+         type == OperatorType::kTensorFlowGreater ||
+         type == OperatorType::kTensorFlowGreaterEqual ||
+         type == OperatorType::kTensorFlowLess ||
+         type == OperatorType::kTensorFlowLessEqual ||
+         type == OperatorType::kSelect;
 }
 
 const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) {
@@ -256,8 +262,7 @@ bool ChooseHardcodedQuantizationForOperatorOutput(
         IsExactlyRepresentable(0., *quantized_data_type, *quantization_params));
     return true;
   }
-  if ((op.type == OperatorType::kLogistic) ||
-      (op.type == OperatorType::kSoftmax)) {
+  if (op.type == OperatorType::kLogistic || op.type == OperatorType::kSoftmax) {
     // Logistic and Softmax have range: [0, 1].
     //
     // For Logistic, 0.5 should be exactly representable, as implementations
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
index 3e021b819fc82d66fb70596a62fd7cee4911d4e8..a950fe6442bc656b725a1f0687f4c024f4fb0f84 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
@@ -85,9 +85,11 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
         "Removing %s, keeping its non-constant input array %s and removing %s",
         LogName(*passthru_op), main_input_name, output_name);
     RerouteEdges(output_name, main_input_name, model);
-  } else if (IsDiscardableArray(*model, main_input_name)) {
+  } else if (IsDiscardableArray(*model, main_input_name) &&
+             !IsConstantParameterArray(*model, main_input_name)) {
     transformation->AddMessageF(
-        "Removing %s, keeping its output array %s and removing input %s",
+        "Removing %s, keeping its output array %s and removing non-constant "
+        "input %s",
         LogName(*passthru_op), output_name, main_input_name);
     RerouteEdges(main_input_name, output_name, model);
   } else {
@@ -95,10 +97,23 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
         "Cannot remove %s, neither its main input nor its output may be "
         "discarded",
         LogName(*passthru_op));
-    return false;
+    if (passthru_op->type != OperatorType::kTensorFlowReshape &&
+        model->GetArray(main_input_name).has_shape()) {
+      // We can't remove either array but we can remove the op. Converting it to
+      // a reshape gives us some hope of later on fixing that (either in the
+      // final runtime or as an additional fixup step).
+      //
+      // Note that we don't try to insert copies in place of reshapes as the
+      // copy itself is a trivial reshape and we'd go into an infinite loop!
+      transformation->AddMessageF("Replacing with a copy (reshape) instead");
+      InsertCopyOperator(model, main_input_name, output_name);
+    } else {
+      return false;
+    }
   }
 
   // Remove the pass-through node.
+  CHECK_EQ(passthru_it->get(), passthru_op);
   model->operators.erase(passthru_it);
 
   // Remove any array that is no longer used.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
index 2b3ee36ad10e24ab7367ca44c03a234688a63a9b..8f2c1f81628398d8c823d27ff50d59e80497d0e1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
@@ -134,9 +134,9 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   }
 
   // Remove the old param arrays
-  model->EraseArray(bn_op->inputs[1]);
-  model->EraseArray(bn_op->inputs[2]);
-  model->EraseArray(bn_op->inputs[3]);
+  DeleteArrayIfUsedOnce(bn_op->inputs[1], model);
+  DeleteArrayIfUsedOnce(bn_op->inputs[2], model);
+  DeleteArrayIfUsedOnce(bn_op->inputs[3], model);
 
   // Remove the old operator
   DCHECK_EQ(bn_it->get(), bn_op);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b35c3e19c43b1c62e6bdbfe379631480e1d41703
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc
@@ -0,0 +1,165 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+template <ArrayDataType Type>
+bool Slice(SliceOperator const& op, Array const& input_array,
+           Array* output_array) {
+  // Implementation is taken from the tflite kernel.
+
+  CHECK(input_array.data_type == Type);
+  CHECK(output_array->data_type == Type);
+  const auto& input_data = input_array.GetBuffer<Type>().data;
+
+  // Create a buffer for the output array.
+  std::vector<DataType<Type>>& output_data =
+      output_array->GetMutableBuffer<Type>().data;
+  output_data.resize(RequiredBufferSizeForShape(output_array->shape()));
+
+  std::vector<int> size = op.size;
+  if (size.size() != op.begin.size()) {
+    // Broadcast the end positions.
+    CHECK_EQ(op.size.size(), 1);
+    int broadcast_size = size[0];
+    while (size.size() < op.begin.size()) size.push_back(broadcast_size);
+  }
+
+  // Calculate begin and end indices along each dimension.
+  CHECK_LE(op.begin.size(), 4);
+  CHECK_LE(size.size(), 4);
+  std::vector<int> begin = op.begin;
+  std::vector<int> end;
+  for (int i = 0; i < begin.size(); ++i) {
+    int dim_size = size[i];
+    if (dim_size == -1) {
+      // -1 means the rest of the dimension.
+      dim_size = input_array.shape().dims()[i] - begin[i];
+    }
+    CHECK_GE(dim_size, 1);
+    end.push_back(begin[i] + dim_size - 1);
+  }
+
+  // Pad out so that we always have 4 dims, makes this loop easier.
+  while (begin.size() < 4) begin.insert(begin.begin(), 0);
+  while (end.size() < 4) end.insert(end.begin(), 0);
+  Shape padded_shape = input_array.shape();
+  while (padded_shape.dimensions_count() < 4) {
+    padded_shape.mutable_dims()->insert(padded_shape.mutable_dims()->begin(),
+                                        1);
+  }
+
+  auto* out_ptr = output_data.data();
+  for (int in_b = begin[0]; in_b <= end[0]; ++in_b) {
+    for (int in_h = begin[1]; in_h <= end[1]; ++in_h) {
+      for (int in_w = begin[2]; in_w <= end[2]; ++in_w) {
+        for (int in_d = begin[3]; in_d <= end[3]; ++in_d) {
+          *out_ptr++ =
+              input_data[Offset(padded_shape, {in_b, in_h, in_w, in_d})];
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+}  // namespace
+
+bool ResolveConstantSlice::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kSlice) {
+    return false;
+  }
+
+  const SliceOperator* op = static_cast<const SliceOperator*>(base_op);
+
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes.
+    return false;
+  }
+
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes.
+    return false;
+  }
+
+  if (op->begin.empty() || op->size.empty()) {
+    // Attributes have not resolved yet.
+    return false;
+  }
+
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // Yield until the value shape has been resolved.
+    return false;
+  }
+  if (!IsConstantParameterArray(*model, op->inputs[0])) {
+    // Yield until the value is constant.
+    return false;
+  }
+
+  CHECK(!output_array.buffer);
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      if (!Slice<ArrayDataType::kFloat>(*op, input_array, &output_array)) {
+        return false;
+      }
+      break;
+    case ArrayDataType::kUint8:
+      if (!Slice<ArrayDataType::kUint8>(*op, input_array, &output_array)) {
+        return false;
+      }
+      break;
+    case ArrayDataType::kInt32:
+      if (!Slice<ArrayDataType::kInt32>(*op, input_array, &output_array)) {
+        return false;
+      }
+      break;
+    case ArrayDataType::kInt64:
+      if (!Slice<ArrayDataType::kInt64>(*op, input_array, &output_array)) {
+        return false;
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type input to Slice op with output \""
+                 << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input array if no longer used.
+  if (IsDiscardableArray(*model, op->inputs[0]) &&
+      CountOpsWithInput(*model, op->inputs[0]) == 1) {
+    model->EraseArray(op->inputs[0]);
+  }
+
+  // Erase the operator
+  model->operators.erase(it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_padv2_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_padv2_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ebb023e34223a57a2ad5708662d9c443949fcd0a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_padv2_attributes.cc
@@ -0,0 +1,55 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolvePadV2Attributes::Run(Model* model, std::size_t op_index) {
+  const auto pad_it = model->operators.begin() + op_index;
+  auto* pad_op = pad_it->get();
+  if (pad_op->type != OperatorType::kPadV2) return false;
+
+  auto* op = static_cast<PadV2Operator*>(pad_op);
+  if (!op->left_padding.empty()) return false;
+
+  CHECK_EQ(op->inputs.size(), 3);
+  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
+
+  const auto& array = model->GetArray(op->inputs[1]);
+  if (!array.has_shape()) return false;
+
+  const std::vector<int>& dims = array.shape().dims();
+  CHECK_EQ(dims.size(), 2);
+
+  std::vector<int> buffer = array.GetBuffer<ArrayDataType::kInt32>().data;
+
+  for (int i = 0; i < dims[0]; ++i) {
+    op->left_padding.push_back(buffer[i * 2]);
+    op->right_padding.push_back(buffer[i * 2 + 1]);
+  }
+
+  // TODO(dkalenichenko): Delete the extra input?
+
+  return true;
+}
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
index 021e9918f2cf22d3854491762c31061832359a46..65132d7d1ef0626e0ad41a88b8e7999a1c1cf684 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
@@ -19,6 +19,24 @@ limitations under the License.
 
 namespace toco {
 
+int PadAttributeArray(Array* attribute_array, std::vector<int> pad_values,
+                      int mask) {
+  int attribute_dim_count = attribute_array->shape().dims(0);
+  int dim_count = pad_values.size();
+  if (attribute_dim_count < dim_count) {
+    Shape strided_slice_shape = Shape({dim_count});
+    attribute_array->copy_shape(strided_slice_shape);
+    Buffer<ArrayDataType::kInt32>* buffer =
+        &(attribute_array->GetMutableBuffer<ArrayDataType::kInt32>());
+    buffer->data.resize(RequiredBufferSizeForShape(strided_slice_shape));
+    for (int i = attribute_dim_count; i < dim_count; i++) {
+      buffer->data[i] = pad_values[i];
+      mask |= 1 << i;
+    }
+  }
+  return mask;
+}
+
 bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
   const auto slice_it = model->operators.begin() + op_index;
   auto* slice_op = slice_it->get();
@@ -37,52 +55,63 @@ bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
-  const auto& start_array = model->GetArray(op->inputs[1]);
+  auto& start_array = model->GetArray(op->inputs[1]);
   if (!start_array.has_shape()) return false;
   if (toco::RequiredBufferSizeForShape(start_array.shape()) > 4) {
     // Only 1-4D arrays are supported for now.
     return false;
   }
 
-  const auto& stop_array = model->GetArray(op->inputs[2]);
+  auto& stop_array = model->GetArray(op->inputs[2]);
   if (!stop_array.has_shape()) return false;
 
-  const auto& stride_array = model->GetArray(op->inputs[3]);
+  auto& stride_array = model->GetArray(op->inputs[3]);
   if (!stride_array.has_shape()) return false;
 
   if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
   if (!IsConstantParameterArray(*model, op->inputs[2])) return false;
   if (!IsConstantParameterArray(*model, op->inputs[3])) return false;
 
-  op->start_indices = start_array.GetBuffer<ArrayDataType::kInt32>().data;
-  op->stop_indices = stop_array.GetBuffer<ArrayDataType::kInt32>().data;
-  op->strides = stride_array.GetBuffer<ArrayDataType::kInt32>().data;
+  int num_input_axes = input_array.shape().dimensions_count();
+  int start_indices_size = start_array.shape().dims(0);
+  int stop_indices_size = stop_array.shape().dims(0);
+  int stride_indices_size = stride_array.shape().dims(0);
 
-  CHECK_GE(op->start_indices.size(), 1);
-  CHECK_LE(op->start_indices.size(), 4);
-  CHECK_EQ(op->stop_indices.size(), op->start_indices.size());
-  CHECK_EQ(op->strides.size(), op->stop_indices.size());
+  CHECK_GE(start_indices_size, 1);
+  CHECK_LE(start_indices_size, 4);
+  CHECK_LE(stop_indices_size, 4);
+  CHECK_LE(stride_indices_size, 4);
 
   // The TensorFlow documentation is not explicit on how it handles fewer
   // supplied indices than dimensions, but they are accepted. We emulate TF's
   // behavior by fully iterating over each omitted dimension.
-  int num_input_axes = input_array.shape().dimensions_count();
-  CHECK_LE(op->start_indices.size(), num_input_axes)
+  CHECK_LE(start_indices_size, num_input_axes)
       << "StridedSlice op requires no more than " << num_input_axes
       << " start indices";
-  CHECK_LE(op->stop_indices.size(), num_input_axes)
+  CHECK_LE(stop_indices_size, num_input_axes)
       << "StridedSlice op requires no more than " << num_input_axes
       << " stop indices";
-  CHECK_LE(op->strides.size(), num_input_axes)
+  CHECK_LE(stride_indices_size, num_input_axes)
       << "StridedSlice op requires no more than " << num_input_axes
       << " strides";
-  op->PadIndices(num_input_axes);
 
   // Ideally, we would remove the input arrays after they have been resolved.
   // However, we must then reconstitute these input arrays for all supported
   // export formats. For now, leave the arrays so we don't have to modify our
   // exporters. Ideally, we wouldn't have op attributes, and would work directly
   // with the input arrays.
+  std::vector<int> begin_pad_values(num_input_axes, 0);
+  op->begin_mask =
+      PadAttributeArray(&start_array, begin_pad_values, op->begin_mask);
+  op->end_mask =
+      PadAttributeArray(&stop_array, input_array.shape().dims(), op->end_mask);
+  std::vector<int> stride_pad_values(num_input_axes, 1);
+  PadAttributeArray(&stride_array, stride_pad_values, 0);
+
+  op->start_indices = start_array.GetBuffer<ArrayDataType::kInt32>().data;
+  op->stop_indices = stop_array.GetBuffer<ArrayDataType::kInt32>().data;
+  op->strides = stride_array.GetBuffer<ArrayDataType::kInt32>().data;
+
   return true;
 }
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 8efe6ab7b9c608debdbe9b9d77d0e635763216c8..ea051bb84ac1b70612397b7a929cf9c5d82c59de 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -189,6 +189,7 @@ Status ImportFloatArray(const TensorProto& input_tensor, Array* output_array) {
       output_array->GetMutableBuffer<ArrayDataType::kFloat>().data;
   output_float_data.resize(RequiredBufferSizeForShape(output_array->shape()),
                            0.f);
+  CHECK_GE(output_float_data.size(), input_flat_size);
   if (input_tensor.float_val_size() == 1) {
     for (int i = 0; i < input_flat_size; i++) {
       output_float_data[i] = input_tensor.float_val(0);
@@ -202,9 +203,13 @@ Status ImportFloatArray(const TensorProto& input_tensor, Array* output_array) {
     toco::port::CopyToBuffer(input_tensor.tensor_content(),
                              reinterpret_cast<char*>(output_float_data.data()));
   } else {
-    return Status(false,
-                  "Neither input_content nor float_val have the right "
-                  "dimensions for this float tensor");
+    return Status(
+        false,
+        absl::StrCat("Neither input_content (",
+                     input_tensor.tensor_content().size() / sizeof(float),
+                     ") nor float_val (", input_tensor.float_val_size(),
+                     ") have the right dimensions (", input_flat_size,
+                     ") for this float tensor"));
   }
   return Status::OK();
 }
@@ -221,6 +226,7 @@ Status ImportQuint8Array(const TensorProto& input_tensor, Array* output_array) {
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kUint8>().data;
   output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
+  CHECK_GE(output_int_data.size(), input_flat_size);
   if (input_tensor.int_val_size()) {
     for (int i = 0; i < input_tensor.int_val_size(); i++) {
       output_int_data[i] = input_tensor.int_val(i);
@@ -230,9 +236,13 @@ Status ImportQuint8Array(const TensorProto& input_tensor, Array* output_array) {
     toco::port::CopyToBuffer(input_tensor.tensor_content(),
                              reinterpret_cast<char*>(output_int_data.data()));
   } else {
-    return Status(false,
-                  "Neither input_content nor int_val have the right dimensions "
-                  "for this uint8 tensor");
+    return Status(
+        false,
+        absl::StrCat("Neither input_content (",
+                     input_tensor.tensor_content().size() / sizeof(uint8_t),
+                     ") nor int_val (", input_tensor.int_val_size(),
+                     ") have the right dimensions (", input_flat_size,
+                     ") for this uint8 tensor"));
   }
   return Status::OK();
 }
@@ -249,6 +259,7 @@ Status ImportInt32Array(const TensorProto& input_tensor, Array* output_array) {
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt32>().data;
   output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
+  CHECK_GE(output_int_data.size(), input_flat_size);
   if (input_tensor.int_val_size()) {
     for (int i = 0; i < input_tensor.int_val_size(); i++) {
       output_int_data[i] = input_tensor.int_val(i);
@@ -258,9 +269,13 @@ Status ImportInt32Array(const TensorProto& input_tensor, Array* output_array) {
     toco::port::CopyToBuffer(input_tensor.tensor_content(),
                              reinterpret_cast<char*>(output_int_data.data()));
   } else {
-    return Status(false,
-                  "Neither input_content nor int_val have the right dimensions "
-                  "for this int32 tensor");
+    return Status(
+        false,
+        absl::StrCat("Neither input_content (",
+                     input_tensor.tensor_content().size() / sizeof(int32),
+                     ") nor int_val (", input_tensor.int_val_size(),
+                     ") have the right dimensions (", input_flat_size,
+                     ") for this int32 tensor"));
   }
   return Status::OK();
 }
@@ -277,6 +292,7 @@ Status ImportInt64Array(const TensorProto& input_tensor, Array* output_array) {
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt64>().data;
   output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
+  CHECK_GE(output_int_data.size(), input_flat_size);
   if (input_tensor.int64_val_size()) {
     for (int i = 0; i < input_tensor.int64_val_size(); i++) {
       output_int_data[i] = input_tensor.int64_val(i);
@@ -286,9 +302,13 @@ Status ImportInt64Array(const TensorProto& input_tensor, Array* output_array) {
     toco::port::CopyToBuffer(input_tensor.tensor_content(),
                              reinterpret_cast<char*>(output_int_data.data()));
   } else {
-    return Status(false,
-                  "Neither input_content nor int64_val have the right "
-                  "dimensions for this int64 tensor");
+    return Status(
+        false,
+        absl::StrCat("Neither input_content (",
+                     input_tensor.tensor_content().size() / sizeof(int64),
+                     ") nor int64_val (", input_tensor.int64_val_size(),
+                     ") have the right dimensions (", input_flat_size,
+                     ") for this int64 tensor"));
   }
   return Status::OK();
 }
@@ -306,6 +326,7 @@ Status ImportBoolArray(const TensorProto& input_tensor, Array* output_array) {
       output_array->GetMutableBuffer<ArrayDataType::kBool>().data;
   output_bool_data.resize(RequiredBufferSizeForShape(output_array->shape()),
                           false);
+  CHECK_GE(output_bool_data.size(), input_flat_size);
   if (input_tensor.bool_val_size()) {
     for (int i = 0; i < input_tensor.bool_val_size(); i++) {
       output_bool_data[i] = input_tensor.bool_val(i);
@@ -322,9 +343,12 @@ Status ImportBoolArray(const TensorProto& input_tensor, Array* output_array) {
     // So far only encountered that in an array with 1 entry, let's
     // require that until we encounter a graph where that's not the case.
     if (output_bool_data.size() != 1) {
-      return Status(false,
-                    "Neither input_content nor bool_val have the right "
-                    "dimensions for this bool tensor");
+      return Status(
+          false, absl::StrCat("Neither input_content (",
+                              input_tensor.tensor_content().size(),
+                              ") nor bool_val (", input_tensor.bool_val_size(),
+                              ") have the right dimensions (", input_flat_size,
+                              ") for this bool tensor"));
     }
     output_bool_data[0] = false;
   }
@@ -340,13 +364,16 @@ Status ImportStringArray(const TensorProto& input_tensor, Array* output_array) {
                             output_array->mutable_shape());
   if (!status.ok()) return status;
 
+  if (input_flat_size != input_tensor.string_val_size()) {
+    return Status(false,
+                  "Input_content string_val doesn't have the right dimensions "
+                  "for this string tensor");
+  }
+
   auto& output_string_data =
       output_array->GetMutableBuffer<ArrayDataType::kString>().data;
   output_string_data.resize(RequiredBufferSizeForShape(output_array->shape()));
-  if (input_flat_size != input_tensor.string_val_size()) {
-    LOG(FATAL) << "Input_content string_val doesn't have the right "
-                  "dimensions for this string tensor.";
-  }
+  CHECK_GE(output_string_data.size(), input_flat_size);
   for (int i = 0; i < input_flat_size; ++i) {
     output_string_data[i] = input_tensor.string_val(i);
   }
@@ -925,6 +952,19 @@ void ConvertPadOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertPadV2Operator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "PadV2");
+  CheckInputsCount(node, tf_import_flags, 3);
+  auto* op = new PadV2Operator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(2));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
 void ConvertShapeOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
@@ -1227,6 +1267,19 @@ void ConvertLessEqualOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertSinOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Sin");
+  auto* op = new SinOperator;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
 void ConvertGreaterOperator(const NodeDef& node,
                             const TensorFlowImportFlags& tf_import_flags,
                             Model* model) {
@@ -1328,7 +1381,23 @@ void ConvertUnsupportedOperator(const NodeDef& node,
     for (int i = 0; i < output_types.type_size(); ++i) {
       op->output_data_types.push_back(ConvertDataType(output_types.type(i)));
     }
+  } else if (HasAttr(node, "Tout")) {
+    const auto& output_type = GetDataTypeAttr(node, "Tout");
+    op->output_data_types.push_back(ConvertDataType(output_type));
+  }
+}
+
+void ConvertSelectOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
+  CheckInputsCount(node, tf_import_flags, 3);
+
+  auto* op = new SelectOperator;
+  for (const auto& input : node.input()) {
+    op->inputs.push_back(input);
   }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
 }
 
 void ConvertStridedSliceOperator(const NodeDef& node,
@@ -2169,6 +2238,8 @@ Status ImportTensorFlowNode(const tensorflow::NodeDef& node,
     ConvertMergeOperator(node, tf_import_flags, model);
   } else if (node.op() == "Pad") {
     ConvertPadOperator(node, tf_import_flags, model);
+  } else if (node.op() == "PadV2") {
+    ConvertPadV2Operator(node, tf_import_flags, model);
   } else if (node.op() == "StridedSlice") {
     ConvertStridedSliceOperator(node, tf_import_flags, model);
   } else if (node.op() == "Shape") {
@@ -2239,6 +2310,10 @@ Status ImportTensorFlowNode(const tensorflow::NodeDef& node,
     ConvertDynamicStitchOperator(node, tf_import_flags, model);
   } else if (node.op() == "RandomUniform") {
     ConvertRandomUniform(node, tf_import_flags, model);
+  } else if (node.op() == "Sin") {
+    ConvertSinOperator(node, tf_import_flags, model);
+  } else if (node.op() == "Select") {
+    ConvertSelectOperator(node, tf_import_flags, model);
   } else {
     ConvertUnsupportedOperator(node, tf_import_flags, model);
   }
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow_test.cc b/tensorflow/contrib/lite/toco/import_tensorflow_test.cc
index 5dc78f73ad2e2ab6f1fcb1ee430513488ce47027..835676662b9cb7ed20e578e2a35747a64ba443dc 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow_test.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow_test.cc
@@ -148,10 +148,11 @@ TEST_P(ShapeImportTest, ValidShapeButZeroElements) {
   NodeDef node;
   BuildConstNode({1, 2, 2, 2}, GetParam(), 0, &node);
   auto status = ImportNode(node);
-  EXPECT_THAT(status.error_message(),
-              ::testing::MatchesRegex(
-                  "Neither input_content nor .*_val have the right dimensions "
-                  "for this .* tensor .while processing node 'Node1'."));
+  EXPECT_THAT(
+      status.error_message(),
+      ::testing::MatchesRegex(
+          "Neither input_content .0. nor .*_val .0. have the right "
+          "dimensions .8. for this .* tensor .while processing node 'Node1'."));
 }
 INSTANTIATE_TEST_CASE_P(ValidShapeButZeroElements, ShapeImportTest,
                         ::testing::ValuesIn(TestTypes()));
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 482cc71d8b34705041130c7518b58f7fe183bc3c..d878ac54e4d819efc1b0951acbbab23b3387eac5 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -78,10 +78,12 @@ enum class OperatorType {
   kFloor,
   kGather,
   kResizeBilinear,
+  kSin,
   kSpaceToBatchND,
   kStack,
   kBatchToSpaceND,
   kPad,
+  kPadV2,
   kStridedSlice,
   kSlice,
   kSqueeze,
@@ -132,6 +134,7 @@ enum class OperatorType {
   // instead of being given as plain constant arrays. So we need to insert
   // special nodes in the graph to shuffle axes.
   kReorderAxes,
+  kSelect,
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -616,6 +619,17 @@ struct TanhOperator : Operator {
   TanhOperator() : Operator(OperatorType::kTanh) {}
 };
 
+// Element-wise Sin operator:
+//   x -> Sin(x) = sin(x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Sin
+struct SinOperator : Operator {
+  SinOperator() : Operator(OperatorType::kSin) {}
+};
+
 // Element-wise addition operator.
 //
 // Inputs:
@@ -825,6 +839,29 @@ struct PadOperator : Operator {
   std::vector<int> right_padding;
 };
 
+// PaddingV2 operator. Pads a tensor with the given constant value.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the padding array
+//   inputs[2]: required: the scalar constant_values
+//
+// This operation pads input according to the paddings and constant_values you
+// specify. paddings is an integer tensor with shape [Dn, 2], where n is the
+// rank of input. For each dimension D of input, paddings[D, 0] indicates how
+// many padding values to add before the contents of input in that dimension,
+// and paddings[D, 1] indicates how many padding values to add after the
+// contents of input in that dimension. constant_values is a scalar tensor of
+// the same type as input that indicates the value to use for padding input.
+//
+// TensorFlow equivalent: PadV2
+struct PadV2Operator : Operator {
+  PadV2Operator() : Operator(OperatorType::kPadV2) {}
+
+  std::vector<int> left_padding;
+  std::vector<int> right_padding;
+};
+
 // Strided slice operator.
 //
 // Inputs:
@@ -1063,6 +1100,18 @@ struct NegOperator : Operator {
   NegOperator() : Operator(OperatorType::kNeg) {}
 };
 
+// Element-wise select operator choosing elements from inputs[1] or input[2]
+//
+// Inputs:
+//  inputs[0]: required: boolean mask per index
+//  inputs[1]: required: tensor of values if true
+//  inputs[2]: required: tensor of values if false
+//
+//  TensorFlow equivalent: Select
+struct SelectOperator : Operator {
+  SelectOperator() : Operator(OperatorType::kSelect) {}
+};
+
 // Element-wise reciprocal-square-root (x^-0.5) operator.
 //
 // Inputs:
@@ -1780,6 +1829,8 @@ class Model {
   }
   const ArrayMap& GetArrayMap() const { return arrays; }
 
+  int64 ArithmeticOpsCount() const { return ops_count; }
+
   // Optional arrays are used for optional tensors,
   // these tensors do not have data, but with reserved names as op inputs.
   std::set<string> optional_arrays;
@@ -1796,6 +1847,8 @@ class Model {
   std::size_t transient_data_size = 0;
   // For code-generation only: required alignment of the transient_data buffer
   std::size_t transient_data_alignment = 0;
+  // Arithmatic operations performed in the model.
+  int64 ops_count = 0;
 
  private:
   // The associative array mapping names to Array's.
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 7bbeab7c9d1e42d28f221f1a1134d9d05fe6ab51..f875c85d1a7447432ea0f3a7d68b028e52cb78d4 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -124,14 +124,6 @@ bool ParseModelFlagsFromCommandLineFlags(
            parsed_flags.model_checks.default_value(),
            "A list of model checks to be applied to verify the form of the "
            "model.  Applied after the graph transformations after import."),
-      Flag("graphviz_first_array", parsed_flags.graphviz_first_array.bind(),
-           parsed_flags.graphviz_first_array.default_value(),
-           "If set, defines the start of the sub-graph to be dumped to "
-           "GraphViz."),
-      Flag(
-          "graphviz_last_array", parsed_flags.graphviz_last_array.bind(),
-          parsed_flags.graphviz_last_array.default_value(),
-          "If set, defines the end of the sub-graph to be dumped to GraphViz."),
       Flag("dump_graphviz", parsed_flags.dump_graphviz.bind(),
            parsed_flags.dump_graphviz.default_value(),
            "Dump graphviz during LogDump call. If string is non-empty then "
@@ -180,8 +172,6 @@ bool ParseModelFlagsFromCommandLineFlags(
     if (!tensorflow::Flags::Parse(argc, argv, flags)) return false;
   }
   auto& dump_options = *GraphVizDumpOptions::singleton();
-  dump_options.graphviz_first_array = parsed_flags.graphviz_first_array.value();
-  dump_options.graphviz_last_array = parsed_flags.graphviz_last_array.value();
   dump_options.dump_graphviz_video = parsed_flags.dump_graphviz_video.value();
   dump_options.dump_graphviz = parsed_flags.dump_graphviz.value();
 
diff --git a/tensorflow/contrib/lite/toco/python/toco.i b/tensorflow/contrib/lite/toco/python/toco.i
index 3787cba4a371f1893d877daadcfe31e59eb5b3f6..0d2fbdd67b3aa59af9d5f32c4f1693fe044a7efa 100644
--- a/tensorflow/contrib/lite/toco/python/toco.i
+++ b/tensorflow/contrib/lite/toco/python/toco.i
@@ -24,9 +24,12 @@ namespace toco {
 // Convert a model represented in `input_contents`. `model_flags_proto`
 // describes model parameters. `toco_flags_proto` describes conversion
 // parameters (see relevant .protos for more information). Returns a string
-// representing the contents of the converted model.
+// representing the contents of the converted model. When extended_return
+// flag is set to true returns a dictionary that contains string representation
+// of the converted model and some statitics like arithmetic ops count.
 PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
                         PyObject* toco_flags_proto_txt_raw,
-                        PyObject* input_contents_txt_raw);
+                        PyObject* input_contents_txt_raw,
+                        bool extended_return = false);
 
 } // namespace toco
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.cc b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
index 153c117d17e4564d7cb0aaea64d792f63a587d91..5b1db852b4f8e89c1a591cfe18a0ab0aa2db04c9 100644
--- a/tensorflow/contrib/lite/toco/python/toco_python_api.cc
+++ b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
@@ -37,7 +37,7 @@ namespace toco {
 // sure we input and output bytes rather than unicode strings for Python3.
 PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
                       PyObject* toco_flags_proto_txt_raw,
-                      PyObject* input_contents_txt_raw) {
+                      PyObject* input_contents_txt_raw, bool extended_return) {
   // Use Python C API to validate and convert arguments. In py3 (bytes),
   // in py2 (str).
   auto ConvertArg = [&](PyObject* obj, bool* error) {
@@ -78,6 +78,16 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
   Export(toco_flags, *model, toco_flags.allow_custom_ops(),
          &output_file_contents_txt);
 
+  if (extended_return) {
+    PyObject* dict = PyDict_New();
+    PyDict_SetItemString(
+        dict, "flatbuffer",
+        TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
+                                  output_file_contents_txt.size()));
+    PyDict_SetItemString(dict, "arithmetic_ops",
+                         PyLong_FromLong(model->ArithmeticOpsCount()));
+    return dict;
+  }
   // Convert arguments back to byte (py3) or str (py2)
   return TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
                                    output_file_contents_txt.size());
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.h b/tensorflow/contrib/lite/toco/python/toco_python_api.h
index dc378353f79945f4fbb72305899b2b604be785ad..9af38e937c29804f950ea53ee86b70e3ccb02360 100644
--- a/tensorflow/contrib/lite/toco/python/toco_python_api.h
+++ b/tensorflow/contrib/lite/toco/python/toco_python_api.h
@@ -23,10 +23,13 @@ namespace toco {
 // Convert a model represented in `input_contents`. `model_flags_proto`
 // describes model parameters. `toco_flags_proto` describes conversion
 // parameters (see relevant .protos for more information). Returns a string
-// representing the contents of the converted model.
+// representing the contents of the converted model. When extended_return
+// flag is set to true returns a dictionary that contains string representation
+// of the converted model and some statitics like arithmetic ops count.
 PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
                       PyObject* toco_flags_proto_txt_raw,
-                      PyObject* input_contents_txt_raw);
+                      PyObject* input_contents_txt_raw,
+                      bool extended_return = false);
 
 }  // namespace toco
 
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index 335b496dccdbdb7e342515868e1d7195c98f0351..5daa703c80b3b5d9152c5d21976260f21679a3f2 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -45,14 +45,20 @@ using ::tflite::Tensor;
 
 namespace {
 
-details::OperatorKey GetOperatorKey(const ::toco::Operator& op) {
+details::OperatorKey GetOperatorKey(
+    const ::toco::Operator& op,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type) {
   string custom_code;
   if (op.type == OperatorType::kTensorFlowUnsupported) {
     const TensorFlowUnsupportedOperator& unsupported_op =
         static_cast<const TensorFlowUnsupportedOperator&>(op);
     custom_code = unsupported_op.tensorflow_op;
   }
-  return details::OperatorKey(op.type, custom_code);
+  int version = 1;
+  if (ops_by_type.count(op.type) != 0) {
+    version = ops_by_type.at(op.type)->GetVersion(op);
+  }
+  return details::OperatorKey(op.type, custom_code, version);
 }
 
 }  // Anonymous namespace.
@@ -74,11 +80,13 @@ void LoadTensorsMap(const Model& model, TensorsMap* tensors_map) {
   }
 }
 
-void LoadOperatorsMap(const Model& model, OperatorsMap* operators_map) {
+void LoadOperatorsMap(
+    const Model& model, OperatorsMap* operators_map,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type) {
   // First find a list of unique operator types.
   std::set<OperatorKey> keys;
   for (const auto& op : model.operators) {
-    keys.insert(GetOperatorKey(*op));
+    keys.insert(GetOperatorKey(*op, ops_by_type));
   }
   // Now assign indices to them and fill in the map.
   int index = 0;
@@ -185,8 +193,9 @@ Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
   std::map<int, Offset<OperatorCode>> ordered_opcodes;
 
   for (const auto& op : model.operators) {
-    const details::OperatorKey operator_key = GetOperatorKey(*op);
+    const details::OperatorKey operator_key = GetOperatorKey(*op, ops_by_type);
     int op_index = operators_map.at(operator_key);
+    int op_version = operator_key.version;
 
     string name = HelpfulOperatorTypeName(*op);
     bool is_builtin = false;
@@ -197,7 +206,7 @@ Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
 
     if (is_builtin) {
       ordered_opcodes[op_index] =
-          CreateOperatorCode(*builder, builtin_ops[name], 0);
+          CreateOperatorCode(*builder, builtin_ops[name], 0, op_version);
     } else {
       // This could be a kTensorFlowUnsupported, in which case we should be
       // able to retrieve the original Tensorflow name from the OperatorKey, or
@@ -211,8 +220,9 @@ Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
       if (error_summary) {
         error_summary->insert(name);
       }
-      ordered_opcodes[op_index] = CreateOperatorCode(
-          *builder, BuiltinOperator_CUSTOM, builder->CreateString(name));
+      ordered_opcodes[op_index] =
+          CreateOperatorCode(*builder, BuiltinOperator_CUSTOM,
+                             builder->CreateString(name), op_version);
     }
   }
 
@@ -244,7 +254,7 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
       outputs.push_back(tensors_map.at(output));
     }
 
-    int op_index = operators_map.at(GetOperatorKey(*op));
+    int op_index = operators_map.at(GetOperatorKey(*op, ops_by_type));
 
     // This is a custom op unless we can find it in ops_by_type, and even then
     // it could be a custom op (such as kTensorFlowUnsupported).
@@ -279,15 +289,20 @@ Offset<Vector<Offset<Buffer>>> ExportBuffers(
 
 void Export(const Model& model, bool allow_custom_ops,
             string* output_file_contents) {
-  flatbuffers::FlatBufferBuilder builder(/*initial_size=*/10240);
-
   const auto ops_by_type = BuildOperatorByTypeMap();
+  Export(model, allow_custom_ops, output_file_contents, ops_by_type);
+}
+
+void Export(
+    const Model& model, bool allow_custom_ops, string* output_file_contents,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type) {
+  flatbuffers::FlatBufferBuilder builder(/*initial_size=*/10240);
 
   details::TensorsMap tensors_map;
   details::LoadTensorsMap(model, &tensors_map);
 
   details::OperatorsMap operators_map;
-  details::LoadOperatorsMap(model, &operators_map);
+  details::LoadOperatorsMap(model, &operators_map, ops_by_type);
 
   std::vector<const Array*> buffers_to_write;
   Array empty_array;
@@ -312,12 +327,14 @@ void Export(const Model& model, bool allow_custom_ops,
     error_summary.erase(fake_quant_operation_name);
   }
   if (!allow_custom_ops && !error_summary.empty()) {
-    LOG(QFATAL) << "Some of the operators in the model are not supported by "
-                   "the standard TensorFlow Lite runtime. If you have a custom "
-                   "implementation for them you can disable this error with "
-                   "--allow_custom_ops. Here is a list of operators for which "
-                   "you will need custom implementations: "
-                << absl::StrJoin(error_summary, ", ") << ".";
+    LOG(QFATAL)
+        << "Some of the operators in the model are not supported by "
+           "the standard TensorFlow Lite runtime. If you have a custom "
+           "implementation for them you can disable this error with "
+           "--allow_custom_ops, or by setting allow_custom_ops=True "
+           "when calling tf.contrib.lite.toco_convert(). Here is a list "
+           "of operators for which  you will need custom implementations: "
+        << absl::StrJoin(error_summary, ", ") << ".";
   }
 
   auto ops =
diff --git a/tensorflow/contrib/lite/toco/tflite/export.h b/tensorflow/contrib/lite/toco/tflite/export.h
index 8c79cb820015e16847ce48c171e8f6e41f60c319..90abfb94d8d091525cc6ce7b12e2e29c7e648160 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.h
+++ b/tensorflow/contrib/lite/toco/tflite/export.h
@@ -16,6 +16,7 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
 
 #include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tflite/operator.h"
 
 namespace toco {
 
@@ -25,11 +26,18 @@ namespace tflite {
 // result in the given string.
 void Export(const Model& model, bool allow_custom_ops,
             string* output_file_contents);
+
 // This if backward-compatibility.
+// TODO(ycling): Remove the deprecated entry functions.
 inline void Export(const Model& model, string* output_file_contents) {
   Export(model, true, output_file_contents);
 }
 
+// Export API with custom TFLite operator mapping.
+void Export(
+    const Model& model, bool allow_custom_ops, string* output_file_contents,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type);
+
 namespace details {
 
 // A maps from tensor name to its final position in the TF Lite buffer.
@@ -39,25 +47,47 @@ using TensorsMap = std::unordered_map<string, int>;
 // Only when `type` is `kTensorFlowUnsupported`, `custom_code` is filled to
 // identify which operation is used.
 struct OperatorKey {
-  OperatorKey(OperatorType type, const std::string& custom_code)
-      : type(type), custom_code(custom_code) {}
+  OperatorKey(OperatorType type, const std::string& custom_code, int version)
+      : type(type), custom_code(custom_code), version(version) {}
   const OperatorType type;
   const std::string custom_code;
+  const int version;
 
   bool operator<(const OperatorKey& other) const {
     if (type < other.type) return true;
-    if (type > other.type) return false;
-    return custom_code < other.custom_code;
+    else if (type > other.type)
+      return false;
+    else if (custom_code < other.custom_code)
+      return true;
+    else if (custom_code > other.custom_code)
+      return false;
+    else
+      return version < other.version;
   }
 
   bool operator==(const OperatorKey& other) const {
-    return type == other.type && custom_code == other.custom_code;
+    return type == other.type && custom_code == other.custom_code &&
+           version == other.version;
   }
 
   struct Hash {
-    std::size_t operator()(const OperatorKey& key) const {
-      return std::hash<size_t>()(static_cast<size_t>(key.type)) ^
-             std::hash<std::string>()(key.custom_code);
+    size_t operator()(const OperatorKey& key) const {
+      return CombineHashes({std::hash<size_t>()(static_cast<size_t>(key.type)),
+                            std::hash<std::string>()(key.custom_code),
+                            std::hash<int>()(key.version)});
+    }
+
+   private:
+    // TODO(ycling): Refactoring and extract this function into a common
+    // utility module.
+    static size_t CombineHashes(std::initializer_list<size_t> hashes) {
+      size_t result = 0;
+      // Hash combiner used by TensorFlow core.
+      for (size_t hash : hashes) {
+        result = result ^ (hash + 0x9e3779b97f4a7800ULL + (result << 10) +
+                           (result >> 4));
+      }
+      return result;
     }
   };
 };
@@ -66,11 +96,12 @@ struct OperatorKey {
 using OperatorsMap = std::unordered_map<OperatorKey, int, OperatorKey::Hash>;
 
 void LoadTensorsMap(const Model& model, TensorsMap* tensors_map);
-void LoadOperatorsMap(const Model& model, OperatorsMap* operators_map);
+void LoadOperatorsMap(
+    const Model& model, OperatorsMap* operators_map,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type);
 
 }  // namespace details
 }  // namespace tflite
-
 }  // namespace toco
 
 #endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/export_test.cc b/tensorflow/contrib/lite/toco/tflite/export_test.cc
index 6754372330797ae30230af26a3b478c24ad44005..409e7d72a57076ec2832c5d12b52829477624f74 100644
--- a/tensorflow/contrib/lite/toco/tflite/export_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export_test.cc
@@ -17,6 +17,9 @@ limitations under the License.
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/toco/tflite/builtin_operator.h"
+#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+#include "tensorflow/contrib/lite/toco/tflite/types.h"
 
 namespace toco {
 namespace tflite {
@@ -65,12 +68,13 @@ TEST_F(ExportTest, LoadOperatorsMap) {
   BuildTestModel();
 
   details::OperatorsMap operators;
-  details::LoadOperatorsMap(input_model_, &operators);
-  EXPECT_EQ(0, operators[details::OperatorKey(OperatorType::kAdd, "")]);
-  EXPECT_EQ(1, operators[details::OperatorKey(OperatorType::kConv, "")]);
-  EXPECT_EQ(2, operators[details::OperatorKey(OperatorType::kSub, "")]);
+  const auto ops_by_type = BuildOperatorByTypeMap();
+  details::LoadOperatorsMap(input_model_, &operators, ops_by_type);
+  EXPECT_EQ(0, operators[details::OperatorKey(OperatorType::kAdd, "", 1)]);
+  EXPECT_EQ(1, operators[details::OperatorKey(OperatorType::kConv, "", 1)]);
+  EXPECT_EQ(2, operators[details::OperatorKey(OperatorType::kSub, "", 1)]);
   EXPECT_EQ(3, operators[details::OperatorKey(
-                   OperatorType::kTensorFlowUnsupported, "MyCrazyOp")]);
+                   OperatorType::kTensorFlowUnsupported, "MyCrazyOp", 1)]);
 }
 
 TEST_F(ExportTest, Export) {
@@ -104,6 +108,160 @@ TEST_F(ExportTest, Export) {
   EXPECT_THAT(indices, ElementsAre(1, 0, 3, 2));
 }
 
+// This test is based on a hypothetical scenario that dilation is supported
+// only in Conv version 2. So Toco populates version=1 when dialation
+// parameters are all 1, and version=2 otehrwise.
+class FakeConvolutionOperator
+    : public BuiltinOperator<ConvOperator, ::tflite::Conv2DOptions,
+                             ::tflite::BuiltinOptions_Conv2DOptions> {
+ public:
+  FakeConvolutionOperator()
+      : BuiltinOperator(::tflite::BuiltinOperator_CONV_2D,
+                        OperatorType::kConv) {}
+
+  // Returning the op version according to the op parameters.
+  int GetVersion(const Operator& op) const override {
+    const TocoOperator& conv_op = static_cast<const TocoOperator&>(op);
+    if (conv_op.dilation_width_factor != 1 ||
+        conv_op.dilation_height_factor != 1) {
+      // Version 2 if dilation is used.
+      return 2;
+    }
+    return 1;
+  }
+
+  // Note: The read / write code doesn't need to be changed if we stick with
+  // the restrictions:
+  // * Only adding parameters at the bottom of the Flatbuffer tables.
+  // * When the default value of parameters are used, the op works consistently
+  //   with the previous version.
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto padding = Padding::Serialize(op.padding.type);
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreateConv2DOptions(*builder, padding, op.stride_width,
+                                         op.stride_height, activation_function,
+                                         op.dilation_width_factor,
+                                         op.dilation_height_factor);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->padding.type = Padding::Deserialize(options.padding());
+    op->stride_width = options.stride_w();
+    op->stride_height = options.stride_h();
+    op->dilation_width_factor = options.dilation_w_factor();
+    op->dilation_height_factor = options.dilation_h_factor();
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class VersionedOpExportTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    input_model_.GetOrCreateArray("input");
+    input_model_.GetOrCreateArray("filter");
+    input_model_.GetOrCreateArray("output");
+  }
+  void AddConvOp(bool use_dialation) {
+    {
+      auto* op = new ConvOperator;
+      op->inputs.push_back("input");
+      op->inputs.push_back("filter");
+      op->inputs.push_back("output");
+
+      op->padding.type = PaddingType::kSame;
+      op->stride_width = 1;
+      op->stride_height = 1;
+      if (use_dialation) {
+        op->dilation_width_factor = 2;
+        op->dilation_height_factor = 2;
+      } else {
+        op->dilation_width_factor = 1;
+        op->dilation_height_factor = 1;
+      }
+      input_model_.operators.emplace_back(op);
+    }
+  }
+
+  std::map<OperatorType, std::unique_ptr<BaseOperator>>
+  BuildFakeOperatorByTypeMap() {
+    std::map<OperatorType, std::unique_ptr<BaseOperator>> result;
+    result[OperatorType::kConv] =
+        std::unique_ptr<BaseOperator>(new FakeConvolutionOperator);
+    return result;
+  }
+
+  Model input_model_;
+};
+
+TEST_F(VersionedOpExportTest, LoadOperatorsMapWithOpV1) {
+  AddConvOp(false);
+
+  details::OperatorsMap operators;
+  const auto ops_by_type = BuildFakeOperatorByTypeMap();
+  details::LoadOperatorsMap(input_model_, &operators, ops_by_type);
+
+  EXPECT_EQ(1, operators.size());
+  EXPECT_EQ(0, operators.at(details::OperatorKey(OperatorType::kConv, "", 1)));
+}
+
+TEST_F(VersionedOpExportTest, LoadOperatorsMapWithOpV2) {
+  AddConvOp(true);
+
+  details::OperatorsMap operators;
+  const auto ops_by_type = BuildFakeOperatorByTypeMap();
+  details::LoadOperatorsMap(input_model_, &operators, ops_by_type);
+
+  EXPECT_EQ(1, operators.size());
+  EXPECT_EQ(0, operators.at(details::OperatorKey(OperatorType::kConv, "", 2)));
+}
+
+TEST_F(VersionedOpExportTest, LoadOperatorsMapWithBothVersions) {
+  AddConvOp(false);
+  AddConvOp(true);
+
+  details::OperatorsMap operators;
+  const auto ops_by_type = BuildFakeOperatorByTypeMap();
+  details::LoadOperatorsMap(input_model_, &operators, ops_by_type);
+
+  EXPECT_EQ(2, operators.size());
+  EXPECT_EQ(0, operators.at(details::OperatorKey(OperatorType::kConv, "", 1)));
+  EXPECT_EQ(1, operators.at(details::OperatorKey(OperatorType::kConv, "", 2)));
+}
+
+TEST_F(VersionedOpExportTest, Export) {
+  AddConvOp(false);
+  AddConvOp(true);
+
+  string result;
+  const auto ops_by_type = BuildFakeOperatorByTypeMap();
+  Export(input_model_, true, &result, ops_by_type);
+
+  auto* model = ::tflite::GetModel(result.data());
+  auto operator_codes = model->operator_codes();
+
+  // Verify that 2 operator codes are populdated. Both are CONV_2D but with
+  // different versions.
+  EXPECT_EQ(2, operator_codes->size());
+  EXPECT_EQ(::tflite::BuiltinOperator_CONV_2D,
+            (*operator_codes)[0]->builtin_code());
+  EXPECT_EQ(1, (*operator_codes)[0]->version());
+  EXPECT_EQ(::tflite::BuiltinOperator_CONV_2D,
+            (*operator_codes)[1]->builtin_code());
+  EXPECT_EQ(2, (*operator_codes)[1]->version());
+
+  // Verify that the 2 operators points to the correct indices of the operation
+  // codes.
+  auto operators = (*model->subgraphs())[0]->operators();
+  EXPECT_EQ(2, operators->size());
+  EXPECT_EQ(0, (*operators)[0]->opcode_index());
+  EXPECT_EQ(1, (*operators)[1]->opcode_index());
+}
+
 // TODO(ahentz): tests for tensors, inputs, outpus, opcodes and operators.
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index e18ae805c044b9fb5450c0419cb375cf6db8111a..6922e5055a602b8d2eb43f88cde15b0d505eac40 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -53,6 +53,8 @@ class AveragePool
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Convolution
@@ -83,6 +85,8 @@ class Convolution
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class DepthwiseConvolution
@@ -112,6 +116,8 @@ class DepthwiseConvolution
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Add : public BuiltinOperator<AddOperator, ::tflite::AddOptions,
@@ -132,6 +138,8 @@ class Add : public BuiltinOperator<AddOperator, ::tflite::AddOptions,
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class SpaceToBatchND
@@ -149,6 +157,8 @@ class SpaceToBatchND
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Sub : public BuiltinOperator<SubOperator, ::tflite::SubOptions,
@@ -169,6 +179,8 @@ class Sub : public BuiltinOperator<SubOperator, ::tflite::SubOptions,
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Div : public BuiltinOperator<DivOperator, ::tflite::DivOptions,
@@ -189,6 +201,8 @@ class Div : public BuiltinOperator<DivOperator, ::tflite::DivOptions,
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class BatchToSpaceND
@@ -206,6 +220,8 @@ class BatchToSpaceND
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Cast : public BuiltinOperator<CastOperator, ::tflite::CastOptions,
@@ -225,6 +241,8 @@ class Cast : public BuiltinOperator<CastOperator, ::tflite::CastOptions,
     op->src_data_type = DataType::Deserialize(options.in_data_type());
     op->dst_data_type = DataType::Deserialize(options.out_data_type());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Concatenation
@@ -243,6 +261,8 @@ class Concatenation
                    TocoOperator* op) const override {
     op->axis = options.axis();
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class DepthToSpace : public CustomOperator<DepthToSpaceOperator> {
@@ -255,6 +275,8 @@ class DepthToSpace : public CustomOperator<DepthToSpaceOperator> {
   void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
     op->block_size = m["block_size"].AsInt64();
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class FakeQuant : public CustomOperator<FakeQuantOperator> {
@@ -274,6 +296,8 @@ class FakeQuant : public CustomOperator<FakeQuantOperator> {
     const auto& num_bits = m["num_bits"];
     op->num_bits = num_bits.IsInt() ? num_bits.AsInt32() : 8;
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class FullyConnected
@@ -295,6 +319,8 @@ class FullyConnected
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Gather : public BuiltinOperator<GatherOperator, ::tflite::GatherOptions,
@@ -311,6 +337,8 @@ class Gather : public BuiltinOperator<GatherOperator, ::tflite::GatherOptions,
                    TocoOperator* op) const override {
     op->axis = options.axis();
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Svdf : public BuiltinOperator<SvdfOperator, ::tflite::SVDFOptions,
@@ -331,6 +359,8 @@ class Svdf : public BuiltinOperator<SvdfOperator, ::tflite::SVDFOptions,
         ActivationFunction::Deserialize(options.fused_activation_function());
     op->rank = options.rank();
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class L2Normalization
@@ -351,6 +381,8 @@ class L2Normalization
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class L2Pool : public BuiltinOperator<L2PoolOperator, ::tflite::Pool2DOptions,
@@ -378,6 +410,8 @@ class L2Pool : public BuiltinOperator<L2PoolOperator, ::tflite::Pool2DOptions,
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class LocalResponseNormalization
@@ -401,6 +435,8 @@ class LocalResponseNormalization
     op->alpha = options.alpha();
     op->beta = options.beta();
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class MaxPool : public BuiltinOperator<MaxPoolOperator, ::tflite::Pool2DOptions,
@@ -428,6 +464,8 @@ class MaxPool : public BuiltinOperator<MaxPoolOperator, ::tflite::Pool2DOptions,
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions,
@@ -448,6 +486,8 @@ class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions,
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
@@ -463,6 +503,25 @@ class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
+class PadV2 : public BuiltinOperator<PadV2Operator, ::tflite::PadV2Options,
+                                     ::tflite::BuiltinOptions_PadV2Options> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreatePadV2Options(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Reshape
@@ -484,6 +543,8 @@ class Reshape
     op->shape.insert(op->shape.end(), options.new_shape()->begin(),
                      options.new_shape()->end());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Softmax
@@ -501,6 +562,8 @@ class Softmax
                    TocoOperator* op) const override {
     op->beta = options.beta();
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class SpaceToDepth
@@ -519,6 +582,8 @@ class SpaceToDepth
                    TocoOperator* op) const override {
     op->block_size = options.block_size();
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Transpose
@@ -534,6 +599,8 @@ class Transpose
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
@@ -556,6 +623,8 @@ class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
     CHECK(options.fused_activation_function() ==
           ::tflite::ActivationFunctionType_TANH);
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Mean : public BuiltinOperator<MeanOperator, ::tflite::MeanOptions,
@@ -572,6 +641,8 @@ class Mean : public BuiltinOperator<MeanOperator, ::tflite::MeanOptions,
                    TocoOperator* op) const override {
     op->keep_dims = options.keep_dims();
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class ResizeBilinear
@@ -590,6 +661,8 @@ class ResizeBilinear
                    TocoOperator* op) const override {
     op->align_corners = options.align_corners();
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Squeeze
@@ -611,6 +684,8 @@ class Squeeze
                             options.squeeze_dims()->begin(),
                             options.squeeze_dims()->end());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class Split
@@ -629,6 +704,8 @@ class Split
                    TocoOperator* op) const override {
     op->num_split = options.num_splits();
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class StridedSlice
@@ -653,6 +730,8 @@ class StridedSlice
     op->new_axis_mask = options.new_axis_mask();
     op->shrink_axis_mask = options.shrink_axis_mask();
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class TopK_V2 : public BuiltinOperator<TopKV2Operator, ::tflite::TopKV2Options,
@@ -667,6 +746,8 @@ class TopK_V2 : public BuiltinOperator<TopKV2Operator, ::tflite::TopKV2Options,
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class ArgMax : public BuiltinOperator<ArgMaxOperator, ::tflite::ArgMaxOptions,
@@ -684,6 +765,33 @@ class ArgMax : public BuiltinOperator<ArgMaxOperator, ::tflite::ArgMaxOptions,
                    TocoOperator* op) const override {
     op->output_data_type = DataType::Deserialize(options.output_type());
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
+class TransposeConv
+    : public BuiltinOperator<TransposeConvOperator,
+                             ::tflite::TransposeConvOptions,
+                             ::tflite::BuiltinOptions_TransposeConvOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto padding = Padding::Serialize(op.padding.type);
+    return ::tflite::CreateTransposeConvOptions(
+        *builder, padding, op.stride_width, op.stride_height);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->padding.type = Padding::Deserialize(options.padding());
+    op->stride_width = options.stride_w();
+    op->stride_height = options.stride_h();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 class TensorFlowUnsupported : public BaseOperator {
@@ -790,6 +898,12 @@ class TensorFlowUnsupported : public BaseOperator {
     }
     node_def.SerializeToString(&op->tensorflow_node_def);
   }
+
+  int GetVersion(const Operator& op) const override {
+    // TODO(ycling): Deisng and implement a way to plumb the version of
+    // custom ops.
+    return 1;
+  }
 };
 
 namespace {
@@ -832,6 +946,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
                                OperatorType::kMaxPool));
   ops.emplace_back(new Mul(::tflite::BuiltinOperator_MUL, OperatorType::kMul));
   ops.emplace_back(new Pad(::tflite::BuiltinOperator_PAD, OperatorType::kPad));
+  ops.emplace_back(
+      new PadV2(::tflite::BuiltinOperator_PADV2, OperatorType::kPadV2));
   ops.emplace_back(new Reshape(::tflite::BuiltinOperator_RESHAPE,
                                OperatorType::kTensorFlowReshape));
   ops.emplace_back(
@@ -860,6 +976,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new Cast(::tflite::BuiltinOperator_CAST, OperatorType::kCast));
   ops.emplace_back(
       new ArgMax(::tflite::BuiltinOperator_ARG_MAX, OperatorType::kArgMax));
+  ops.emplace_back(new TransposeConv(::tflite::BuiltinOperator_TRANSPOSE_CONV,
+                                     OperatorType::kTransposeConv));
 
   // Custom Operators.
   ops.emplace_back(
@@ -898,9 +1016,21 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       "MAXIMUM", OperatorType::kTensorFlowMaximum));
   ops.emplace_back(new SimpleOperator<TensorFlowMinimumOperator>(
       "MINIMUM", OperatorType::kTensorFlowMinimum));
+  ops.emplace_back(new SimpleOperator<TensorFlowGreaterOperator>(
+      "GREATER", OperatorType::kTensorFlowGreater));
+  ops.emplace_back(new SimpleOperator<TensorFlowGreaterEqualOperator>(
+      "GREATER_EQUAL", OperatorType::kTensorFlowGreaterEqual));
   ops.emplace_back(new SimpleOperator<TensorFlowLessOperator>(
       "LESS", OperatorType::kTensorFlowLess));
+  ops.emplace_back(new SimpleOperator<TensorFlowLessEqualOperator>(
+      "LESS_EQUAL", OperatorType::kTensorFlowLessEqual));
   ops.emplace_back(new SimpleOperator<NegOperator>("NEG", OperatorType::kNeg));
+  ops.emplace_back(
+      new SimpleOperator<SelectOperator>("SELECT", OperatorType::kSelect));
+  ops.emplace_back(
+      new SimpleOperator<SliceOperator>("SLICE", OperatorType::kSlice));
+  ops.emplace_back(new SimpleOperator<SinOperator>("SIN", OperatorType::kSin));
+
   return ops;
 }
 }  // namespace
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.h b/tensorflow/contrib/lite/toco/tflite/operator.h
index 85f7bdafe04979abc14f826ef667b3fa1aeec65c..5e9c20e40dd6274e0839379883b6dbe53064a0fc 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/operator.h
@@ -77,6 +77,16 @@ class BaseOperator {
       const BuiltinOptions* builtin_options,
       const CustomOptions* custom_options) const = 0;
 
+  // Get the op version by op parameters.
+  // The function need to be overridden to return the op version based on the
+  // parameters. Note:
+  // * The first version for each op should be 1 (to be consistent with the
+  //   default value in Flatbuffer. `return 1;` is okay for newly implemented
+  //   ops.
+  // * When multiple versions are defined for an op, this function need to be
+  //   overridden. (See example in `operator_test.cc`)
+  virtual int GetVersion(const Operator& op) const = 0;
+
  private:
   string name_;
   OperatorType type_;
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 2b6c32b07c4a2ed4ebdc552dff8c6c38bca2283d..fe594c6da9826ab904d162c9e28e1455b1bf69f6 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -116,6 +116,9 @@ TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<TensorFlowLessOperator>("LESS",
                                               OperatorType::kTensorFlowLess);
   CheckSimpleOperator<NegOperator>("NEG", OperatorType::kNeg);
+  CheckSimpleOperator<SelectOperator>("SELECT", OperatorType::kSelect);
+  CheckSimpleOperator<SliceOperator>("SLICE", OperatorType::kSlice);
+  CheckSimpleOperator<SinOperator>("SIN", OperatorType::kSin);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -405,6 +408,18 @@ TEST_F(OperatorTest, BuiltinArgMax) {
   EXPECT_EQ(op.output_data_type, output_toco_op->output_data_type);
 }
 
+TEST_F(OperatorTest, BuiltinTransposeConv) {
+  TransposeConvOperator op;
+  op.stride_width = 123;
+  op.stride_height = 124;
+  op.padding.type = PaddingType::kValid;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("TRANSPOSE_CONV", OperatorType::kTransposeConv), op);
+  EXPECT_EQ(op.stride_width, output_toco_op->stride_width);
+  EXPECT_EQ(op.stride_height, output_toco_op->stride_height);
+  EXPECT_EQ(op.padding.type, output_toco_op->padding.type);
+}
+
 TEST_F(OperatorTest, TensorFlowUnsupported) {
   TensorFlowUnsupportedOperator op;
   op.tensorflow_op = "MyCustomUnsupportedOp";
diff --git a/tensorflow/contrib/lite/toco/tflite/simple_operator.h b/tensorflow/contrib/lite/toco/tflite/simple_operator.h
index 72678c82a22a7168f858747b0b1c6a2b515b6578..a7f7e886f61d3bbf221c0ab7a24d6c3e629ec274 100644
--- a/tensorflow/contrib/lite/toco/tflite/simple_operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/simple_operator.h
@@ -41,6 +41,8 @@ class SimpleOperator : public BaseOperator {
       const CustomOptions* custom_options) const override {
     return std::unique_ptr<Operator>(new T);
   }
+
+  int GetVersion(const Operator& op) const override { return 1; }
 };
 
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/toco/tflite/types.cc b/tensorflow/contrib/lite/toco/tflite/types.cc
index c9c2e9ba0184ef3f531f325091afaf6976e07f4f..4867c3a62e68406428644cd05bddf212008c2656 100644
--- a/tensorflow/contrib/lite/toco/tflite/types.cc
+++ b/tensorflow/contrib/lite/toco/tflite/types.cc
@@ -36,6 +36,16 @@ DataBuffer::FlatBufferOffset CopyStringToBuffer(
   return builder->CreateVector(dst_data.data(), bytes);
 }
 
+// vector<bool> may be implemented using a bit-set, so we can't just
+// reinterpret_cast, accesing it data as vector<bool> and let flatbuffer
+// CreateVector handle it.
+// Background: https://isocpp.org/blog/2012/11/on-vectorbool
+DataBuffer::FlatBufferOffset CopyBoolToBuffer(
+    const Array& array, flatbuffers::FlatBufferBuilder* builder) {
+  const auto& src_data = array.GetBuffer<ArrayDataType::kBool>().data;
+  return builder->CreateVector(src_data);
+}
+
 template <ArrayDataType T>
 DataBuffer::FlatBufferOffset CopyBuffer(
     const Array& array, flatbuffers::FlatBufferBuilder* builder) {
@@ -86,6 +96,8 @@ void CopyBuffer(const ::tflite::Buffer& buffer, Array* array) {
       return ::tflite::TensorType_UINT8;
     case ArrayDataType::kString:
       return ::tflite::TensorType_STRING;
+    case ArrayDataType::kBool:
+      return ::tflite::TensorType_BOOL;
     default:
       // FLOAT32 is filled for unknown data types.
       // TODO(ycling): Implement type inference in TF Lite interpreter.
@@ -105,6 +117,8 @@ ArrayDataType DataType::Deserialize(int tensor_type) {
       return ArrayDataType::kString;
     case ::tflite::TensorType_UINT8:
       return ArrayDataType::kUint8;
+    case ::tflite::TensorType_BOOL:
+      return ArrayDataType::kBool;
     default:
       LOG(FATAL) << "Unhandled tensor type '" << tensor_type << "'.";
   }
@@ -125,6 +139,8 @@ flatbuffers::Offset<flatbuffers::Vector<uint8_t>> DataBuffer::Serialize(
       return CopyStringToBuffer(array, builder);
     case ArrayDataType::kUint8:
       return CopyBuffer<ArrayDataType::kUint8>(array, builder);
+    case ArrayDataType::kBool:
+      return CopyBoolToBuffer(array, builder);
     default:
       LOG(FATAL) << "Unhandled array data type.";
   }
@@ -146,6 +162,8 @@ void DataBuffer::Deserialize(const ::tflite::Tensor& tensor,
       return CopyStringFromBuffer(buffer, array);
     case ::tflite::TensorType_UINT8:
       return CopyBuffer<ArrayDataType::kUint8>(buffer, array);
+    case ::tflite::TensorType_BOOL:
+      return CopyBuffer<ArrayDataType::kBool>(buffer, array);
     default:
       LOG(FATAL) << "Unhandled tensor type.";
   }
diff --git a/tensorflow/contrib/lite/toco/tflite/types_test.cc b/tensorflow/contrib/lite/toco/tflite/types_test.cc
index efb849f42283de5867c3b1ad914655212a13edb5..564f303b9bb41a777633ecabd666aa93ec3faefe 100644
--- a/tensorflow/contrib/lite/toco/tflite/types_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/types_test.cc
@@ -28,8 +28,7 @@ using flatbuffers::Vector;
 
 // These are types that exist in TF Mini but don't have a correspondence
 // in TF Lite.
-static const ArrayDataType kUnsupportedTocoTypes[] = {ArrayDataType::kNone,
-                                                      ArrayDataType::kBool};
+static const ArrayDataType kUnsupportedTocoTypes[] = {ArrayDataType::kNone};
 
 // These are TF Lite types for which there is no correspondence in TF Mini.
 static const ::tflite::TensorType kUnsupportedTfLiteTypes[] = {
@@ -71,7 +70,8 @@ TEST(DataType, SupportedTypes) {
       {ArrayDataType::kUint8, ::tflite::TensorType_UINT8},
       {ArrayDataType::kInt32, ::tflite::TensorType_INT32},
       {ArrayDataType::kInt64, ::tflite::TensorType_INT64},
-      {ArrayDataType::kFloat, ::tflite::TensorType_FLOAT32}};
+      {ArrayDataType::kFloat, ::tflite::TensorType_FLOAT32},
+      {ArrayDataType::kBool, ::tflite::TensorType_BOOL}};
   for (auto x : testdata) {
     EXPECT_EQ(x.second, DataType::Serialize(x.first));
     EXPECT_EQ(x.first, DataType::Deserialize(x.second));
@@ -158,6 +158,13 @@ TEST(DataBuffer, String) {
               ::testing::ElementsAre("AA", "BBB", "Best. String. Ever."));
 }
 
+TEST(DataBuffer, Bool) {
+  Array recovered =
+      ToFlatBufferAndBack<ArrayDataType::kBool>({true, false, true});
+  EXPECT_THAT(recovered.GetBuffer<ArrayDataType::kBool>().data,
+              ::testing::ElementsAre(true, false, true));
+}
+
 TEST(Padding, All) {
   EXPECT_EQ(::tflite::Padding_SAME, Padding::Serialize(PaddingType::kSame));
   EXPECT_EQ(PaddingType::kSame, Padding::Deserialize(::tflite::Padding_SAME));
diff --git a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h b/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
index d6c3ba6543378b3e15b5fb7816f52376fe05123d..7cdd55e5422589aa000000b82d09b9d8397d7a88 100644
--- a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
+++ b/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
@@ -21,8 +21,6 @@ namespace toco {
 
 // Global data for determining whether to output graph viz format from toco.
 struct GraphVizDumpOptions {
-  std::string graphviz_first_array;
-  std::string graphviz_last_array;
   std::string dump_graphviz;
   bool dump_graphviz_video = false;
 
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 6973b22c5a817bef0153a168e35522ddc94d3087..b5531ca2f4785e0c95703f95977be93a0ba2a8e2 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -86,6 +86,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveConstantRandomUniform);
   transformations->Add(new ResolveConstantRange);
   transformations->Add(new ResolveConstantReshape);
+  transformations->Add(new ResolveConstantSlice);
   transformations->Add(new ResolveConstantStack);
   transformations->Add(new ResolveConstantStridedSlice);
   transformations->Add(new ResolveConstantTranspose);
@@ -106,6 +107,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveSpaceToBatchNDAttributes);
   transformations->Add(new ResolveBatchToSpaceNDAttributes);
   transformations->Add(new ResolvePadAttributes);
+  transformations->Add(new ResolvePadV2Attributes);
   transformations->Add(new ResolveStridedSliceAttributes);
   transformations->Add(new ResolveSliceAttributes);
   transformations->Add(new ResolveMeanAttributes);
@@ -371,6 +373,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     LOG(INFO) << "Estimated count of arithmetic ops: " << 1e-9 * ops_count
               << " billion (note that a multiply-add is counted as 2 ops).";
   }
+  model->ops_count = ops_count;
 }
 
 void Export(const TocoFlags& toco_flags, const Model& model,
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 86ee1f3761330d54b343390c0f8f74ff8c7e8c57..1e6314f2dc78297c8bdacb19cf89292603695e3f 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -143,6 +143,10 @@ int CountOpsWithInput(const Model& model, const string& array_name) {
     for (auto& input : op->inputs) {
       if (input == array_name) {
         count++;
+        // Breaking here is important: some graphs have ops that use the
+        // same array as more than one of their inputs, and in that case
+        // we want it counted only once.
+        break;
       }
     }
   }
@@ -333,6 +337,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(LogSoftmax)
     HANDLE_OPERATORTYPENAME_CASE(Div)
     HANDLE_OPERATORTYPENAME_CASE(Tanh)
+    HANDLE_OPERATORTYPENAME_CASE(Sin)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowAll)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowAssert)
     HANDLE_OPERATORTYPENAME_CASE(ExpandDims)
@@ -352,6 +357,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowMinimum)
     HANDLE_OPERATORTYPENAME_CASE(Neg)
     HANDLE_OPERATORTYPENAME_CASE(Pad)
+    HANDLE_OPERATORTYPENAME_CASE(PadV2)
     HANDLE_OPERATORTYPENAME_CASE(StridedSlice)
     HANDLE_OPERATORTYPENAME_CASE(Stack)
     HANDLE_OPERATORTYPENAME_CASE(Range)
@@ -386,6 +392,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Exp)
     HANDLE_OPERATORTYPENAME_CASE(DynamicPartition)
     HANDLE_OPERATORTYPENAME_CASE(DynamicStitch)
+    HANDLE_OPERATORTYPENAME_CASE(Select)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
@@ -980,7 +987,7 @@ void FixOperatorOrdering(Model* model) {
     for (auto i : remaining) {
       bool can_insert = true;
       auto& op = old_operators[i];
-      CHECK(op.get());
+      CHECK(op);
       for (const auto& input : op->inputs) {
         if (!IsConstantParameterArray(*model, input) &&
             !arrays_behind_us.count(input)) {
@@ -2067,15 +2074,21 @@ bool ReshapeIsEquivalentToTranspose(const Model& model,
 void CheckFinalDataTypesSatisfied(const Model& model) {
   for (const auto& array_entry : model.GetArrayMap()) {
     const auto& array = *array_entry.second;
+    if (array.data_type == ArrayDataType::kBool) {
+      // Boolean values are never quantized.
+      continue;
+    }
+
     // If the final data type is int16, the data type may be float, for example
     // after dequantization.
     if (array.final_data_type != ArrayDataType::kNone &&
         array.final_data_type != ArrayDataType::kInt16) {
-      CHECK(array.final_data_type == array.data_type)
+      CHECK(array.data_type == array.final_data_type)
           << "Array \"" << array_entry.first
-          << "\" has mis-matching actual and final data types ("
-          << ArrayDataTypeName(array.data_type) << ","
-          << ArrayDataTypeName(array.final_data_type) << ").";
+          << "\" has mis-matching actual and final data types (data_type="
+          << ArrayDataTypeName(array.data_type)
+          << ", final_data_type=" << ArrayDataTypeName(array.final_data_type)
+          << ").";
     }
   }
 }
@@ -2092,6 +2105,8 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type) {
       return ArrayDataType::kInt32;
     case INT64:
       return ArrayDataType::kInt64;
+    case BOOL:
+      return ArrayDataType::kBool;
     default:
       return ArrayDataType::kNone;
   }
diff --git a/tensorflow/contrib/lite/toco/types.proto b/tensorflow/contrib/lite/toco/types.proto
index 03bd6150bc86bb27221814cd191b17f1a09585fa..421667a83c14a738231488b1c14eb6071487bafb 100644
--- a/tensorflow/contrib/lite/toco/types.proto
+++ b/tensorflow/contrib/lite/toco/types.proto
@@ -37,4 +37,7 @@ enum IODataType {
 
   // Int16, quantized
   QUANTIZED_INT16 = 6;
+
+  // Boolean
+  BOOL = 7;
 }
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 7b3569ea9c8b15959b15e8ba46cf44d159d5528c..824a164651073bac846a514505726a8ee85cc41d 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -42,7 +42,6 @@ tf_cc_binary(
         "//conditions:default": [],
     }),
     deps = [
-        ":mutable_op_resolver",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
@@ -87,13 +86,6 @@ cc_test(
     ],
 )
 
-cc_library(
-    name = "mutable_op_resolver",
-    srcs = ["mutable_op_resolver.cc"],
-    hdrs = ["mutable_op_resolver.h"],
-    deps = ["//tensorflow/contrib/lite:framework"],
-)
-
 cc_library(
     name = "verifier",
     srcs = ["verifier.cc"],
@@ -103,7 +95,6 @@ cc_library(
         "//tensorflow/contrib/lite:schema_fbs_version",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
-        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
@@ -115,11 +106,9 @@ cc_test(
         "tflite_not_portable",
     ],
     deps = [
-        ":mutable_op_resolver",
         ":verifier",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/testing:util",
         "//tensorflow/core:framework_lite",
diff --git a/tensorflow/contrib/lite/tools/benchmark_model.cc b/tensorflow/contrib/lite/tools/benchmark_model.cc
index 93c80e0f5e021f76bff6858b0ea3370724393d6d..869c531b3e3db37f634761e7b25d4ffa1e8304a7 100644
--- a/tensorflow/contrib/lite/tools/benchmark_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark_model.cc
@@ -23,8 +23,8 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
 #include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
@@ -354,7 +354,7 @@ int Main(int argc, char** argv) {
   string output_layer_string;  // e.g.: output
   int num_runs = 50;
   string run_delay = "-1.0";
-  int num_threads = -1;
+  int num_threads = 1;
   string benchmark_name = "";
   string output_prefix = "";
   int warmup_runs = 1;
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration_main.cc b/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
index 17b514c9169817479e18eecf5799ea4371f3b051..f7df80821fc383063c6e19148bfb13801368b334 100644
--- a/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
+++ b/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
@@ -55,7 +55,7 @@ void GenerateFileContent(const std::string& tflite_path,
   std::ofstream fout(filename);
 
   fout << "#include \"" << tflite_path << "/model.h\"\n";
-  fout << "#include \"" << tflite_path << "/tools/mutable_op_resolver.h\"\n";
+  fout << "#include \"" << tflite_path << "/op_resolver.h\"\n";
 
   fout << "namespace tflite {\n";
   fout << "namespace ops {\n";
diff --git a/tensorflow/contrib/lite/tools/mutable_op_resolver.cc b/tensorflow/contrib/lite/tools/mutable_op_resolver.cc
deleted file mode 100644
index 8a921d7c5aa20ce3a9dc279d8f0c7c253905b078..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/mutable_op_resolver.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
-
-namespace tflite {
-
-TfLiteRegistration* MutableOpResolver::FindOp(
-    tflite::BuiltinOperator op) const {
-  auto it = builtins_.find(op);
-  return it != builtins_.end() ? it->second : nullptr;
-}
-
-TfLiteRegistration* MutableOpResolver::FindOp(const char* op) const {
-  auto it = custom_ops_.find(op);
-  return it != custom_ops_.end() ? it->second : nullptr;
-}
-
-void MutableOpResolver::AddBuiltin(tflite::BuiltinOperator op,
-                                   TfLiteRegistration* registration) {
-  registration->builtin_code = op;
-  builtins_.insert(std::make_pair(op, registration));
-}
-
-void MutableOpResolver::AddCustom(const char* name,
-                                  TfLiteRegistration* registration) {
-  registration->builtin_code = BuiltinOperator_CUSTOM;
-  custom_ops_.insert(std::make_pair(std::string(name), registration));
-}
-
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/mutable_op_resolver.h b/tensorflow/contrib/lite/tools/mutable_op_resolver.h
deleted file mode 100644
index 573a359c458acb6e4320c5a21cb378cdde720924..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/mutable_op_resolver.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
-
-#include <map>
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/model.h"
-
-// Needed to resolve unordered_set hash on older compilers.
-namespace std {
-template <>
-struct hash<tflite::BuiltinOperator> {
-  size_t operator()(const tflite::BuiltinOperator& op) const {
-    return std::hash<int>()(op);
-  }
-};
-}  // namespace std
-
-namespace tflite {
-
-// An OpResolver that is mutable, also used as the op in gen_op_registration.
-// A typical usage:
-//   MutableOpResolver resolver;
-//   resolver.AddBuiltin(BuiltinOperator_ADD, Register_ADD());
-//   resolver.AddCustom("CustomOp", Register_CUSTOM_OP());
-//   InterpreterBuilder(model, resolver)(&interpreter);
-class MutableOpResolver : public OpResolver {
- public:
-  MutableOpResolver() {}
-  TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override;
-  TfLiteRegistration* FindOp(const char* op) const override;
-  void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration);
-  void AddCustom(const char* name, TfLiteRegistration* registration);
-
- private:
-  std::map<int, TfLiteRegistration*> builtins_;
-  std::map<std::string, TfLiteRegistration*> custom_ops_;
-};
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/contrib/lite/tools/verifier.cc
index 8818a7dc85d9ffdc1da450fb389d5ed11139bc31..8d3a7a624265ca6f9933f36949fd6fdbb3c39c40 100644
--- a/tensorflow/contrib/lite/tools/verifier.cc
+++ b/tensorflow/contrib/lite/tools/verifier.cc
@@ -246,15 +246,16 @@ bool VerifyOps(const Model& model, const OpResolver& resolver,
     }
 
     if (opcode->builtin_code() == BuiltinOperator_CUSTOM) {
-      if (!resolver.FindOp(opcode->custom_code()->c_str())) {
-        ReportError(error_reporter, "Unsupported custom op: %s",
-                    opcode->custom_code()->c_str());
+      if (!resolver.FindOp(opcode->custom_code()->c_str(), opcode->version())) {
+        ReportError(error_reporter, "Unsupported custom op: %s, version: %d",
+                    opcode->custom_code()->c_str(), opcode->version());
         return false;
       }
     } else {
-      if (!resolver.FindOp(opcode->builtin_code())) {
-        ReportError(error_reporter, "Unsupported builtin op: %s",
-                    EnumNameBuiltinOperator(opcode->builtin_code()));
+      if (!resolver.FindOp(opcode->builtin_code(), opcode->version())) {
+        ReportError(error_reporter, "Unsupported builtin op: %s, version: %d",
+                    EnumNameBuiltinOperator(opcode->builtin_code()),
+                    opcode->version());
         return false;
       }
     }
diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/contrib/lite/tools/verifier.h
index b7ce4e830576af14002d6bd9080af1da5764b1c9..a596c650a0c2533b6ece3cc7c692d863c2d3f860 100644
--- a/tensorflow/contrib/lite/tools/verifier.h
+++ b/tensorflow/contrib/lite/tools/verifier.h
@@ -26,12 +26,13 @@ namespace tflite {
 class AlwaysTrueResolver : public OpResolver {
  public:
   AlwaysTrueResolver() {}
-  TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override {
+  const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
+                                   int version) const override {
     static TfLiteRegistration null_registration = {nullptr, nullptr, nullptr,
                                                    nullptr};
     return &null_registration;
   }
-  TfLiteRegistration* FindOp(const char* op) const override {
+  const TfLiteRegistration* FindOp(const char* op, int version) const override {
     static TfLiteRegistration null_registration = {nullptr, nullptr, nullptr,
                                                    nullptr};
     return &null_registration;
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
index 03b93afe3ed04b4bff13bc01d7c7c8e9fae9bdf3..ce8a7857d2dd66b12e9ea970911ef1dd01e4550e 100644
--- a/tensorflow/contrib/lite/tools/verifier_test.cc
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -20,9 +20,9 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/testing/util.h"
-#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
 #include "tensorflow/contrib/lite/tools/verifier.h"
 #include "tensorflow/contrib/lite/version.h"
 #include "tensorflow/core/framework/numeric_types.h"
@@ -31,7 +31,6 @@ namespace tflite {
 
 using flatbuffers::FlatBufferBuilder;
 using flatbuffers::Offset;
-using flatbuffers::Vector;
 
 // Build single subgraph model.
 class TfLiteFlatbufferModelBuilder {
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index bdad34a665e47a4e060fcaddfffecfdc876a8fb0..651de4e2f446b2da39b000cde2541872116cbdba 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -482,9 +482,12 @@ def hinge_loss(logits, labels=None, scope=None):
   """Method that returns the loss tensor for hinge loss.
 
   Args:
-    logits: The logits, a float tensor.
+    logits: The logits, a float tensor. Note that logits are assumed to be
+      unbounded and 0-centered. A value > 0 (resp. < 0) is considered a positive
+      (resp. negative) binary prediction.
     labels: The ground truth output tensor. Its shape should match the shape of
-      logits. The values of the tensor are expected to be 0.0 or 1.0.
+      logits. The values of the tensor are expected to be 0.0 or 1.0. Internally
+      the {0,1} labels are converted to {-1,1} when calculating the hinge loss.
     scope: The scope for the operations performed in computing the loss.
 
   Returns:
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops_test.py b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
index 1417772e0496cb571488e5b30bd4f3fb1b591730..2a442a8fc85c8ab70dfa3b2183fc50f5c9a468e4 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops_test.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
@@ -24,10 +24,8 @@ from tensorflow.contrib.framework.python.ops import arg_scope
 from tensorflow.contrib.losses.python.losses import loss_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -275,7 +273,6 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertAlmostEqual(np.average(weights) * 10.0, loss, 3)
 
 
-@test_util.with_c_api
 class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
 
   def testNoneWeightRaisesValueError(self):
@@ -473,11 +470,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       labels = constant_op.constant([[0, 1], [2, 3]])
       weights = constant_op.constant([1.2, 3.4, 5.6, 7.8])
 
-      if ops._USE_C_API:
-        error_type = ValueError
-      else:
-        error_type = errors_impl.InvalidArgumentError
-      with self.assertRaises(error_type):
+      with self.assertRaises(ValueError):
         loss_ops.sparse_softmax_cross_entropy(
             logits, labels, weights=weights).eval()
 
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index e8c6edd7ba9aa6a45d956d1d5655b2809d8d2309..a28fc3a87f9503074806d780a11878a9274efc6f 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -270,7 +270,7 @@ for arch in $archs; do
                         PLATFORM_LDFLAGS=-pthread
                         MKDEP=${CC} -M -std=c++11
                         PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \
-                                   ../../platform/c++11/src/per_thread_waiter.cc \
+                                   ../../platform/posix/src/per_thread_waiter.c \
                                    ../../platform/c++11/src/yield.cc \
                                    ../../platform/c++11/src/time_rep_timespec.cc \
                                    ../../platform/c++11/src/nsync_panic.cc
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index d4c3f2eda8be0c70e961afe582983b9f73769c77..89db9ee2794ddf0a99951dca327e74c5d9694d23 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -300,7 +300,6 @@ tensorflow/core/kernels/spacetobatch_op.cc
 tensorflow/core/kernels/batchtospace_op.cc
 tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
-tensorflow/core/kernels/batch_util.cc
 tensorflow/core/ops/audio_ops.cc
 tensorflow/core/kernels/decode_proto_op.cc
 tensorflow/core/kernels/encode_proto_op.cc
diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD
index e050f3c8d4fc61adfdd3d15869e533ed2b51c4a8..4f2c82ca23011667662c74507fcbd99bcde4c7c0 100644
--- a/tensorflow/contrib/metrics/BUILD
+++ b/tensorflow/contrib/metrics/BUILD
@@ -77,7 +77,7 @@ py_test(
 py_test(
     name = "metric_ops_test",
     srcs = ["python/ops/metric_ops_test.py"],
-    shard_count = 8,
+    shard_count = 16,
     srcs_version = "PY2AND3",
     tags = ["noasan"],  # times out b/63678675
     deps = [
diff --git a/tensorflow/contrib/mixed_precision/BUILD b/tensorflow/contrib/mixed_precision/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3dfb95e0a006b13c23ea362bf622d80fd73703e6
--- /dev/null
+++ b/tensorflow/contrib/mixed_precision/BUILD
@@ -0,0 +1,32 @@
+# Mixed precision training optimizers
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "mixed_precision",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/mixed_precision/python:loss_scale_manager",
+        "//tensorflow/contrib/mixed_precision/python:loss_scale_optimizer",
+    ],
+)
diff --git a/tensorflow/contrib/mixed_precision/__init__.py b/tensorflow/contrib/mixed_precision/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..43e98cdda09222dc1334932265e516c6d460cdfc
--- /dev/null
+++ b/tensorflow/contrib/mixed_precision/__init__.py
@@ -0,0 +1,34 @@
+# Copyright 2018 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# mixed_precisiond under the License is mixed_precisiond on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Library for mixed precision training."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.mixed_precision.python.loss_scale_manager import *
+from tensorflow.contrib.mixed_precision.python.loss_scale_optimizer import *
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "LossScaleManager",
+    "FixedLossScaleManager",
+    "ExponentialUpdateLossScaleManager",
+    "LossScaleOptimizer",
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/mixed_precision/python/BUILD b/tensorflow/contrib/mixed_precision/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1d769e16141e3eff664c449fc05b8441ee49d706
--- /dev/null
+++ b/tensorflow/contrib/mixed_precision/python/BUILD
@@ -0,0 +1,74 @@
+# Mixed precision training optimizers
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "loss_scale_manager",
+    srcs = ["loss_scale_manager.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+py_test(
+    name = "loss_scale_manager_test",
+    size = "small",
+    srcs = ["loss_scale_manager_test.py"],
+    deps = [
+        ":loss_scale_manager",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "loss_scale_optimizer",
+    srcs = ["loss_scale_optimizer.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":loss_scale_manager",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_test(
+    name = "loss_scale_optimizer_test",
+    size = "small",
+    srcs = ["loss_scale_optimizer_test.py"],
+    deps = [
+        ":loss_scale_optimizer",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_manager.py b/tensorflow/contrib/mixed_precision/python/loss_scale_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..be7377b1519f3bdab8755411af3de7aa0c2dc9eb
--- /dev/null
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_manager.py
@@ -0,0 +1,200 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""LossScaleManager classes for mixed precision training."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import six
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+
+
+@six.add_metaclass(abc.ABCMeta)
+class LossScaleManager(object):
+  """Abstract loss scale manager class.
+
+  Loss scale managers with a different strategy should subclass this class.
+  Loss scaling is a process that:
+
+  1) Applies a multiplier on the loss before computing gradients, and
+  2) Applies the reciprocal of the multiplier on the gradients before they are
+     applied on variables.
+
+  This class is used together with
+  @{tf.contrib.mixed_precision.LossScaleOptimizer} for mixed precision training
+  (float32 variables and float16 ops) on Nvidia GPUs in order to achieve the
+  same model quality as single precision training, with the benefits of
+  potential higher throughput.
+
+  See @{tf.contrib.mixed_precision.LossScaleOptimizer} for more details.
+  """
+
+  @abc.abstractmethod
+  def get_loss_scale(self):
+    """Returns the loss scale as a scalar `float32` tensor."""
+    pass
+
+  @abc.abstractmethod
+  def update_loss_scale(self, finite_grads):
+    """Updates loss scale based on if gradients are finite in current step.
+
+    Args:
+      finite_grads: bool scalar tensor indicating if all gradients are
+        finite (i.e., not inf or nan).
+
+    Returns:
+      An op, when executed updates the loss scale. If eager execution is
+      enabled, does not return anything.
+    """
+    del finite_grads
+    return
+
+
+class FixedLossScaleManager(LossScaleManager):
+  """Loss scale manager with a fixed loss scale.
+
+  The loss scale is not updated for the lifetime of the class.
+  """
+
+  def __init__(self, loss_scale):
+    """Creates the fixed loss scale manager.
+
+    Args:
+      loss_scale: A Python float. Its ideal value varies depending on models to
+        run. Choosing a too small loss_scale might affect model quality; a too
+        big loss_scale might cause inf or nan. There is no single right
+        loss_scale to apply. There is no harm choosing a relatively big number
+        as long as no nan or inf is encountered in training.
+
+    Raises:
+      ValueError: If loss_scale is less than 1.
+    """
+    if loss_scale < 1:
+      raise ValueError("loss scale must be at least 1.")
+    self._loss_scale = ops.convert_to_tensor(loss_scale, dtype=dtypes.float32)
+
+  def get_loss_scale(self):
+    return self._loss_scale
+
+  def update_loss_scale(self, finite_grads):
+    del finite_grads
+    return gen_control_flow_ops.no_op()
+
+
+class ExponentialUpdateLossScaleManager(LossScaleManager):
+  """Loss scale manager uses an exponential update strategy.
+
+  In general, the strategy increases loss scale by a greater-than-one factor
+  after encountering a consecutive series of steps with finite gradients;
+  Similarly, it decreases the loss scale by a factor when the accumulated number
+  of steps with non-finite (nan or inf) gradients are met. An update is not
+  applied if its result is less than 1 or overflows the float32 dynamic range.
+
+  The number of finite and non-finite steps are cleared every time the loss
+  scale is changed. The condition to decrease the loss scale is looser than to
+  increase it since the former does not require the steps to be consecutive.
+  """
+
+  def __init__(self,
+               init_loss_scale,
+               incr_every_n_steps,
+               decr_every_n_nan_or_inf=2,
+               incr_ratio=2,
+               decr_ratio=0.8):
+    """Constructor of exponential-update loss scale manager.
+
+    Args:
+      init_loss_scale: A Python float.  The loss scale to use at the beginning.
+      incr_every_n_steps: Increases loss scale every n consecutive steps with
+        finite gradients.
+      decr_every_n_nan_or_inf: Decreases loss scale every n accumulated steps
+        with nan or inf gradients.
+      incr_ratio: The multiplier to use when increasing the loss scale.
+      decr_ratio: The less-than-one-multiplier to use when decreasing the loss
+        scale.
+    """
+    self._incr_every_n_steps = incr_every_n_steps
+    self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
+    self._incr_ratio = incr_ratio
+    self._decr_ratio = decr_ratio
+    self._loss_scale = variable_scope.variable(
+        name="loss_scale",
+        initial_value=ops.convert_to_tensor(init_loss_scale, dtypes.float32),
+        dtype=dtypes.float32,
+        trainable=False)
+    self._num_good_steps = variable_scope.variable(
+        name="good_steps", initial_value=0, dtype=dtypes.int32, trainable=False)
+    self._num_bad_steps = variable_scope.variable(
+        name="bad_steps", initial_value=0, dtype=dtypes.int32, trainable=False)
+
+  def _reset_stats(self):
+    return control_flow_ops.group(
+        state_ops.assign(self._num_good_steps, 0),
+        state_ops.assign(self._num_bad_steps, 0))
+
+  def get_loss_scale(self):
+    """Returns the loss scale."""
+    return self._loss_scale
+
+  def update_loss_scale(self, finite_grads):
+    """Updates loss scale based on if gradients are finite in current step."""
+
+    def update_if_finite_grads():
+      """Branch function when grads are all finite."""
+
+      def incr_loss_scale():
+        new_loss_scale = control_flow_ops.cond(
+            gen_math_ops.is_finite(self._loss_scale * self._incr_ratio),
+            lambda: self._loss_scale * self._incr_ratio,
+            lambda: self._loss_scale)
+        update_op = state_ops.assign(self._loss_scale, new_loss_scale)
+        # When loss_scale is updated, both good and bad steps are reset.
+        return control_flow_ops.group(update_op, self._reset_stats())
+
+      return control_flow_ops.cond(
+          self._num_good_steps + 1 >= self._incr_every_n_steps,
+          incr_loss_scale,
+          lambda: state_ops.assign_add(self._num_good_steps, 1).op)
+
+    def update_if_not_finite_grads():
+      """Branch function when any grad is not finite."""
+
+      def decr_loss_scale():
+        update_op = state_ops.assign(
+            self._loss_scale,
+            gen_math_ops.maximum(1., self._loss_scale * self._decr_ratio))
+        # When loss_scale is updated, both good and bad steps are reset.
+        return control_flow_ops.group(update_op, self._reset_stats())
+
+      def just_update_steps():
+        # When bad_steps is incremented, good_step is reset.
+        return control_flow_ops.group(
+            state_ops.assign_add(self._num_bad_steps, 1),
+            state_ops.assign(self._num_good_steps, 0))
+
+      return control_flow_ops.cond(
+          self._num_bad_steps + 1 >= self._decr_every_n_nan_or_inf,
+          decr_loss_scale, just_update_steps)
+
+    return control_flow_ops.cond(finite_grads, update_if_finite_grads,
+                                 update_if_not_finite_grads)
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py b/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..480f5f6eaf493c5c87c27cc9f8e510ea9c085a72
--- /dev/null
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py
@@ -0,0 +1,182 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for LossScaleManager classes.."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.mixed_precision.python import loss_scale_manager as lsm_lib
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def _GetExampleIter(inputs):
+  dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
+  return dataset.make_one_shot_iterator()
+
+
+class FixedLossScaleManagerTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_basic(self):
+    itr = _GetExampleIter([True] * 10 + [False] * 10)
+
+    loss_scale = 1000
+    lsm = lsm_lib.FixedLossScaleManager(loss_scale)
+    update_fn = lambda: lsm.update_loss_scale(itr.get_next())
+
+    self.evaluate(variables.global_variables_initializer())
+    if not context.executing_eagerly():
+      update_op = update_fn()
+    for _ in range(10):
+      if context.executing_eagerly():
+        update_fn()
+      else:
+        self.evaluate(update_op)
+      self.assertEqual(loss_scale, self.evaluate(lsm.get_loss_scale()))
+
+
+class ExponentialUpdateLossScaleManagerTest(test.TestCase):
+
+  def _test_helper(self,
+                   inputs,
+                   expected_outputs,
+                   init_loss_scale=1,
+                   incr_every_n_step=2,
+                   decr_every_n_nan_or_inf=2):
+    ratio = 2
+    lsm = lsm_lib.ExponentialUpdateLossScaleManager(
+        init_loss_scale=init_loss_scale,
+        incr_every_n_steps=incr_every_n_step,
+        decr_every_n_nan_or_inf=decr_every_n_nan_or_inf,
+        incr_ratio=ratio,
+        decr_ratio=1. / ratio)
+    itr = _GetExampleIter(inputs)
+    update_fn = lambda: lsm.update_loss_scale(itr.get_next())
+
+    self.evaluate(variables.global_variables_initializer())
+    actual_outputs = []
+
+    if not context.executing_eagerly():
+      update_op = update_fn()
+    for _ in range(len(inputs)):
+      if context.executing_eagerly():
+        update_fn()
+      else:
+        self.evaluate(update_op)
+      actual_outputs.append(self.evaluate(lsm.get_loss_scale()))
+    self.assertEqual(actual_outputs, expected_outputs)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_increase_every_n_steps(self):
+    inputs = [True] * 6
+    expected_outputs = [1, 2, 2, 4, 4, 8]
+    self._test_helper(inputs, expected_outputs)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_keep_increasing_until_capped(self):
+    init_loss_scale = np.finfo(np.float32).max / 4 + 10
+    max_float = np.finfo(np.float32).max
+
+    inputs = [True] * 6
+    # Output is capped the 2nd time it doubles.
+    expected_outputs = [
+        init_loss_scale, init_loss_scale * 2, init_loss_scale * 2, max_float,
+        max_float, max_float
+    ]
+
+    self._test_helper(inputs, expected_outputs, init_loss_scale)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_decrease_every_n_steps(self):
+    inputs = [False] * 6
+    init_loss_scale = 1024
+    expected_outputs = [1024, 512, 512, 256, 256, 128]
+
+    self._test_helper(inputs, expected_outputs, init_loss_scale)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_keep_decreasing_until_one(self):
+    inputs = [False] * 10
+    init_loss_scale = 16
+    expected_outputs = [16, 8, 8, 4, 4, 2, 2, 1, 1, 1]
+
+    self._test_helper(inputs, expected_outputs, init_loss_scale)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_incr_bad_step_clear_good_step(self):
+    inputs = [True, True, True, False, True]
+    expected_outputs = [1, 2, 2, 2, 2]
+    self._test_helper(inputs, expected_outputs)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_incr_good_step_does_not_clear_bad_step(self):
+    inputs = [True, True, True, False, True, False]
+    expected_outputs = [1, 2, 2, 2, 2, 1]
+    self._test_helper(inputs, expected_outputs)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_trigger_loss_scale_update_each_step(self):
+    """Test when incr_every_n_step and decr_every_n_nan_or_inf is 1."""
+    init_loss_scale = 1
+    incr_every_n_step = 1
+    decr_every_n_nan_or_inf = 1
+
+    inputs = [True] * 3 + [False, True, True]
+    expected_outputs = [2, 4, 8, 4, 8, 16]
+
+    self._test_helper(inputs, expected_outputs, init_loss_scale,
+                      incr_every_n_step, decr_every_n_nan_or_inf)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_alternating_good_and_bad_gradients_trigger_each_step(self):
+    init_loss_scale = 1
+    incr_every_n_step = 1
+    decr_every_n_nan_or_inf = 1
+
+    inputs = [True, False] * 4 + [True]
+    expected_outputs = [2, 1, 2, 1, 2, 1, 2, 1, 2]
+    self._test_helper(inputs, expected_outputs, init_loss_scale,
+                      incr_every_n_step, decr_every_n_nan_or_inf)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_alternating_good_and_bad_gradients_trigger_incr_every_2steps(self):
+    init_loss_scale = 32
+    incr_every_n_step = 2
+    decr_every_n_nan_or_inf = 1
+
+    inputs = [True, False] * 3 + [True]
+    expected_outputs = [32, 16, 16, 8, 8, 4, 4]
+    self._test_helper(inputs, expected_outputs, init_loss_scale,
+                      incr_every_n_step, decr_every_n_nan_or_inf)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_random_mix_good_and_bad_gradients(self):
+    init_loss_scale = 4
+    inputs = [
+        False, False, True, True, True, False, True, False, True, True, True,
+        False
+    ]
+    expected_outputs = [4, 2, 2, 4, 4, 4, 4, 2, 2, 4, 4, 4]
+    self._test_helper(inputs, expected_outputs, init_loss_scale)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4e5ccc33472ad5a12bd8111fb1ff6ebbd6f45f9
--- /dev/null
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py
@@ -0,0 +1,166 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Loss scaling optimizer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import optimizer
+
+
+class LossScaleOptimizer(optimizer.Optimizer):
+  """An optimizer that applies loss scaling in backprop.
+
+  This class is useful for mixed precision training on GPUs (or other potential
+  accelerators), which is an approach to improve compute throughput without loss
+  of model quality.
+
+  The commmon configuration of mixed precision models is the following:
+  * variables are kept in high precision (e.g. float32).
+  * computations are done in lower precision (e.g. float16). variables are
+    casted to lower precision before they're used.
+  * (in training), final gradients are casted back to variable precision and get
+    applied.
+
+  Because computations happen in lower precision, gradients in the backprop pass
+  might underflow in the smaller dynamic range, causing a model to converge at a
+  suboptimal level. This optimizer multiplies the loss by a factor before
+  backprop starts to prevent underflow. Before gradients are applied, they are
+  casted to higher precision and down-scaled by the same factor, so
+  mathematically the variable updates are no different from regular
+  same-precision training.
+
+  See [Nvidia's manual on mixed precision training](
+  https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
+  for more details.
+
+  To use loss scale optimizer, one only needs choose a loss scale strategy and
+  wrap a regular optimizer. See examples below.
+
+  ```
+  loss = loss_fn()
+  opt = tf.AdamOptimizer(learning_rate=...)
+
+  # Choose a loss scale manager which decides how to pick the right loss scale
+  # throughout the training process.
+  loss_scale_manger = tf.contrib.mixed_precision.FixedLossScaleManager(5000)
+
+  # Wraps the original optimizer in a LossScaleOptimizer.
+  loss_scale_optimizer = LossScaleOptimizer(opt, loss_scale_manager)
+
+  # Call minimize() on the loss scale optimizer.
+  train_op = loss_scale_optimizer.minimize(loss)
+  ```
+
+  If gradients clipping is applied, one can call
+  `optimizer.compute_gradients()` and `optimizer.apply_gradients()`
+  seperately.
+
+  Notice the following way of using LossScaleOptimizer is not intended. Always
+  use `loss_scale_optimizer.compute_gradients()` to compute gradients instead of
+  `tf.gradients()` if doing mixed precision training.
+
+  ```
+  # The following is a wrong way to use LossScaleOptimizer along with
+  # tf.gradients().
+
+  # Always use loss_scale_optimizer.compute_gradients() to compute grads, or
+  # loss scale is not correctly applied.
+  grads = tf.gradients(loss, ...)
+
+  # Do some custom grad clipping.
+  grads = clip_grads(grads, ...)
+
+  loss_scale_optimizer.apply(grads_and_vars)
+  ```
+  """
+
+  def __init__(self, opt, loss_scale_manager):
+    """Construct a loss scaling optimizer.
+
+    Args:
+      opt: The actual optimizer that will be used to compute and apply the
+        gradients. Must be an implementation of the @{tf.train.Optimizer}
+        interface.
+      loss_scale_manager: A LossScaleManager object.
+    """
+    self._opt = opt
+    self._loss_scale_manager = loss_scale_manager
+
+  def compute_gradients(self,
+                        loss,
+                        var_list=None,
+                        gate_gradients=optimizer.Optimizer.GATE_OP,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None):
+    """Compute gradients. See base class @{tf.train.Optimizer}."""
+    loss_scale = self._loss_scale_manager.get_loss_scale()
+    if context.executing_eagerly():
+
+      def scaled_loss():
+        loss_val = loss()
+        return loss_val * math_ops.cast(loss_scale, loss_val.dtype.base_dtype)
+    else:
+      if callable(loss):
+        loss_val = loss()
+      else:
+        loss_val = loss
+      scaled_loss = loss_val * math_ops.cast(loss_scale,
+                                             loss_val.dtype.base_dtype)
+    grads_and_vars = self._opt.compute_gradients(
+        scaled_loss,
+        var_list=var_list,
+        gate_gradients=gate_gradients,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        grad_loss=grad_loss)
+    return self._down_scale(grads_and_vars, loss_scale)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients. See base class @{tf.train.Optimizer}."""
+    grads = [g for (g, _) in grads_and_vars]
+
+    is_finite_grad = []
+    for g in grads:
+      is_finite_grad.append(math_ops.reduce_all(gen_math_ops.is_finite(g)))
+    is_overall_finite = math_ops.reduce_all(is_finite_grad)
+
+    # Only update gradients when all grads are finite.
+    def true_apply_gradients_fn():
+      return self._opt.apply_gradients(grads_and_vars, global_step, name)
+
+    update_vars = control_flow_ops.cond(
+        is_overall_finite, true_apply_gradients_fn, gen_control_flow_ops.no_op)
+    # Potentially adjust gradient scale in case of finite gradients.
+    return control_flow_ops.group(
+        update_vars,
+        self._loss_scale_manager.update_loss_scale(is_overall_finite))
+
+  def _down_scale(self, grads_vars, loss_scale):
+    # Down scale grads by the loss_scale.
+    gv = []
+    inv_loss_scale = gen_math_ops.reciprocal(loss_scale)
+    for g, v in grads_vars:
+      if g is not None:
+        gv.append((g * math_ops.cast(inv_loss_scale, g.dtype.base_dtype), v))
+      else:
+        gv.append((g, v))
+    return gv
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..dded61ccd58eb79b338d7264e8a057c9456c8695
--- /dev/null
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py
@@ -0,0 +1,216 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for LossScaleOptimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.mixed_precision.python import loss_scale_manager as lsm_lib
+from tensorflow.contrib.mixed_precision.python import loss_scale_optimizer as lso
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent as gd
+
+
+class LossScaleOptimizerTest(test.TestCase):
+
+  def _build_graph(self, lr, init_val, loss_scale_opt_fn=None):
+    x = variable_scope.get_variable(
+        "x", initializer=init_val, dtype=dtypes.float32)
+    c1 = constant_op.constant(1e4, dtype=dtypes.float16)
+    c2 = constant_op.constant(1e-4, dtype=dtypes.float16)
+    c3 = constant_op.constant(1e-4, dtype=dtypes.float16)
+    if context.executing_eagerly():
+      loss = lambda: math_ops.cast(x, dtypes.float16) * c1 * c2 * c3
+    else:
+      loss = math_ops.cast(x, dtypes.float16) * c1 * c2 * c3
+
+    opt = gd.GradientDescentOptimizer(lr)
+    if loss_scale_opt_fn:
+      opt = loss_scale_opt_fn(opt)
+    return x, loss, opt
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_float16_underflow_without_loss_scale(self):
+    lr = 1
+    init_val = 1.
+    x, loss, opt = self._build_graph(lr, init_val)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(opt.minimize(loss, var_list=[x]))
+
+    # Symbolic grad is c1 * c2 * c3 = 1e-4 and actual grad is 0, since in
+    # backprop, c2 * c3 underflows in fp16 range. So variable isn't updated.
+    expected_update = 0
+    symbolic_update = 1e-4 * lr
+    self.assertAllClose(
+        init_val - expected_update,
+        self.evaluate(x),
+        rtol=0,
+        atol=min(symbolic_update, 1e-6))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_float16_with_loss_scale(self):
+    lr = 1.
+    init_val = 1.
+
+    def loss_scale_opt_fn(opt):
+      return lso.LossScaleOptimizer(opt, lsm_lib.FixedLossScaleManager(1e4))
+
+    x, loss, opt = self._build_graph(lr, init_val, loss_scale_opt_fn)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(opt.minimize(loss, var_list=[x]))
+
+    # Symbolic grad is c1 * c2 * c3 = 1e-4 and actual grad is the same, due to
+    # up-scaled loss before backprop starts.
+    expected_update = 1.e-4 * lr
+    self.assertAllClose(
+        init_val - expected_update,
+        self.evaluate(x),
+        rtol=0,
+        atol=min(expected_update, 1e-6))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_compute_gradients_with_loss_scale(self):
+    lr = 1
+    init_val = 1.
+
+    def loss_scale_opt_fn(opt):
+      return lso.LossScaleOptimizer(opt, lsm_lib.FixedLossScaleManager(1e4))
+
+    x, loss, opt = self._build_graph(lr, init_val, loss_scale_opt_fn)
+    grads_and_vars = opt.compute_gradients(loss, var_list=[x])
+
+    self.assertEqual(len(grads_and_vars), 1)
+
+    self.evaluate(variables.global_variables_initializer())
+    g_v = self.evaluate(grads_and_vars[0][0])
+    self.assertAllClose(g_v, 1e-4)
+    self.assertIs(grads_and_vars[0][1], x)
+    # Gradients aren't applied.
+    self.assertAllClose(init_val, self.evaluate(x), rtol=0, atol=1e-6)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_compute_gradients_without_loss_scale(self):
+    lr = 1
+    init_val = 1.
+    x, loss, opt = self._build_graph(lr, init_val)
+    grads_and_vars = opt.compute_gradients(loss, var_list=[x])
+
+    self.assertEqual(len(grads_and_vars), 1)
+    self.evaluate(variables.global_variables_initializer())
+    g_v = self.evaluate(grads_and_vars[0][0])
+    self.assertAllClose(g_v, 0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_apply_gradients(self):
+
+    x = variable_scope.get_variable("x", initializer=1., dtype=dtypes.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices([np.nan, np.inf, 0.1])
+    itr = dataset.make_one_shot_iterator()
+
+    lr = 1
+    opt = gd.GradientDescentOptimizer(lr)
+    lsm = lsm_lib.FixedLossScaleManager(1.e4)
+    opt = lso.LossScaleOptimizer(opt, lsm)
+    train_fn = lambda: opt.apply_gradients([(itr.get_next(), x)])
+    if not context.executing_eagerly():
+      train_op = train_fn()
+
+    expected_output = [1, 1, 1 - 0.1]
+    actual_output = []
+
+    self.evaluate(variables.global_variables_initializer())
+    for _ in range(3):
+      # nan or inf is not applied.
+      if context.executing_eagerly():
+        train_fn()
+      else:
+        self.evaluate(train_op)
+      actual_output.append(self.evaluate(x))
+    self.assertAllClose(expected_output, actual_output)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_apply_gradients_loss_scale_is_updated(self):
+
+    class SimpleLossScaleManager(lsm_lib.LossScaleManager):
+      """A simple loss scale manager for easier testing.
+
+      It increments loss scale by 1 if grads are finite, and decreases loss
+      scale by 1 if otherwise.
+      """
+
+      def __init__(self, loss_scale):
+        self._loss_scale = variable_scope.variable(
+            name="loss_scale",
+            initial_value=loss_scale,
+            dtype=dtypes.float32,
+            trainable=False)
+
+      def get_loss_scale(self):
+        return self._loss_scale
+
+      def update_loss_scale(self, if_finite_grads):
+        return control_flow_ops.cond(
+            if_finite_grads, lambda: state_ops.assign_add(self._loss_scale, 1),
+            lambda: state_ops.assign_sub(self._loss_scale, 1))
+
+    x = variable_scope.get_variable("x", initializer=1., dtype=dtypes.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices([np.nan, np.inf, 0.1])
+    itr = dataset.make_one_shot_iterator()
+
+    lr = 1
+    init_loss_scale = 8
+    opt = gd.GradientDescentOptimizer(lr)
+    lsm = SimpleLossScaleManager(init_loss_scale)
+    opt = lso.LossScaleOptimizer(opt, lsm)
+    train_fn = lambda: opt.apply_gradients([(itr.get_next(), x)])
+    if not context.executing_eagerly():
+      train_op = train_fn()
+
+    self.evaluate(variables.global_variables_initializer())
+
+    expected_loss_scale = [
+        init_loss_scale - 1, init_loss_scale - 2, init_loss_scale - 2 + 1
+    ]
+    expected_output = [1, 1, 1 - 0.1]
+    actual_output = []
+    for i in range(3):
+      # nan or inf is not applied.
+      if context.executing_eagerly():
+        train_fn()
+      else:
+        self.evaluate(train_op)
+      actual_output.append(self.evaluate(x))
+      self.assertAllClose(expected_loss_scale[i],
+                          self.evaluate(lsm._loss_scale))
+    self.assertAllClose(expected_output, actual_output)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index ea6032e588cf398deaf497fb99087436ce1cb2e8..4b7af18b3316950afdb90c344ce777848c63e4c1 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -396,14 +396,19 @@ class Pruning(object):
                        self._block_pooling_function)
 
     with ops.name_scope(weights.op.name + '_pruning_ops'):
-      abs_weights = math_ops.abs(
-          array_ops.reshape(weights, [
-              1,
-              squeezed_weights.get_shape()[0],
-              squeezed_weights.get_shape()[1], 1
-          ]))
+      abs_weights = math_ops.abs(squeezed_weights)
+
       pool_window = [self._block_dim[0], self._block_dim[1]]
-      pooled_weights = nn_ops.pool(
+      pool_fn = pruning_utils.factorized_pool
+
+      if not self._spec.use_tpu:
+        pool_fn = nn_ops.pool
+        abs_weights = array_ops.reshape(
+            abs_weights,
+            [1, abs_weights.get_shape()[0],
+             abs_weights.get_shape()[1], 1])
+
+      pooled_weights = pool_fn(
           abs_weights,
           window_shape=pool_window,
           pooling_type=self._block_pooling_function,
@@ -411,19 +416,18 @@ class Pruning(object):
           padding='SAME',
           name=weights.op.name + '_pooled')
 
+      if pooled_weights.get_shape().ndims != 2:
+        pooled_weights = array_ops.squeeze(pooled_weights)
+
       smoothed_threshold, new_mask = self._update_mask(pooled_weights,
                                                        threshold)
-
-      reshaped_mask = array_ops.reshape(
-          new_mask,
-          [pooled_weights.get_shape()[1],
-           pooled_weights.get_shape()[2]])
       updated_mask = pruning_utils.kronecker_product(
-          reshaped_mask, array_ops.ones(self._block_dim))
+          new_mask, array_ops.ones(self._block_dim))
       sliced_mask = array_ops.slice(
           updated_mask, [0, 0],
           [squeezed_weights.get_shape()[0],
            squeezed_weights.get_shape()[1]])
+
     return smoothed_threshold, array_ops.reshape(sliced_mask,
                                                  array_ops.shape(weights))
 
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils.py b/tensorflow/contrib/model_pruning/python/pruning_utils.py
index 56d3dcef20d1b1c34d6b04535e2b4dc7be7f7320..ef6c6a3f5d7aa2980dfd4e59d450ec827eb68f0a 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_utils.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 
@@ -221,6 +222,56 @@ def compute_cdf(values, value_range, **kwargs):
     return math_ops.div(cdf, math_ops.reduce_max(cdf))
 
 
+def factorized_pool(input_tensor,
+                    window_shape,
+                    pooling_type,
+                    strides,
+                    padding,
+                    name=None):
+  """Performs m x n pooling through a combination of 1xm and 1xn pooling.
+
+  Args:
+    input_tensor: Input tensor. Must be rank 2
+    window_shape: Pooling window shape
+    pooling_type: Either 'MAX' or 'AVG'
+    strides: The stride of the pooling window
+    padding: 'SAME' or 'VALID'.
+    name: Name of the op
+
+  Returns:
+    A rank 2 tensor containing the pooled output
+
+  Raises:
+    ValueError: if the input tensor is not rank 2
+  """
+  if input_tensor.get_shape().ndims != 2:
+    raise ValueError('factorized_pool() accepts tensors of rank 2 only')
+
+  [height, width] = input_tensor.get_shape()
+  with ops.name_scope(name, 'factorized_pool'):
+    input_tensor_aligned = array_ops.reshape(
+        input_tensor, [1, 1, height, width],
+        name=input_tensor.op.name + '_aligned')
+
+    height_pooling = nn_ops.pool(
+        input_tensor_aligned,
+        window_shape=[1, window_shape[0]],
+        pooling_type=pooling_type,
+        strides=[1, strides[0]],
+        padding=padding)
+    swap_height_width = array_ops.transpose(height_pooling, perm=[0, 1, 3, 2])
+
+    width_pooling = nn_ops.pool(
+        swap_height_width,
+        window_shape=[1, window_shape[1]],
+        pooling_type=pooling_type,
+        strides=[1, strides[1]],
+        padding=padding)
+
+  return array_ops.squeeze(
+      array_ops.transpose(width_pooling, perm=[0, 1, 3, 2]))
+
+
 def determine_partitioned_axis(partitioned_variable):
   partitioned_axis = 0
   concatenated_variable_shape = partitioned_variable.get_shape()
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
index 10e1dd0a8eee88f357fbe60bf00f180c05f2c4d2..ccde5b4e8a86fcfdb8b942412827057fb18e70ae 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
@@ -22,8 +22,10 @@ import numpy as np
 
 from tensorflow.contrib.model_pruning.python import pruning_utils
 from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -31,6 +33,30 @@ from tensorflow.python.platform import test
 
 class PruningUtilsTest(test.TestCase):
 
+  def _compare_cdf(self, values):
+    abs_values = math_ops.abs(values)
+    max_value = math_ops.reduce_max(abs_values)
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      cdf_from_histogram = pruning_utils.compute_cdf_from_histogram(
+          abs_values, [0.0, max_value], nbins=pruning_utils._NBINS)
+      cdf = pruning_utils.compute_cdf(abs_values, [0.0, max_value])
+      self.assertAllEqual(cdf.eval(), cdf_from_histogram.eval())
+
+  def _compare_pooling_methods(self, weights, pooling_kwargs):
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      pooled_weights_tf = array_ops.squeeze(
+          nn_ops.pool(
+              array_ops.reshape(
+                  weights,
+                  [1, weights.get_shape()[0],
+                   weights.get_shape()[1], 1]), **pooling_kwargs))
+      pooled_weights_factorized_pool = pruning_utils.factorized_pool(
+          weights, **pooling_kwargs)
+      self.assertAllClose(pooled_weights_tf.eval(),
+                          pooled_weights_factorized_pool.eval())
+
   def testHistogram(self):
     width = 10
     height = 10
@@ -59,27 +85,35 @@ class PruningUtilsTest(test.TestCase):
       self.assertAllEqual(len(norm_cdf_val), nbins)
       self.assertAllEqual(expected_cdf, norm_cdf_val)
 
-  def _compare_cdf(self, values):
-    abs_values = math_ops.abs(values)
-    max_value = math_ops.reduce_max(abs_values)
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      cdf_from_histogram = pruning_utils.compute_cdf_from_histogram(
-          abs_values, [0.0, max_value], nbins=pruning_utils._NBINS)
-      cdf = pruning_utils.compute_cdf(abs_values, [0.0, max_value])
-      return cdf.eval(), cdf_from_histogram.eval()
-
   def testCDFEquivalence2D(self):
     width = 100
     height = 100
     weights = variable_scope.get_variable("weights", shape=[width, height])
-    cdf_val, cdf_from_histogram_val = self._compare_cdf(weights)
-    self.assertAllEqual(cdf_val, cdf_from_histogram_val)
+    self._compare_cdf(weights)
 
   def testCDFEquivalence4D(self):
     weights = variable_scope.get_variable("weights", shape=[5, 5, 128, 128])
-    cdf_val, cdf_from_histogram_val = self._compare_cdf(weights)
-    self.assertAllEqual(cdf_val, cdf_from_histogram_val)
+    self._compare_cdf(weights)
+
+  def testFactorizedAvgPool(self):
+    weights = variable_scope.get_variable("weights", shape=[1024, 2048])
+    pooling_kwargs = {
+        "window_shape": [2, 4],
+        "pooling_type": "AVG",
+        "strides": [2, 4],
+        "padding": "SAME"
+    }
+    self._compare_pooling_methods(weights, pooling_kwargs)
+
+  def testFactorizedMaxPool(self):
+    weights = variable_scope.get_variable("weights", shape=[1024, 2048])
+    pooling_kwargs = {
+        "window_shape": [2, 4],
+        "pooling_type": "MAX",
+        "strides": [2, 4],
+        "padding": "SAME"
+    }
+    self._compare_pooling_methods(weights, pooling_kwargs)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.h b/tensorflow/contrib/mpi_collectives/kernels/ring.h
index 1d56d588bc49eda542303ae6ebb19602352ae01d..c001615d3ffbdf04194cf8fd1fd242542bf8f89d 100644
--- a/tensorflow/contrib/mpi_collectives/kernels/ring.h
+++ b/tensorflow/contrib/mpi_collectives/kernels/ring.h
@@ -129,7 +129,7 @@ cudaStream_t CudaStreamForMPI();
  *  has the fully accumulated Segment 1; and so on. The scatter-reduce is
  * complete.
  *
- *  Next, the allgather distributes these fully accumululated chunks across all
+ *  Next, the allgather distributes these fully accumulated chunks across all
  * nodes. Communication proceeds in the same ring, once again in N-1 steps. At
  * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i).
  * For example, at the first iteration, the following transfers will occur:
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
index a7c97a1da2baf29914337094c6153447c997af08..b6b10e500b6af80ab61cbf74077ea8e70800662f 100644
--- a/tensorflow/contrib/opt/python/training/model_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
@@ -62,7 +62,7 @@ class ModelAverageCustomGetter(object):
   """
 
   def __init__(self, worker_device):
-    """Create a new `ElasticAverageCustomGetter`.
+    """Create a new `ModelAverageCustomGetter`.
 
     Args:
       worker_device: String.  Name of the `worker` job.
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 9e2858d00ff192e56680b288651975410c63c539..64b95786b5c7a71ee514201d8eb60c26975938b5 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -31,19 +31,20 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import checkpointable
-from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import saver as core_saver
 from tensorflow.python.training import training_util
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 
 class NonLayerCheckpointable(checkpointable.Checkpointable):
@@ -139,8 +140,9 @@ class CheckpointingTests(test.TestCase):
       self.evaluate(checkpointable_utils.gather_initializers(
           root_checkpointable))
       self.evaluate(train_op)
-    named_variables, serialized_graph = (
-        checkpointable_utils._serialize_object_graph(root_checkpointable))
+    named_variables, serialized_graph, _ = (
+        checkpointable_utils._serialize_object_graph(
+            root_checkpointable, saveables_cache=None))
     expected_checkpoint_names = (
         # Created in the root node, so no prefix.
         "optimizer_step",
@@ -163,24 +165,29 @@ class CheckpointingTests(test.TestCase):
     suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
     expected_checkpoint_names = [
         name + suffix for name in expected_checkpoint_names]
+    # The Dense layers also save get_config() JSON
+    expected_checkpoint_names.extend(
+        ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
+         "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"])
+    named_variables = {v.name: v for v in named_variables}
     six.assertCountEqual(self, expected_checkpoint_names,
                          named_variables.keys())
     # Check that we've mapped to the right variable objects (not exhaustive)
     self.assertEqual(
-        "global_step:0",
-        named_variables["optimizer_step" + suffix].name)
+        "global_step",
+        named_variables["optimizer_step" + suffix].full_name)
     self.assertEqual(
-        "my_model/dense_1/kernel:0",
-        named_variables["model/_second/kernel" + suffix].name)
+        "my_model/dense_1/kernel",
+        named_variables["model/_second/kernel" + suffix].full_name)
     self.assertEqual(
-        "my_model/dense/kernel:0",
-        named_variables["model/_named_dense/kernel" + suffix].name)
+        "my_model/dense/kernel",
+        named_variables["model/_named_dense/kernel" + suffix].full_name)
     self.assertEqual(
-        "beta1_power:0",
-        named_variables["optimizer/beta1_power" + suffix].name)
+        "beta1_power",
+        named_variables["optimizer/beta1_power" + suffix].full_name)
     self.assertEqual(
-        "beta2_power:0",
-        named_variables["optimizer/beta2_power" + suffix].name)
+        "beta2_power",
+        named_variables["optimizer/beta2_power" + suffix].full_name)
     # Spot check the generated protocol buffers.
     self.assertEqual("optimizer",
                      serialized_graph.nodes[0].children[1].local_name)
@@ -205,7 +212,7 @@ class CheckpointingTests(test.TestCase):
     self.assertEqual(
         "my_model/dense/kernel/Adam:0",
         optimizer.get_slot(
-            var=named_variables["model/_named_dense/kernel" + suffix],
+            var=model._named_dense.kernel,
             name="m").name)
     self.assertEqual(
         "model/_named_dense/kernel" + suffix,
@@ -417,16 +424,6 @@ class CheckpointingTests(test.TestCase):
                          self.evaluate(root.save_counter))
   # pylint: enable=cell-var-from-loop
 
-  def _get_checkpoint_name(self, name):
-    root = checkpointable.Checkpointable()
-    checkpointable_utils.add_variable(
-        root, name=name, shape=[1, 2], dtype=dtypes.float64)
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    checkpoint_name, = named_variables.keys()
-    with ops.name_scope("root/" + checkpoint_name):
-      pass  # Make sure we can use this as an op name if we prefix it.
-    return checkpoint_name
-
   def testAnonymousVarsInInit(self):
 
     class Model(training.Model):
@@ -617,6 +614,49 @@ class CheckpointingTests(test.TestCase):
         self.assertAllEqual(3., self.evaluate(beta1_power))
 
 
+class TemplateTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_checkpointable_save_restore(self):
+
+    def _templated():
+      v = variable_scope.get_variable(
+          "v", shape=[1], initializer=init_ops.zeros_initializer(),
+          use_resource=True)
+      v2 = variable_scope.get_variable(
+          "v2", shape=[1], initializer=init_ops.zeros_initializer(),
+          use_resource=True)
+      return v, v + 1., v2
+
+    save_template = template.make_template("s1", _templated)
+    v1_save, _, v2_save = save_template()
+    optimizer = adam.AdamOptimizer(0.0)
+    save_root = checkpointable_utils.Checkpoint(
+        my_template=save_template, optimizer=optimizer)
+    optimizer.minimize(v1_save.read_value)
+    self.evaluate([v.initializer for v in optimizer.variables()])
+    self.evaluate(v1_save.assign([12.]))
+    self.evaluate(v2_save.assign([14.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_root.save(checkpoint_prefix)
+
+    load_template = template.make_template("s2", _templated)
+    load_optimizer = adam.AdamOptimizer(0.0)
+    load_root = checkpointable_utils.Checkpoint(
+        my_template=load_template, optimizer=load_optimizer)
+    status = load_root.restore(save_path)
+    var, var_plus_one, var2 = load_template()
+    load_optimizer.minimize(var.read_value)
+    self.assertEqual(2, len(load_template._checkpoint_dependencies))
+    self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
+    self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual([12.], self.evaluate(var))
+    self.assertAllEqual([13.], self.evaluate(var_plus_one))
+    self.assertAllEqual([14.], self.evaluate(var2))
+
+
 class CheckpointCompatibilityTests(test.TestCase):
 
   def _initialized_model(self):
@@ -682,12 +722,22 @@ class CheckpointCompatibilityTests(test.TestCase):
       with self.assertRaises(AssertionError):
         self._check_sentinels(root)
       object_saver = checkpointable_utils.CheckpointableSaver(root)
+      self._set_sentinels(root)
       status = object_saver.restore(save_path)
-      with self.assertRaises(AssertionError):
-        status.assert_consumed()
+      if context.executing_eagerly():
+        self._check_sentinels(root)
+      if context.executing_eagerly():
+        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
+          status.assert_consumed()
+      else:
+        # When graph building, we haven't read any keys, so we don't know
+        # whether the restore will be complete.
+        with self.assertRaisesRegexp(AssertionError, "not restored"):
+          status.assert_consumed()
       status.run_restore_ops()
       self._check_sentinels(root)
       self._set_sentinels(root)
+      status = object_saver.restore(save_path)
       status.initialize_or_restore()
       self._check_sentinels(root)
 
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 46bfbb729fa9cdfc98f4228f516a7c5c42f23059..f537318b32986c941b6c41eb363929e906027dd7 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -33,10 +33,10 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import checkpointable
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.training import slot_creator
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
 
 
@@ -360,7 +360,16 @@ class _OptimizerV2State(object):
     """
     slot_variable = self.get_slot(var=variable, name=slot_name)
     if (slot_variable is None and context.executing_eagerly() and
-        slot_variable_position.is_simple_variable()):
+        slot_variable_position.is_simple_variable()
+        # Defer slot variable creation if there is an active variable creator
+        # scope. Generally we'd like to eagerly create/restore slot variables
+        # when possible, but this may mean that scopes intended to catch
+        # `variable` also catch its eagerly created slot variable
+        # unintentionally (specifically make_template would add a dependency on
+        # a slot variable if not for this case). Deferring is mostly harmless
+        # (aside from double initialization), and makes variable creator scopes
+        # behave the same way they do when graph building.
+        and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
       initializer = checkpointable.CheckpointInitialValue(
           checkpoint_position=slot_variable_position)
       slot_variable = self.create_slot(
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index b9918fdee1ece2bae1ab1459985066a35b6431be..23363617eddd2078db9052a64d70d5f8c234805d 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -155,8 +155,10 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:partitioned_variables",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:session",
+        "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
 )
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 76f695dce0d1c4c104d823dcac9a4f94d3fba81d..55479bf5f74299bf09f131a6127f9f11d6192d90 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -475,7 +475,7 @@ def _FoldUnfusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
 def _IsValidUnfusedBatchNorm(graph, context):
   """Checks that the output of the unfused batch norm has consumers."""
   add_shift = graph.get_operation_by_name(
-      context + '/BatchNorm/batchnorm/add_1')
+      context + '/BatchNorm/batchnorm_1/add_1')
   # Ensure that the output tensor of batch norm has consumers, otherwise this
   # is a dangling node and not a match.
   return bool(add_shift.outputs[0].consumers())
@@ -568,7 +568,7 @@ def _GetBatchNormParams(graph, context, has_scaling):
 
   op_suffix_mean = '/BatchNorm/moments/Squeeze'
   op_suffix_variance = '/BatchNorm/moments/Squeeze_1'
-  op_suffix_epsilon = '/BatchNorm/batchnorm/add/y'
+  op_suffix_epsilon = '/BatchNorm/batchnorm_1/add/y'
   op_suffix_bn_decay_mean = '/BatchNorm/AssignMovingAvg/decay'
   op_suffix_bn_decay_var = '/BatchNorm/AssignMovingAvg_1/decay'
 
@@ -643,12 +643,12 @@ def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay,
 
   Returns:
     A pair of Operations, the first is the original consumer node of the batch
-      norm (../BatchNorm/batchnorm/add_1), the second is the consumer node of
+      norm (../BatchNorm/batchnorm_1/add_1), the second is the consumer node of
       the folded graph (add_fold).
   """
   mul_scale_name = 'mul_1' if has_scaling else 'mul'
   mul_scale = graph.get_operation_by_name(context +
-                                          '/BatchNorm/batchnorm/' +
+                                          '/BatchNorm/batchnorm_1/' +
                                           mul_scale_name)
   op_below = mul_scale.inputs[0].op
   weights = op_below.inputs[1]
@@ -670,7 +670,7 @@ def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay,
     ]
     scale_name = 'mul' if has_scaling else 'Rsqrt'
     scale = graph.get_operation_by_name(
-        context + '/BatchNorm/batchnorm/' + scale_name)
+        context + '/BatchNorm/batchnorm_1/' + scale_name)
     scale = array_ops.reshape(scale.outputs[0], new_shape,
                               context + '/scale_reshape')
 
@@ -698,7 +698,7 @@ def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay,
                                [(1, mul_fold.outputs[0])])
 
   add_shift = graph.get_operation_by_name(
-      context + '/BatchNorm/batchnorm/add_1')
+      context + '/BatchNorm/batchnorm_1/add_1')
 
   corrected_output = conv_or_fc_folded.outputs[0]
   if correction_offset is not None:
@@ -886,7 +886,7 @@ def _HasScaling(graph, input_to_ops_map, bn):
   Returns:
     A boolean indicating whether this batch norm layer has scaling enabled.
   """
-  rsqrt_op = graph.get_operation_by_name(bn + '/BatchNorm/batchnorm/Rsqrt')
+  rsqrt_op = graph.get_operation_by_name(bn + '/BatchNorm/batchnorm_1/Rsqrt')
   rsqrt_consumers = input_to_ops_map.ConsumerOperations(rsqrt_op)
 
   return sum(1 for op in rsqrt_consumers if op.type == 'Mul') == 1
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
index fa5e11b4708402a4fe76a494ed59e30835ed1363..bfa9d3bf705e327091098a8e416b7902f852605a 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
@@ -516,13 +516,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     if has_scaling:
       if fused:
         return scope + '/BatchNorm_Fold/mul'
-      return scope + '/BatchNorm/batchnorm/mul'
-    return scope + '/BatchNorm/batchnorm/Rsqrt'
+      return scope + '/BatchNorm/batchnorm_1/mul'
+    return scope + '/BatchNorm/batchnorm_1/Rsqrt'
 
   def _BathNormBiasName(self, scope, fused):
     if fused:
       return scope + '/BatchNorm_Fold/bias'
-    return scope + '/BatchNorm/batchnorm/sub'
+    return scope + '/BatchNorm/batchnorm_1/sub'
 
   def _WeightInit(self, stddev):
     """Returns a truncated normal variable initializer.
diff --git a/tensorflow/contrib/quantize/python/graph_matcher.py b/tensorflow/contrib/quantize/python/graph_matcher.py
index bacc707a3abb5539b3b119c1ebc17bd7b30efc5b..aa3ca991c060b208ec71ae27e1ddc75df8a2c723 100644
--- a/tensorflow/contrib/quantize/python/graph_matcher.py
+++ b/tensorflow/contrib/quantize/python/graph_matcher.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import itertools
 
 
 class Pattern(object):
@@ -33,7 +34,7 @@ class Pattern(object):
 class OpTypePattern(Pattern):
   """A tree pattern that matches TF expressions with certain op types."""
 
-  def __init__(self, op_type, name=None, inputs=None):
+  def __init__(self, op_type, name=None, inputs=None, ordered_inputs=True):
     """Initializes an OpTypePattern.
 
     Args:
@@ -48,16 +49,25 @@ class OpTypePattern(Pattern):
       inputs: Optional list of `Pattern`s or strings that specify the
         patterns for the inputs of a matching op. If None, this pattern accepts
         any inputs of a matching op.
+      ordered_inputs: Defaults to True. If False, will match any op that
+        matches a permutation of the inputs.
+
+    Raises:
+      ValueError: if too many inputs are provided when order_inputs is False.
     """
     self._op_type = op_type
     self._name = name
     if inputs is None:
       inputs = []
+    if len(inputs) > 8:
+      raise ValueError(
+          'Only < 8 inputs are allowed when ordered_inputs is False.')
     self._inputs = [
         input_pattern
         if isinstance(input_pattern, Pattern) else OpTypePattern(input_pattern)
         for input_pattern in inputs
     ]
+    self._ordered_inputs = ordered_inputs
 
   @property
   def name(self):
@@ -78,12 +88,23 @@ class OpTypePattern(Pattern):
     if len(op.inputs) != len(self._inputs):
       return None
 
-    for input_tensor, input_pattern in zip(op.inputs, self._inputs):
-      input_match_result = input_pattern.match(input_tensor.op, input_tensor)
-      if input_match_result is None:
-        return None
-      match_result.merge_from(input_match_result)
-    return match_result
+    input_patterns_list = [self._inputs]
+    # If order doesn't matter for the inputs, then make sure we match at least
+    # one permutation of the inputs.
+    if not self._ordered_inputs:
+      input_patterns_list = list(itertools.permutations(self._inputs))
+
+    for input_patterns in input_patterns_list:
+      match_failed = False
+      for input_tensor, input_pattern in zip(op.inputs, input_patterns):
+        input_match_result = input_pattern.match(input_tensor.op, input_tensor)
+        if input_match_result is None:
+          match_failed = True
+          break
+        match_result.merge_from(input_match_result)
+      if not match_failed:
+        return match_result
+    return None
 
 
 class OneofPattern(Pattern):
diff --git a/tensorflow/contrib/quantize/python/graph_matcher_test.py b/tensorflow/contrib/quantize/python/graph_matcher_test.py
index 6d587572181c125faa02d36fb54933cff24f11c6..be741644b615416658001b385930dbe8429c82a2 100644
--- a/tensorflow/contrib/quantize/python/graph_matcher_test.py
+++ b/tensorflow/contrib/quantize/python/graph_matcher_test.py
@@ -22,6 +22,7 @@ from tensorflow.contrib.framework.python import ops as contrib_ops
 from tensorflow.contrib.layers.python.layers import initializers
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.quantize.python import graph_matcher
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -163,6 +164,44 @@ class GraphMatcherTest(test_util.TensorFlowTestCase):
       self.assertEqual(match_result.get_tensor('slice'), slicing)
       self.assertEqual(match_result.get_op('transpose'), transpose.op)
 
+  def test_ordered_pattern(self):
+    #   +            +
+    #  / \          / \
+    # x   y  and   y   x  should both match when ordered inputs is False.
+    # Even when x and y are different operations.
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtypes.float32, shape=[], name='x')
+      y = constant_op.constant(1.0, dtype=dtypes.float32)
+      plus = x + y
+
+    add_pattern_a = graph_matcher.OpTypePattern(
+        'Add', inputs=['Const', 'Placeholder'], ordered_inputs=False)
+    add_pattern_b = graph_matcher.OpTypePattern(
+        'Add', inputs=['Placeholder', 'Const'], ordered_inputs=False)
+    add_pattern_fail = graph_matcher.OpTypePattern(
+        'Add', inputs=['Const', 'Placeholder'], ordered_inputs=True)
+    # Both add_pattern_a and add_pattern_b should match the graph since
+    # ordered_input was set False.
+    matcher_a = graph_matcher.GraphMatcher(add_pattern_a)
+    self.assertEqual([
+        match_result.get_op(add_pattern_a)
+        for match_result in matcher_a.match_graph(g)
+    ], [plus.op])
+    matcher_b = graph_matcher.GraphMatcher(add_pattern_b)
+    self.assertEqual([
+        match_result.get_op(add_pattern_b)
+        for match_result in matcher_b.match_graph(g)
+    ], [plus.op])
+    # But if ordered_inputs is True, the inputs list match should fail if not
+    # specified in the right order.
+    matcher_fail = graph_matcher.GraphMatcher(add_pattern_fail)
+    self.assertEqual(
+        len([
+            match_result.get_op(add_pattern_fail)
+            for match_result in matcher_fail.match_graph(g)
+        ]), 0)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index 5c0e17dc8646ce7850e26ffaa80c0201cea456af..27069444a4bf8416b27787cb142ac9569ed99bb9 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -81,7 +81,8 @@ def LastValueQuantize(inputs,
     a tensor containing quantized values.
   """
   with variable_scope.variable_scope(
-      None, default_name=name_prefix, values=[inputs], reuse=reuse):
+      None, default_name=name_prefix, values=[inputs], reuse=reuse) as scope:
+    scope.set_partitioner(None)
     input_shape = inputs.get_shape()
     input_dim = len(input_shape)
     if per_channel:
@@ -189,7 +190,8 @@ def MovingAvgQuantize(inputs,
     a tensor containing quantized values.
   """
   with variable_scope.variable_scope(
-      None, default_name=name_prefix, values=[inputs], reuse=reuse):
+      None, default_name=name_prefix, values=[inputs], reuse=reuse) as scope:
+    scope.set_partitioner(None)
     input_shape = inputs.get_shape()
     input_dim = len(input_shape)
     if per_channel:
diff --git a/tensorflow/contrib/quantize/python/quant_ops_test.py b/tensorflow/contrib/quantize/python/quant_ops_test.py
index 38846796028512a722752cd83b8bda3b5b0bb77f..c2a8def48012c808da18587c8ff462fa33a363c0 100644
--- a/tensorflow/contrib/quantize/python/quant_ops_test.py
+++ b/tensorflow/contrib/quantize/python/quant_ops_test.py
@@ -23,6 +23,8 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
@@ -73,6 +75,36 @@ class QuantOpsTest(googletest.TestCase):
       self.assertGreater(max_value, 0.0)
       self.assertLess(max_value, 1.0)
 
+  def testVariablesNotParitioned_LastValue(self):
+    # Variables added should not use a default partiioner since they are
+    # scalar. There would be a tensorflow error thrown if the partitioner was
+    # respected by the rewrite.
+    with ops.Graph().as_default():
+      with variable_scope.variable_scope(
+          'part', partitioner=partitioned_variables.fixed_size_partitioner(2)):
+        x = array_ops.placeholder(dtypes.float32, shape=[2])
+        _ = quant_ops.LastValueQuantize(
+            x,
+            init_min=0.0,
+            init_max=0.0,
+            is_training=True,
+            vars_collection=_MIN_MAX_VARS)
+
+  def testVariablesNotParitioned_MovingAvg(self):
+    # Variables added should not use a default partiioner since they are
+    # scalar. There would be a tensorflow error thrown if the partitioner was
+    # respected by the rewrite.
+    with ops.Graph().as_default():
+      with variable_scope.variable_scope(
+          'part', partitioner=partitioned_variables.fixed_size_partitioner(2)):
+        x = array_ops.placeholder(dtypes.float32, shape=[2])
+        _ = quant_ops.MovingAvgQuantize(
+            x,
+            init_min=0.0,
+            init_max=0.0,
+            is_training=True,
+            vars_collection=_MIN_MAX_VARS)
+
   def _GetMinMaxValues(self, sess):
     min_max_vars = ops.get_collection(_MIN_MAX_VARS)
     self.assertEqual(len(min_max_vars), 2)
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 60616ea749cd3fb0edaf57cbe67484285ad41f75..cbba72643f7f166c473b6181edc292f695c4cbc2 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -218,8 +218,19 @@ def _FindLayersToQuantize(graph):
   """
   input_pattern = graph_matcher.OpTypePattern('*')
   weight_var_pattern = graph_matcher.OpTypePattern('Variable|VariableV2')
-  weight_identity_pattern = graph_matcher.OpTypePattern(
+  weight_partition_identity_pattern = graph_matcher.OpTypePattern(
       'Identity', inputs=[weight_var_pattern])
+  weight_partition_concat_pattern = graph_matcher.OpTypePattern(
+      'ConcatV2', inputs=[weight_partition_identity_pattern, '*', '*'])
+  weight_identity_pattern = graph_matcher.OpTypePattern(
+      'Identity',
+      inputs=[
+          graph_matcher.OneofPattern([
+              weight_partition_identity_pattern,
+              weight_partition_concat_pattern,
+              weight_var_pattern,
+          ])
+      ])
   weight_resource_var_pattern = graph_matcher.OpTypePattern('ReadVariableOp')
   folded_weight_pattern = graph_matcher.OpTypePattern('Mul')
 
@@ -233,37 +244,37 @@ def _FindLayersToQuantize(graph):
               weight_identity_pattern, weight_resource_var_pattern,
               folded_weight_pattern
           ])
-      ])
+      ],
+      ordered_inputs=False)
 
   folded_bias_mul_pattern = graph_matcher.OpTypePattern(
-      'Mul', inputs=[graph_matcher.OpTypePattern('*'), layer_pattern])
+      'Mul',
+      inputs=[graph_matcher.OpTypePattern('*'), layer_pattern],
+      ordered_inputs=False)
   post_layer_op_correction_pattern = graph_matcher.OpTypePattern(
-      'Add', inputs=[folded_bias_mul_pattern,
-                     graph_matcher.OpTypePattern('*')])
+      'Add',
+      inputs=[folded_bias_mul_pattern,
+              graph_matcher.OpTypePattern('*')],
+      ordered_inputs=False)
   folded_bias_add_pattern = graph_matcher.OpTypePattern(
       'Add',
       inputs=[
           post_layer_op_correction_pattern,
           graph_matcher.OpTypePattern('*')
-      ])
+      ],
+      ordered_inputs=False)
 
   bias_add_pattern = graph_matcher.OpTypePattern(
-      'Add|BiasAdd', inputs=[layer_pattern, '*'])
+      'Add|BiasAdd', inputs=[layer_pattern, '*'], ordered_inputs=False)
 
   # The bias can come from the bias add or the folded bias add.
-  bypass_pattern_a = graph_matcher.OpTypePattern(
+  bypass_pattern = graph_matcher.OpTypePattern(
       'Add',
       inputs=[
           graph_matcher.OneofPattern(
               [bias_add_pattern, folded_bias_add_pattern]), '*'
-      ])
-  bypass_pattern_b = graph_matcher.OpTypePattern(
-      'Add',
-      inputs=[
-          '*',
-          graph_matcher.OneofPattern(
-              [bias_add_pattern, folded_bias_add_pattern])
-      ])
+      ],
+      ordered_inputs=False)
 
   # The input to the activation can come from bias add, fold bias add, the
   # bypasses.
@@ -273,15 +284,14 @@ def _FindLayersToQuantize(graph):
       '|'.join(_ACTIVATION_TYPES) + '|Identity',
       inputs=[
           graph_matcher.OneofPattern([
-              bias_add_pattern, folded_bias_add_pattern, bypass_pattern_a,
-              bypass_pattern_b
+              bias_add_pattern,
+              folded_bias_add_pattern,
+              bypass_pattern,
           ])
       ])
 
-  post_activation_bypass_pattern_a = graph_matcher.OpTypePattern(
-      'Add', inputs=['*', activation_pattern])
-  post_activation_bypass_pattern_b = graph_matcher.OpTypePattern(
-      'Add', inputs=[activation_pattern, '*'])
+  post_activation_bypass_pattern = graph_matcher.OpTypePattern(
+      'Add', inputs=['*', activation_pattern], ordered_inputs=False)
 
   # The order of the following matching blocks is very important. Since matches
   # aren't guaranteed to be disjoint, we structure matches from largest to
@@ -297,10 +307,7 @@ def _FindLayersToQuantize(graph):
   # to ensure we don't match only the first part of this layer, missing the
   # post activation bypass node.
   post_activation_bypass_layer_matcher = graph_matcher.GraphMatcher(
-      graph_matcher.OneofPattern([
-          post_activation_bypass_pattern_a,
-          post_activation_bypass_pattern_b,
-      ]))
+      post_activation_bypass_pattern)
   for match_result in post_activation_bypass_layer_matcher.match_graph(graph):
     layer_op = match_result.get_op(layer_pattern)
     weight_tensor = match_result.get_tensor(weight_identity_pattern)
@@ -312,14 +319,9 @@ def _FindLayersToQuantize(graph):
     bias_add_op = match_result.get_op(bias_add_pattern)
     if bias_add_op is None:
       bias_add_op = match_result.get_op(folded_bias_add_pattern)
-    bypass_op = match_result.get_op(bypass_pattern_a)
-    if bypass_op is None:
-      bypass_op = match_result.get_op(bypass_pattern_b)
+    bypass_op = match_result.get_op(bypass_pattern)
     post_activation_bypass_op = match_result.get_op(
-        post_activation_bypass_pattern_a)
-    if post_activation_bypass_op is None:
-      post_activation_bypass_op = match_result.get_op(
-          post_activation_bypass_pattern_b)
+        post_activation_bypass_pattern)
     if layer_op not in matched_layer_set:
       matched_layer_set.add(layer_op)
       layer_matches.append(
@@ -340,9 +342,7 @@ def _FindLayersToQuantize(graph):
     bias_add_op = match_result.get_op(bias_add_pattern)
     if bias_add_op is None:
       bias_add_op = match_result.get_op(folded_bias_add_pattern)
-    bypass_op = match_result.get_op(bypass_pattern_a)
-    if bypass_op is None:
-      bypass_op = match_result.get_op(bypass_pattern_b)
+    bypass_op = match_result.get_op(bypass_pattern)
     if layer_op not in matched_layer_set:
       matched_layer_set.add(layer_op)
       layer_matches.append(
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index e7360ae03ca535146dee007eeec88373adf39f12..92ca4a1b0c3126ebccf2b525f01f4d6455c4d527 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -27,6 +27,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import googletest
 
 conv2d = layers.conv2d
@@ -327,6 +329,66 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     # No ops should be inserted or removed.
     self.assertEqual(op_names_before_quantize, op_names_after_quantize)
 
+  def testSinglePartitionedVariable(self):
+    self._RunTestOverParameters(self._testSinglePartitionedVariable)
+
+  def _testSinglePartitionedVariable(self, is_training):
+    # When weights are partitioned into a single partition, the weights variable
+    # is followed by a identity -> identity (An additional identity node).
+    partitioner = partitioned_variables.fixed_size_partitioner(1)
+    graph = ops.Graph()
+    with graph.as_default():
+      with variable_scope.variable_scope('part', partitioner=partitioner):
+        batch_size, height, width, depth = 5, 128, 128, 3
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        input2 = array_ops.zeros((batch_size, height / 2, width / 2, 32))
+        conv = conv2d(
+            input1,
+            32, [5, 5],
+            stride=2,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test/test')
+        node = math_ops.add(conv, input2, name='test/add')
+        node = nn_ops.relu6(node, name='test/relu6')
+
+      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+      # Check that the weight's quant node was added.
+      op_names = [op.name for op in graph.get_operations()]
+      self.assertTrue(
+          'part/test/test/weights_quant/FakeQuantWithMinMaxVars' in op_names)
+
+  def testMultiplePartitionedVariables(self):
+    self._RunTestOverParameters(self._testMultiplePartitionedVariables)
+
+  def _testMultiplePartitionedVariables(self, is_training):
+    # When weights are partitioned into multiple partitions the weights variable
+    # is followed by a identity -> concat -> identity to group the partitions.
+    partitioner = partitioned_variables.fixed_size_partitioner(2)
+    graph = ops.Graph()
+    with graph.as_default():
+      with variable_scope.variable_scope('part', partitioner=partitioner):
+        batch_size, height, width, depth = 5, 128, 128, 3
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        input2 = array_ops.zeros((batch_size, height / 2, width / 2, 32))
+        conv = conv2d(
+            input1,
+            32, [5, 5],
+            stride=2,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test/test')
+        node = math_ops.add(conv, input2, name='test/add')
+        node = nn_ops.relu6(node, name='test/relu6')
+
+      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+      # Check that the weight's quant node was added.
+      op_names = [op.name for op in graph.get_operations()]
+      self.assertTrue(
+          'part/test/test/weights_quant/FakeQuantWithMinMaxVars' in op_names)
+
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
 
diff --git a/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py b/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py
index cf55da27236d17c709cbde689831ad68da9a8a7b..a42bbca61135a5c1666f1964c25af9c105b472bb 100644
--- a/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py
+++ b/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py
@@ -385,7 +385,7 @@ class ReceptiveFieldTest(test.TestCase):
      effective_stride_y, effective_padding_x, effective_padding_y) = (
          receptive_field.compute_receptive_field_from_graph_def(
              graph_def, input_node, output_node,
-             ['Dropout/dropout/random_uniform']))
+             ['Dropout/dropout_1/random_uniform']))
     self.assertEqual(receptive_field_x, 3)
     self.assertEqual(receptive_field_y, 3)
     self.assertEqual(effective_stride_x, 4)
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index d41fc0b3ac1cee4eacc88cb0f41df1f9ee59e7c3..b8840a8f2420f1bc6c75f0a02e5465c595378dec 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
+import os
 
 import numpy as np
 
@@ -30,6 +31,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -39,6 +41,7 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 # pylint: enable=protected-access
 Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
@@ -189,6 +192,7 @@ class RNNCellTest(test.TestCase):
           self.assertEqual(cell.dtype, None)
           self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
           self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
+          cell.get_config()  # Should not throw an error
           g, out_m = cell(x, m)
           # Layer infers the input type.
           self.assertEqual(cell.dtype, dtype.name)
@@ -439,6 +443,26 @@ class RNNCellTest(test.TestCase):
           self.assertTrue(
               float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testWrapperCheckpointing(self):
+    for wrapper_type in [
+        rnn_cell_impl.DropoutWrapper,
+        rnn_cell_impl.ResidualWrapper,
+        lambda cell: rnn_cell_impl.MultiRNNCell([cell])]:
+      with self.test_session():
+        cell = rnn_cell_impl.BasicRNNCell(1)
+        wrapper = wrapper_type(cell)
+        wrapper(array_ops.ones([1, 1]),
+                state=wrapper.zero_state(batch_size=1, dtype=dtypes.float32))
+        self.evaluate([v.initializer for v in cell.variables])
+        checkpoint = checkpointable_utils.Checkpoint(wrapper=wrapper)
+        prefix = os.path.join(self.get_temp_dir(), "ckpt")
+        self.evaluate(cell._bias.assign([40.]))
+        save_path = checkpoint.save(prefix)
+        self.evaluate(cell._bias.assign([0.]))
+        checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+        self.assertAllEqual([40.], self.evaluate(cell._bias))
+
   def testOutputProjectionWrapper(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope(
@@ -483,7 +507,13 @@ class RNNCellTest(test.TestCase):
         base_cell = rnn_cell_impl.GRUCell(3)
         g, m_new = base_cell(x, m)
         variable_scope.get_variable_scope().reuse_variables()
-        g_res, m_new_res = rnn_cell_impl.ResidualWrapper(base_cell)(x, m)
+        wrapper_object = rnn_cell_impl.ResidualWrapper(base_cell)
+        (name, dep), = wrapper_object._checkpoint_dependencies
+        wrapper_object.get_config()  # Should not throw an error
+        self.assertIs(dep, base_cell)
+        self.assertEqual("cell", name)
+
+        g_res, m_new_res = wrapper_object(x, m)
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run([g, g_res, m_new, m_new_res], {
             x: np.array([[1., 1., 1.]]),
@@ -526,7 +556,13 @@ class RNNCellTest(test.TestCase):
         "root", initializer=init_ops.constant_initializer(0.5)):
       x = array_ops.zeros([1, 3])
       m = array_ops.zeros([1, 3])
-      cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), "/cpu:14159")
+      wrapped = rnn_cell_impl.GRUCell(3)
+      cell = rnn_cell_impl.DeviceWrapper(wrapped, "/cpu:14159")
+      (name, dep), = cell._checkpoint_dependencies
+      cell.get_config()  # Should not throw an error
+      self.assertIs(dep, wrapped)
+      self.assertEqual("cell", name)
+
       outputs, _ = cell(x, m)
       self.assertTrue("cpu:14159" in outputs.device.lower())
 
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index ba4933ddf793c58d00ae28a54eb21410f41e2e16..be99a5d67a3e49b1d522406601d050392f75e963 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
@@ -142,6 +143,47 @@ class TestStateSaver(object):
     self.saved_state[name] = state
     return array_ops.identity(state)
 
+  @property
+  def batch_size(self):
+    return self._batch_size
+
+  @property
+  def state_size(self):
+    return self._state_size
+
+
+class TestStateSaverWithCounters(TestStateSaver):
+  """Class wrapper around TestStateSaver.
+
+  A dummy class used for testing of static_state_saving_rnn. It helps test if
+  save_state and state functions got called same number of time when we
+  evaluate output of rnn cell and state or either of them separately. It
+  inherits from the TestStateSaver and adds the counters for calls of functions.
+  """
+
+  def __init__(self, batch_size, state_size):
+    super(TestStateSaverWithCounters, self).__init__(batch_size, state_size)
+    self._num_state_calls = variables_lib.Variable(0)
+    self._num_save_state_calls = variables_lib.Variable(0)
+
+  def state(self, name):
+    with ops_lib.control_dependencies(
+        [state_ops.assign_add(self._num_state_calls, 1)]):
+      return super(TestStateSaverWithCounters, self).state(name)
+
+  def save_state(self, name, state):
+    with ops_lib.control_dependencies([state_ops.assign_add(
+        self._num_save_state_calls, 1)]):
+      return super(TestStateSaverWithCounters, self).save_state(name, state)
+
+  @property
+  def num_state_calls(self):
+    return self._num_state_calls
+
+  @property
+  def num_save_state_calls(self):
+    return self._num_save_state_calls
+
 
 class RNNTest(test.TestCase):
 
@@ -186,6 +228,9 @@ class RNNTest(test.TestCase):
     cell = Plus1RNNCell()
     full_dropout_cell = rnn_cell.DropoutWrapper(
         cell, input_keep_prob=1e-12, seed=0)
+    (name, dep), = full_dropout_cell._checkpoint_dependencies
+    self.assertIs(dep, cell)
+    self.assertEqual("cell", name)
     batch_size = 2
     input_size = 5
     max_length = 8
@@ -1792,13 +1837,40 @@ class StateSaverRNNTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
-  def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
+  def _factory(self, scope, state_saver):
+    num_units = state_saver.state_size // 2
+    batch_size = state_saver.batch_size
+    input_size = 5
+    max_length = 8
+    initializer = init_ops.random_uniform_initializer(
+        -0.01, 0.01, seed=self._seed)
+    cell = rnn_cell.LSTMCell(
+        num_units,
+        use_peepholes=False,
+        initializer=initializer,
+        state_is_tuple=False)
+    inputs = max_length * [
+        array_ops.zeros(dtype=dtypes.float32, shape=(batch_size, input_size))
+    ]
+    out, state = rnn.static_state_saving_rnn(
+        cell,
+        inputs,
+        state_saver=state_saver,
+        state_name="save_lstm",
+        scope=scope)
+    return out, state, state_saver
+
+  def _testScope(self, prefix="prefix", use_outer_scope=True):
+    num_units = 3
+    batch_size = 2
+    state_saver = TestStateSaver(batch_size, 2 * num_units)
+
     with self.test_session(use_gpu=True, graph=ops_lib.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
-          factory(scope)
+          self._factory(scope=scope, state_saver=state_saver)
       else:
-        factory(prefix)
+        self._factory(scope=prefix, state_saver=state_saver)
         variables_lib.global_variables_initializer()
 
       # check that all the variables names starts
@@ -1813,34 +1885,46 @@ class StateSaverRNNTest(test.TestCase):
       self.assertEqual(len(scope_vars), len(all_vars))
 
   def testStateSaverRNNScope(self):
-    num_units = 3
-    input_size = 5
-    batch_size = 2
-    max_length = 8
+    self._testScope(use_outer_scope=True)
+    self._testScope(use_outer_scope=False)
+    self._testScope(prefix=None, use_outer_scope=False)
 
-    def factory(scope):
-      initializer = init_ops.random_uniform_initializer(
-          -0.01, 0.01, seed=self._seed)
-      state_saver = TestStateSaver(batch_size, 2 * num_units)
-      cell = rnn_cell.LSTMCell(
-          num_units,
-          use_peepholes=False,
-          initializer=initializer,
-          state_is_tuple=False)
-      inputs = max_length * [
-          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
-      ]
-      return rnn.static_state_saving_rnn(
-          cell,
-          inputs,
-          state_saver=state_saver,
-          state_name="save_lstm",
-          scope=scope)
+  def testStateSaverCallsSaveState(self):
+    """Test that number of calls to state and save_state is equal.
 
-    self._testScope(factory, use_outer_scope=True)
-    self._testScope(factory, use_outer_scope=False)
-    self._testScope(factory, prefix=None, use_outer_scope=False)
+    Test if the order of actual evaluating or skipping evaluation of out,
+    state tensors, which are the output tensors from static_state_saving_rnn,
+    have influence on number of calls to save_state and state methods of
+    state_saver object (the number of calls should be same.)
+    """
 
+    num_units = 3
+    batch_size = 2
+    state_saver = TestStateSaverWithCounters(batch_size, 2 * num_units)
+    out, state, state_saver = self._factory(scope=None, state_saver=state_saver)
+
+    with self.test_session() as sess:
+      sess.run(variables_lib.global_variables_initializer())
+      sess.run(variables_lib.local_variables_initializer())
+
+      _, _, num_state_calls, num_save_state_calls = sess.run([
+          out,
+          state,
+          state_saver.num_state_calls,
+          state_saver.num_save_state_calls])
+      self.assertEqual(num_state_calls, num_save_state_calls)
+
+      _, num_state_calls, num_save_state_calls = sess.run([
+          out,
+          state_saver.num_state_calls,
+          state_saver.num_save_state_calls])
+      self.assertEqual(num_state_calls, num_save_state_calls)
+
+      _, num_state_calls, num_save_state_calls = sess.run([
+          state,
+          state_saver.num_state_calls,
+          state_saver.num_save_state_calls])
+      self.assertEqual(num_state_calls, num_save_state_calls)
 
 class GRUTest(test.TestCase):
 
diff --git a/tensorflow/contrib/signal/python/ops/window_ops.py b/tensorflow/contrib/signal/python/ops/window_ops.py
index 50094010dc75cf8b3c62da5e3a7ed5e995e6df41..59e67e8ba414df1f9c777d1f5a3f3dba975648a2 100644
--- a/tensorflow/contrib/signal/python/ops/window_ops.py
+++ b/tensorflow/contrib/signal/python/ops/window_ops.py
@@ -47,7 +47,7 @@ def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None):
   Raises:
     ValueError: If `dtype` is not a floating point type.
 
-  [hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window
+  [hann]: https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
   """
   return _raised_cosine_window(name, 'hann_window', window_length, periodic,
                                dtype, 0.5, 0.5)
@@ -72,7 +72,7 @@ def hamming_window(window_length, periodic=True, dtype=dtypes.float32,
   Raises:
     ValueError: If `dtype` is not a floating point type.
 
-  [hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window
+  [hamming]: https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
   """
   return _raised_cosine_window(name, 'hamming_window', window_length, periodic,
                                dtype, 0.54, 0.46)
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 746b95564237617359afe1791484809369c4a894..f2bb458848fab5603128903868b52f29785efc92 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -909,3 +909,8 @@ slim.evaluation.evaluation_loop(
 
 ## Authors
 Sergio Guadarrama and Nathan Silberman
+
+## Citation
+"TensorFlow-Slim: a lightweight library for defining, training and evaluating complex models in TensorFlow"
+S. Guadarrama, N. Silberman, 2016.
+https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
index f2d31dc8db5688dc9a3308267109214277436040..d877831fce99a30c4f1aa104d70a6d588a768de7 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
@@ -102,7 +102,7 @@ class BoundingBox(ItemHandler):
   """An ItemHandler that concatenates a set of parsed Tensors to Bounding Boxes.
   """
 
-  def __init__(self, keys=None, prefix=None):
+  def __init__(self, keys=None, prefix=''):
     """Initialize the bounding box handler.
 
     Args:
diff --git a/tensorflow/contrib/sparsemax/BUILD b/tensorflow/contrib/sparsemax/BUILD
index b729fff261192be22c6a56fa9ca0a641f302c570..d7ba754f701d4b433e35ad8396eae7ee6132b97f 100644
--- a/tensorflow/contrib/sparsemax/BUILD
+++ b/tensorflow/contrib/sparsemax/BUILD
@@ -38,7 +38,7 @@ py_library(
 
 cuda_py_tests(
     name = "sparsemax_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/sparsemax_test.py"],
     additional_deps = [
         ":sparsemax_py",
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index 99ced53e1167ec5486d0b75cff81ffbf857c2be7..d22b80ac88a9ced541a952fcbb58c50366464075 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -21,6 +21,7 @@ from @{tf.summary.merge_all} to @{tf.summary.FileWriter}.
 
 To use with eager execution enabled, write your code as follows:
 
+```python
 global_step = tf.train.get_or_create_global_step()
 summary_writer = tf.contrib.summary.create_file_writer(
     train_dir, flush_millis=10000)
@@ -30,9 +31,11 @@ with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
   tf.contrib.summary.scalar("loss", my_loss)
   # In this case every call to tf.contrib.summary.scalar will generate a record
   # ...
+```
 
 To use it with graph execution, write your code as follows:
 
+```python
 global_step = tf.train.get_or_create_global_step()
 summary_writer = tf.contrib.summary.create_file_writer(
     train_dir, flush_millis=10000)
@@ -53,7 +56,7 @@ with tf.Session(...) as sess:
   while not_done_training:
     sess.run([train_op, tf.contrib.summary.all_summary_ops()])
     # ...
-
+```
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
index 7a35a70bbe3112e0649cefd8116cc50565978da5..6f62cd11a9733949c350e35b6b0c436dd097cc33 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
@@ -295,7 +295,7 @@ def get_epoch_variable():
 
 
 # A simple container to hold the training variables for a single tree.
-class TreeTrainingVariables(object):
+class TreeVariables(object):
   """Stores tf.Variables for training a single random tree.
 
   Uses tf.get_variable to get tree-specific names so that this can be used
@@ -303,7 +303,7 @@ class TreeTrainingVariables(object):
   then relies on restoring that model to evaluate).
   """
 
-  def __init__(self, params, tree_num, training):
+  def __init__(self, params, tree_num, training, tree_config='', tree_stat=''):
     if (not hasattr(params, 'params_proto') or
         not isinstance(params.params_proto,
                        _params_proto.TensorForestParams)):
@@ -315,27 +315,28 @@ class TreeTrainingVariables(object):
       # TODO(gilberth): Manually shard this to be able to fit it on
       # multiple machines.
       self.stats = stats_ops.fertile_stats_variable(
-          params, '', self.get_tree_name('stats', tree_num))
+          params, tree_stat, self.get_tree_name('stats', tree_num))
     self.tree = model_ops.tree_variable(
-        params, '', self.stats, self.get_tree_name('tree', tree_num))
+        params, tree_config, self.stats, self.get_tree_name('tree', tree_num))
 
   def get_tree_name(self, name, num):
     return '{0}-{1}'.format(name, num)
 
 
-class ForestTrainingVariables(object):
+class ForestVariables(object):
   """A container for a forests training data, consisting of multiple trees.
 
-  Instantiates a TreeTrainingVariables object for each tree. We override the
+  Instantiates a TreeVariables object for each tree. We override the
   __getitem__ and __setitem__ function so that usage looks like this:
 
-    forest_variables = ForestTrainingVariables(params)
+    forest_variables = ForestVariables(params)
 
     ... forest_variables.tree ...
   """
 
   def __init__(self, params, device_assigner, training=True,
-               tree_variables_class=TreeTrainingVariables):
+               tree_variables_class=TreeVariables,
+               tree_configs=None, tree_stats=None):
     self.variables = []
     # Set up some scalar variables to run through the device assigner, then
     # we can use those to colocate everything related to a tree.
@@ -347,7 +348,13 @@ class ForestTrainingVariables(object):
 
     for i in range(params.num_trees):
       with ops.device(self.device_dummies[i].device):
-        self.variables.append(tree_variables_class(params, i, training))
+        kwargs = {}
+        if tree_configs is not None:
+          kwargs.update(dict(tree_config=tree_configs[i]))
+        if tree_stats is not None:
+          kwargs.update(dict(tree_stat=tree_stats[i]))
+        self.variables.append(tree_variables_class(
+            params, i, training, **kwargs))
 
   def __setitem__(self, t, val):
     self.variables[t] = val
@@ -361,9 +368,11 @@ class RandomForestGraphs(object):
 
   def __init__(self,
                params,
+               tree_configs=None,
+               tree_stats=None,
                device_assigner=None,
                variables=None,
-               tree_variables_class=TreeTrainingVariables,
+               tree_variables_class=TreeVariables,
                tree_graphs=None,
                training=True):
     self.params = params
@@ -371,9 +380,10 @@ class RandomForestGraphs(object):
         device_assigner or framework_variables.VariableDeviceChooser())
     logging.info('Constructing forest with params = ')
     logging.info(self.params.__dict__)
-    self.variables = variables or ForestTrainingVariables(
+    self.variables = variables or ForestVariables(
         self.params, device_assigner=self.device_assigner, training=training,
-        tree_variables_class=tree_variables_class)
+        tree_variables_class=tree_variables_class,
+        tree_configs=tree_configs, tree_stats=tree_stats)
     tree_graph_class = tree_graphs or RandomTreeGraphs
     self.trees = [
         tree_graph_class(self.variables[i], self.params, i)
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
index bbe627b15773fafe83a0700da696f429876c0968..1c9c81827e0f251c8ae7bc47242334fb202835ac 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
@@ -18,10 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from google.protobuf.json_format import ParseDict
+from tensorflow.contrib.decision_trees.proto import generic_tree_model_pb2 as _tree_proto
 from tensorflow.contrib.tensor_forest.python import tensor_forest
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
@@ -110,6 +114,47 @@ class TensorForestTest(test_util.TensorFlowTestCase):
     self.assertTrue(isinstance(paths, ops.Tensor))
     self.assertTrue(isinstance(var, ops.Tensor))
 
+  def testInfrenceFromRestoredModel(self):
+    input_data = [[-1., 0.], [-1., 2.],  # node 1
+                  [1., 0.], [1., -2.]]  # node 2
+    expected_prediction = [[0.0, 1.0], [0.0, 1.0],
+                           [0.0, 1.0], [0.0, 1.0]]
+    hparams = tensor_forest.ForestHParams(
+        num_classes=2,
+        num_features=2,
+        num_trees=1,
+        max_nodes=1000,
+        split_after_samples=25).fill()
+    tree_weight = {'decisionTree':
+                       {'nodes':
+                        [{'binaryNode':
+                          {'rightChildId': 2,
+                           'leftChildId': 1,
+                           'inequalityLeftChildTest':
+                           {'featureId': {'id': '0'},
+                            'threshold': {'floatValue': 0}}}},
+                         {'leaf': {'vector':
+                                   {'value': [{'floatValue': 0.0},
+                                              {'floatValue': 1.0}]}},
+                          'nodeId': 1},
+                         {'leaf': {'vector':
+                                   {'value': [{'floatValue': 0.0},
+                                              {'floatValue': 1.0}]}},
+                          'nodeId': 2}]}}
+    restored_tree_param = ParseDict(tree_weight,
+                                    _tree_proto.Model()).SerializeToString()
+    graph_builder = tensor_forest.RandomForestGraphs(hparams,
+                                                     [restored_tree_param])
+    probs, paths, var = graph_builder.inference_graph(input_data)
+    self.assertTrue(isinstance(probs, ops.Tensor))
+    self.assertTrue(isinstance(paths, ops.Tensor))
+    self.assertTrue(isinstance(var, ops.Tensor))
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+      self.assertEquals(probs.eval().shape, (4, 2))
+      self.assertEquals(probs.eval().tolist(), expected_prediction)
+
   def testTrainingConstructionClassificationSparse(self):
     input_data = sparse_tensor.SparseTensor(
         indices=[[0, 0], [0, 3], [1, 0], [1, 7], [2, 1], [3, 9]],
diff --git a/tensorflow/contrib/tensorboard/graph_explorer/proto/graph_explorer.proto b/tensorflow/contrib/tensorboard/graph_explorer/proto/graph_explorer.proto
deleted file mode 100644
index 835337ed5c58d0f0595ce8a88f08c8e63a860a36..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorboard/graph_explorer/proto/graph_explorer.proto
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the 'License');
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an 'AS IS' BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-
-// GraphExplorer is a tool that supports interactive, hierarchical visualization
-// of graphs. GraphExplorer renders graphs generated by TensorFlow represented
-// as GraphDef messages defined in tensorflow/core/framework/graph.proto. The
-// GraphDef proto does not allow for explicitly specifying visual attributes of
-// the graph such as color, line thickness, fonts, etc. This file introduces a
-// new proto for representing graphs and specifying visual attributes of graphs.
-//
-// The structure of the Graph proto is given by the EBNF grammar below. Consult
-// the message definitions below for details.
-//
-//  graph ::= node* edge* node_attribute* metanode_attribute* edge_attribute*
-//            graph_attribute*
-//  node  ::= node_id node_attribute* metanode_attribute* node_data*
-//  edge  ::= source_id target_id edge_attribute* edge_data*
-//
-// A graph consists of a list of nodes and a list of edges and attributes for
-// nodes, edges and the graph. Attributes have a name and a value and are
-// represented as key-value pairs, with {"color", "blue"} being an example.
-// Attributes have a scope, where the broadest scope is the graph and the
-// narrowest is a node that has no internal structure.
-syntax = "proto3";
-
-package graph_explorer;
-
-// There are two types of nodes. A 'metanode' contains other
-// nodes and a 'leaf node' has no internal structure.  The metanode containment
-// relationship is acyclic, meaning that if a metanode 'A' contains the metanode
-// 'B', then 'B' cannot contain 'A'.
-message Node {
-  // The identifier of a node is a sequence of strings separated by '/'. The
-  // identifier provides a unique name for a node and defines its hierarchical
-  // relation to other nodes.  If no label is provided  the last part of the
-  // identifier is used as a label.
-  //
-  // Example: In the graph below, metanodes are written with square brackets and
-  // leaf nodes with parentheses. The metanode 'node1' contains the leaf node
-  // 'node4' and the metanode 'node2', which contains the leaf node 'node3'.
-  //
-  //   [node1 [node2 (node3)] (node4)]
-  //
-  // The identifiers for these nodes are: "node1", "node1/node2",
-  // "node1/node2/node3", and "node1/node4".
-  string name = 1;
-
-  // A node attribute is information used by Graph Explorer to style a node.
-  map<string, string> node_attr = 2;
-
-  // A metanode attribute is one that is inherited by all nodes inside the
-  // current metanode. If an attribute applies only to the current node and
-  // should not be inherited, it should be specified as a node attribute.
-  map<string, string> metanode_attr = 3;
-};
-
-// An edge consists of a source and a target node, specified by their
-// identifiers. An edge has attributes and data that are similar to node
-// attributes and node data. Edges do not form a hierarchy so there are no
-// metanode attributes.
-message Edge {
-  // The source and target fields must have the format of a Node name.
-  string source = 1;
-  string target = 2;
-
-  // Edge attributes.
-  map<string, string> edge_attr = 3;
-}
-
-message Graph {
-  // List of nodes in the graph.
-  repeated Node node = 1;
-
-  // List of edges in the graph.
-  repeated Edge edge = 2;
-
-  // Default values of node, metanode and edge attributes.
-  map<string, string> node_attr = 3;
-  map<string, string> metanode_attr = 4;
-  map<string, string> edge_attr = 5;
-
-  // Graph attributes.
-  map<string, string> graph_attr = 6;
-};
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 675f0b1fd6ede56e38da3bd9dad4ae61e11a9be8..7a8a71ac7f491ec48a47ae1ea1aff750a587beaa 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -67,6 +67,7 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = [
         ":trt_logging",
+        ":trt_plugins",
     ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
     ]) + tf_custom_op_library_additional_deps(),
@@ -86,6 +87,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":trt_logging",
+        ":trt_plugins",
         ":trt_resources",
         "//tensorflow/core:gpu_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
@@ -232,6 +234,7 @@ tf_cuda_library(
     ],
     deps = [
         ":segment",
+        ":trt_plugins",
         ":trt_logging",
         ":trt_resources",
         "//tensorflow/core/grappler/clusters:cluster",
@@ -263,7 +266,6 @@ cc_library(
         "segment/segment.h",
         "segment/union_find.h",
     ],
-    linkstatic = 1,
     deps = [
         "//tensorflow/core:graph",
         "//tensorflow/core:lib_proto_parsing",
@@ -286,6 +288,46 @@ tf_cc_test(
     ],
 )
 
+# Library for the plugin factory
+tf_cuda_library(
+    name = "trt_plugins",
+    srcs = [
+        "plugin/trt_plugin.cc",
+        "plugin/trt_plugin_factory.cc",
+        "plugin/trt_plugin_utils.cc",
+    ],
+    hdrs = [
+        "plugin/trt_plugin.h",
+        "plugin/trt_plugin_factory.h",
+        "plugin/trt_plugin_utils.h",
+    ],
+    deps = [
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:platform_base",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "trt_plugin_factory_test",
+    size = "small",
+    srcs = ["plugin/trt_plugin_factory_test.cc"],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        ":trt_plugins",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
 py_test(
     name = "tf_trt_integration_test",
     srcs = ["test/tf_trt_integration_test.py"],
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 4df54a749f5a2dc4884fb437a7a16cd3bb51fa17..b7b26cfb1c05ae74e932c8b9cb2479cfca308514 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #include <list>
 #include <map>
@@ -77,7 +78,8 @@ bool IsTensorRTCandidate(const tensorflow::Node* node) {
       // TODO(ben,jie): ...
   };
   // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
-  return candidate_ops.count(node->type_string());
+  return (candidate_ops.count(node->type_string()) ||
+          PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
 }
 
 void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 3767596f8c20a300b5cae401e6abb7f9b7e03834..32b211dcd1e282d334327b83a27f9401de7f310a 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #include <algorithm>
 #include <list>
@@ -240,35 +241,49 @@ class TFAttrs {
     return attrs_.at(key);
   }
   template <typename T>
-  T get(string key) const;
+  T get(const string& key) const;
   template <typename T>
-  T get(string key, const T& default_value) const {
+  T get(const string& key, const T& default_value) const {
     return attrs_.count(key) ? this->get<T>(key) : default_value;
   }
 
+  std::vector<string> GetAllAttrKey() {
+    std::vector<string> attr_list;
+    for (const auto& attr_item : attrs_) {
+      attr_list.emplace_back(attr_item.first);
+    }
+    return attr_list;
+  }
+
  private:
   typedef std::map<string, tensorflow::AttrValue const*> AttrMap;
   AttrMap attrs_;
 };
 
 template <>
-string TFAttrs::get<string>(string key) const {
+string TFAttrs::get<string>(const string& key) const {
   return this->at(key)->s();
 }
 
 template <>
-std::vector<int> TFAttrs::get<std::vector<int>>(string key) const {
+std::vector<int> TFAttrs::get<std::vector<int>>(const string& key) const {
   auto attr = this->at(key)->list().i();
   return std::vector<int>(attr.begin(), attr.end());
 }
 
 template <>
-std::vector<string> TFAttrs::get<std::vector<string>>(string key) const {
+std::vector<float> TFAttrs::get<std::vector<float>>(const string& key) const {
+  auto attr = this->at(key)->list().f();
+  return std::vector<float>(attr.begin(), attr.end());
+}
+
+template <>
+std::vector<string> TFAttrs::get<std::vector<string>>(const string& key) const {
   auto attr = this->at(key)->list().s();
   return std::vector<string>(attr.begin(), attr.end());
 }
 template <>
-nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
+nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(const string& key) const {
   auto values = this->get<std::vector<int>>(key);
   nvinfer1::Dims dims;
   dims.nbDims = values.size();
@@ -278,24 +293,25 @@ nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
 }
 
 template <>
-nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(string key) const {
+nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(const string& key) const {
   nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
   TF_CHECK_OK(ConvertDType(this->at(key)->type(), &trt_dtype));
   return trt_dtype;
 }
 
 template <>
-tensorflow::DataType TFAttrs::get<tensorflow::DataType>(string key) const {
+tensorflow::DataType TFAttrs::get<tensorflow::DataType>(
+    const string& key) const {
   return this->at(key)->type();
 }
 
 template <>
-float TFAttrs::get<float>(string key) const {
+float TFAttrs::get<float>(const string& key) const {
   return this->at(key)->f();
 }
 
 template <>
-bool TFAttrs::get<bool>(string key) const {
+bool TFAttrs::get<bool>(const string& key) const {
   return this->at(key)->b();
 }
 
@@ -346,11 +362,10 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
       break;
     }
     case tensorflow::DataType::DT_HALF: {
-      Reorder2(
-          {k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
-          istrides,
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues())),
-          ostrides);
+      Reorder2({k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
+               istrides, static_cast<Eigen::half*>(
+                             const_cast<void*>(oweights->GetValues())),
+               ostrides);
       break;
     }
     default:
@@ -425,6 +440,7 @@ using OpConverter =
 class Converter {
   std::unordered_map<string, TRT_TensorOrWeights> trt_tensors_;
   std::unordered_map<string, OpConverter> op_registry_;
+  OpConverter plugin_converter_;
   nvinfer1::INetworkDefinition* trt_network_;
   std::list<std::vector<uint8_t>> temp_bufs_;
   tensorflow::tensorrt::TRTWeightStore* weight_store_;
@@ -491,13 +507,17 @@ class Converter {
     std::vector<TRT_TensorOrWeights> inputs;
     TF_RETURN_IF_ERROR(this->get_inputs(node_def, &inputs));
     string op = node_def.op();
-    if (!op_registry_.count(op)) {
-      return tensorflow::errors::Unimplemented(
-          "No converter registered for op: " + op);
-    }
-    OpConverter op_converter = op_registry_.at(op);
     std::vector<TRT_TensorOrWeights> outputs;
-    TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs));
+    if (PluginFactoryTensorRT::GetInstance()->IsPlugin(op)) {
+      TF_RETURN_IF_ERROR(plugin_converter_(*this, node_def, inputs, &outputs));
+    } else {
+      if (!op_registry_.count(op)) {
+        return tensorflow::errors::Unimplemented(
+            "No converter registered for op: " + op);
+      }
+      OpConverter op_converter = op_registry_.at(op);
+      TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs));
+    }
     for (size_t i = 0; i < outputs.size(); ++i) {
       TRT_TensorOrWeights output = outputs.at(i);
       // TODO(jie): tf protobuf seems to be omitting the :0 suffix
@@ -1159,9 +1179,9 @@ tensorflow::Status BinaryTensorOpTensor(
   CHECK_EQ_TYPE(tensor_r->getType(), dtype);
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented(
-        "binary op: " + node_def.op() +
-        " not supported at: " + node_def.name());
+    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+                                             " not supported at: " +
+                                             node_def.name());
 
   nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
       *const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -1174,6 +1194,45 @@ tensorflow::Status BinaryTensorOpTensor(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertPlugin(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  // prepare input
+  std::vector<nvinfer1::ITensor*> all_inputs;
+  for (auto input : inputs) {
+    all_inputs.emplace_back(const_cast<nvinfer1::ITensor*>(input.tensor()));
+  }
+
+  // plugin is owned by PluginFactory
+  // TODO(jie): destroy plugins later (resource management)
+  PluginTensorRT* plugin =
+      PluginFactoryTensorRT::GetInstance()->CreatePlugin(node_def.op());
+
+  // passing attributes
+  // TODO(jie): support more general attribute
+  TFAttrs attrs(node_def);
+  auto attr_key_vector = attrs.GetAllAttrKey();
+  for (auto attr_key : attr_key_vector) {
+    // TODO(jie): support only list of float for toy example here.
+    auto data = attrs.get<std::vector<float>>(attr_key);
+    size_t size_data = data.size() * sizeof(float);
+    if (!plugin->SetAttribute(attr_key, static_cast<void*>(data.data()),
+                              size_data)) {
+      return tensorflow::errors::InvalidArgument("plugin SetAttribute failed");
+    }
+  }
+
+  nvinfer1::IPluginLayer* layer = ctx.network()->addPlugin(
+      &all_inputs[0], static_cast<int>(inputs.size()), *plugin);
+
+  for (int i = 0; i < layer->getNbOutputs(); i++) {
+    nvinfer1::ITensor* output_tensor = layer->getOutput(i);
+    outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  }
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertPlaceholder(
     Converter& ctx, const tensorflow::NodeDef& node_def,
     const std::vector<TRT_TensorOrWeights>& inputs,
@@ -2074,6 +2133,8 @@ void Converter::register_op_converters() {
   op_registry_["Reshape"] = ConvertReshape;
   op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
   op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
+
+  plugin_converter_ = ConvertPlugin;
 }
 
 }  // namespace
@@ -2214,64 +2275,63 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
-  // Visit nodes in reverse topological order and construct the TRT network.
-
-  // Toposort
+tensorflow::Status ReverseTopologicalSort(
+    const tensorrt::convert::SubGraphParams& s,
+    std::list<tensorflow::Node*>* order) {
   std::vector<tensorflow::Node*> order_vec;
   tensorflow::GetPostOrder(s.graph, &order_vec);
   // Select just the subgraph
-  std::list<tensorflow::Node*> order;
   for (tensorflow::Node* node : order_vec) {
     if (s.subgraph_node_ids.count(node->id())) {
-      order.push_front(node);  // we want topological order to construct the
+      // We want topological order to contstruct the
       // network layer by layer
+      order->push_front(node);
     }
   }
-  // topological order is needed to build TRT network
-  static int static_id = 0;
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status SetInputList(
+    const tensorrt::convert::SubGraphParams& s,
+    tensorflow::NodeDefBuilder* op_builder,
+    const std::vector<string>* input_names,
+    std::vector<tensorflow::DataType>* input_dtypes) {
+  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
+  VLOG(2) << "input edge size: " << input_names->size();
+  for (size_t i = 0; i < input_names->size(); ++i) {
+    VLOG(2) << "input edges: " << i << " " << input_names->at(i);
+    int output_idx = s.input_inds.at(i).second;
+    // we wired up the input here already, it is redundant to do it again in
+    //  ConvertSubGraphToTensorRT(convert_graph.cc)
+    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
+        input_names->at(i), output_idx, input_dtypes->at(i));
+    income_edges.push_back(incoming_edge);
+  }
+  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
+      income_edges);
+  op_builder->Input(input_list);
+  return tensorflow::Status::OK();
+}
+
+string SubgraphNameScopeGenerator(const std::list<tensorflow::Node*>* order) {
   string subgraph_name_scope;
-  if (!order.empty()) {
-    subgraph_name_scope = order.front()->name();
+  if (!order->empty()) {
+    subgraph_name_scope = order->front()->name();
   }
-  for (const tensorflow::Node* node : order) {
+  for (const tensorflow::Node* node : *order) {
     subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
   }
   // TODO(sami,ben,jie): proper naming!
-  string calib_op_name =
-      StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id);
-  string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id);
-  static_id++;
-  auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
-  auto op_rmgr = trt_rmgr->getManager("TRTCalibOps");
-  auto op_res = new tensorflow::tensorrt::TRTCalibrationResource();
-  TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res));
-  op_res->logger_ = new tensorflow::tensorrt::Logger();
-  cudaSetDevice(s.cuda_gpu_id_);
-  op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_));
-  op_res->allocator_ = s.allocator_;
-#if NV_TENSORRT_MAJOR > 3
-  op_res->builder_->setGpuAllocator(s.allocator_.get());
-#endif
-  if (!op_res->builder_) {
-    return tensorflow::errors::Internal(
-        "failed to create TensorRT builder object");
-  }
-
-  op_res->network_ = op_res->builder_->createNetwork();
-  if (!op_res->network_) {
-    return tensorflow::errors::Internal(
-        "failed to create TensorRT network object");
-  }
-
-  // Build the network
-  auto weight_rmgr = trt_rmgr->getManager("WeightStore");
-  auto ws = new tensorflow::tensorrt::TRTWeightStore();
-  TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
-  Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
+  return subgraph_name_scope;
+}
 
-  std::vector<string> input_names;
-  std::vector<tensorflow::DataType> input_dtypes;
+tensorflow::Status ConvertSubgraph(
+    Converter& converter, tensorrt::convert::SubGraphParams& s,
+    std::list<tensorflow::Node*>* order, std::vector<string>* input_names,
+    std::vector<tensorflow::DataType>* input_dtypes,
+    std::vector<string>* output_names,
+    std::vector<tensorflow::DataType>* output_dtypes,
+    const string& engine_name) {
   for (const std::pair<int, int>& input : s.input_inds) {
     VLOG(2) << "parsing input. Node id= " << input.first;
     int node_id = input.first;
@@ -2314,19 +2374,18 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
 
     auto op_info = op_info_vec.at(shape_inference_output_idx);
     tensorflow::DataType tf_dtype = op_info.dtype();
-    input_dtypes.push_back(tf_dtype);
+    input_dtypes->push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
     auto type_status = ConvertDType(tf_dtype, &dtype);
     if (type_status != tensorflow::Status::OK()) {
-      LOG(WARNING) << "Data type conversion for input '" << node_name
-                   << "' failed";
+      LOG(WARNING) << "Type conversion failed for " << node_name;
       return type_status;
     }
 
-    VLOG(2) << "accessing output index of: " << output_idx
+    VLOG(2) << "Accessing output index of: " << output_idx
             << ", at node: " << node_name
-            << "with output entry from shape_map: " << op_info_vec.size();
+            << " with output entry from shape_map: " << op_info_vec.size();
     // TODO(ben,jie): update TRT input format/dimension
     nvinfer1::DimsCHW input_dim_pseudo_chw;
     for (int i = 0; i < 3; i++) input_dim_pseudo_chw.d[i] = 1;
@@ -2352,33 +2411,29 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
       input_tensor_name = StrCat(node_name, ":", output_idx);
     }
 
-    input_names.push_back(input_tensor_name);
+    input_names->push_back(input_tensor_name);
     nvinfer1::ITensor* input_tensor = converter.network()->addInput(
         input_tensor_name.c_str(), dtype, input_dim_pseudo_chw);
 
     if (!input_tensor)
       return tensorflow::errors::InvalidArgument(
           "Failed to create Input layer");
-    VLOG(2) << "input tensor name :" << input_tensor_name;
+    VLOG(2) << "Input tensor name :" << input_tensor_name;
 
     if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
       return tensorflow::errors::AlreadyExists(
-          "output tensor already exists for op: " + input_tensor_name);
+          "Output tensor already exists for op: " + input_tensor_name);
   }
 
-  VLOG(2) << "finished sorting";
-
-  for (const tensorflow::Node* node : order) {
+  for (const tensorflow::Node* node : *order) {
     const tensorflow::NodeDef& node_def = node->def();
-    VLOG(2) << "converting node: " << node_def.name() << " , " << node_def.op();
+    VLOG(2) << "Converting node: " << node_def.name() << " , " << node_def.op();
     TF_RETURN_IF_ERROR(converter.convert_node(node_def));
   }
 
-  VLOG(2) << "finished conversion";
+  VLOG(2) << "Finished conversion";
 
   // Gather output metadata
-  std::vector<string> output_names;
-  std::vector<tensorflow::DataType> output_dtypes;
   int trt_engine_op_output_idx = 0;
   for (const std::pair<int, int>& output : s.output_inds) {
     int node_id = output.first;
@@ -2393,14 +2448,13 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
              : StrCat(engine_name, ":", trt_engine_op_output_idx),
          {output_idx, tensor_name}});
     trt_engine_op_output_idx++;
-    if (output_idx != 0) {
-      tensor_name = StrCat(tensor_name, ":", output_idx);
-    }
-    VLOG(1) << "output tensor name: " << tensor_name;
-    output_names.push_back(tensor_name);
+    if (output_idx != 0)
+      tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
+    VLOG(2) << "Output tensor name: " << tensor_name;
+    output_names->push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument("Output node'" + tensor_name +
+      return tensorflow::errors::InvalidArgument("Output node '" + tensor_name +
                                                  "' is weights not tensor");
     }
     nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
@@ -2410,12 +2464,65 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     }
     converter.network()->markOutput(*tensor);
     tensorflow::DataType tf_dtype = node->output_type(output_idx);
-    output_dtypes.push_back(tf_dtype);
+    output_dtypes->push_back(tf_dtype);
     nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
     TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
     tensor->setType(trt_dtype);
   }
 
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
+  // Visit nodes in reverse topological order and construct the TRT network.
+  // Toposort
+  std::list<tensorflow::Node*> order;
+  TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order));
+
+  static int static_id = 0;
+  string subgraph_name_scope = SubgraphNameScopeGenerator(&order);
+  // TODO(sami,ben,jie): proper naming!
+  string calib_op_name =
+      StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id);
+  string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id);
+  static_id++;
+
+  auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
+  auto op_rmgr = trt_rmgr->getManager("TRTCalibOps");
+  auto op_res = new tensorflow::tensorrt::TRTCalibrationResource();
+  TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res));
+  op_res->logger_ = new tensorflow::tensorrt::Logger();
+  cudaSetDevice(s.cuda_gpu_id_);
+  op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_));
+  op_res->allocator_ = s.allocator_;
+#if NV_TENSORRT_MAJOR > 3
+  op_res->builder_->setGpuAllocator(s.allocator_.get());
+#endif
+  if (!op_res->builder_) {
+    return tensorflow::errors::Internal(
+        "failed to create TensorRT builder object");
+  }
+
+  op_res->network_ = op_res->builder_->createNetwork();
+  if (!op_res->network_) {
+    return tensorflow::errors::Internal(
+        "failed to create TensorRT network object");
+  }
+
+  // Build the network
+  auto weight_rmgr = trt_rmgr->getManager("WeightStore");
+  auto ws = new tensorflow::tensorrt::TRTWeightStore();
+  TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
+  Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
+
+  std::vector<string> input_names;
+  std::vector<tensorflow::DataType> input_dtypes;
+  std::vector<string> output_names;
+  std::vector<tensorflow::DataType> output_dtypes;
+  TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names,
+                                     &input_dtypes, &output_names,
+                                     &output_dtypes, engine_name));
+
   VLOG(2) << "Finished processing outputs";
 
   // Build the engine
@@ -2427,21 +2534,8 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
   // Build the TRT op
   // TODO(sami,ben,jie): proper naming!
   tensorflow::NodeDefBuilder op_builder(calib_op_name, "TRTCalibOp");
-  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  for (size_t i = 0; i < input_names.size(); ++i) {
-    int output_idx = s.input_inds.at(i).second;
-    // we wired up the input here already, it is redundant to do it again in
-    //  ConvertSubGraphToTensorRT(convert_graph.cc)
-    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
-        input_names.at(i), output_idx, input_dtypes.at(i));
-    VLOG(1) << calib_op_name << " input " << i << " = " << input_names.at(i)
-            << ":" << output_idx
-            << " dType= " << tensorflow::DataTypeString(input_dtypes.at(i));
-    income_edges.push_back(incoming_edge);
-  }
-  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
-      income_edges);
-  op_builder.Input(input_list);
+  SetInputList(s, &op_builder, &input_names, &input_dtypes);
+
   std::vector<string> segment_names;
   segment_names.reserve(s.subgraph_node_ids.size());
   for (int i : s.subgraph_node_ids) {
@@ -2465,20 +2559,12 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
 tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     tensorrt::convert::SubGraphParams& s) {
   // Visit nodes in reverse topological order and construct the TRT network.
-
-  // Toposort
-  std::vector<tensorflow::Node*> order_vec;
-  tensorflow::GetPostOrder(s.graph, &order_vec);
-  // Select just the subgraph
   std::list<tensorflow::Node*> order;
-  for (tensorflow::Node* node : order_vec) {
-    if (s.subgraph_node_ids.count(node->id())) {
-      // We want topological order to contstruct the
-      // network layer by layer
-      order.push_front(node);
-    }
-  }
-  // Topological order is needed to build TRT network
+  TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order));
+
+  static int static_id = 0;
+  string subgraph_name_scope = SubgraphNameScopeGenerator(&order);
+  string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id++);
 
   tensorflow::tensorrt::Logger trt_logger;
   cudaSetDevice(s.cuda_gpu_id_);
@@ -2496,17 +2582,6 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         "Failed to create TensorRT network object");
   }
 
-  string subgraph_name_scope;
-  if (!order.empty()) {
-    subgraph_name_scope = order.front()->name();
-  }
-  for (const tensorflow::Node* node : order) {
-    subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
-  }
-  static int static_id = 0;
-  // TODO(sami,ben,jie): proper naming!
-  string engine_name = StrCat(subgraph_name_scope, "my_trt_op");
-  engine_name = StrCat(engine_name, static_id++);
   auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
   auto weight_rmgr = trt_rmgr->getManager("WeightStore");
   auto ws = new tensorflow::tensorrt::TRTWeightStore();
@@ -2517,147 +2592,11 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 
   std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
-  for (const std::pair<int, int>& input : s.input_inds) {
-    VLOG(2) << "parsing input. Node id= " << input.first;
-    int node_id = input.first;
-    int output_idx = input.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
-    auto node_name = node->name();
-    // input_names should use the node name in the graph
-    // here it should be the input tensor name -> matching the binding
-    // insert original node name without port
-    auto tensor_name = node_name;
-    if (output_idx != 0) {
-      tensor_name = StrCat(tensor_name, ":", output_idx);
-    }
-
-    VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
-            << " idx: " << output_idx;
-
-    auto shape_inference_node_name = node_name;
-    auto shape_inference_output_idx = output_idx;
-    // rewire the shape inference to original node in the graph
-    if (s.output_edge_map->count(tensor_name)) {
-      shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
-      shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
-    }
-    if (shape_inference_output_idx < 0) continue;
-    VLOG(2) << "shapeinference name: " << shape_inference_node_name
-            << " idx: " << shape_inference_output_idx;
-
-    if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
-      return tensorflow::errors::Internal("failed to find input node: " +
-                                          shape_inference_node_name);
-
-    auto op_info_vec =
-        s.graph_properties.GetOutputProperties(shape_inference_node_name);
-    if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
-      return tensorflow::errors::Internal(
-          "accessing output index of: ", shape_inference_output_idx,
-          ", at node: ", shape_inference_node_name,
-          " with output entry from shape_map: ", op_info_vec.size());
-
-    auto op_info = op_info_vec.at(shape_inference_output_idx);
-    tensorflow::DataType tf_dtype = op_info.dtype();
-    input_dtypes.push_back(tf_dtype);
-
-    nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
-    auto type_status = ConvertDType(tf_dtype, &dtype);
-    if (type_status != tensorflow::Status::OK()) {
-      LOG(WARNING) << "Type conversion failed for " << node_name;
-      return type_status;
-    }
-
-    VLOG(2) << "Accessing output index of: " << output_idx
-            << ", at node: " << node_name
-            << " with output entry from shape_map: " << op_info_vec.size();
-    // TODO(ben,jie): update TRT input format/dimension
-    nvinfer1::DimsCHW input_dim_pseudo_chw;
-    for (int i = 0; i < 3; i++) input_dim_pseudo_chw.d[i] = 1;
-
-    // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
-    //            update the code once TRT 4.0 comes out.
-    if (op_info.shape().dim_size() != 4) {
-      string err_str = "Require 4 dimensional input.";
-      StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
-                shape_inference_node_name);
-      return tensorflow::errors::Unimplemented(err_str);
-    }
-
-    for (int i = 1; i < op_info.shape().dim_size(); i++) {
-      VLOG(2) << "dimension: " << i
-              << " , size: " << op_info.shape().dim(i).size();
-      input_dim_pseudo_chw.d[i - 1] = op_info.shape().dim(i).size();
-    }
-
-    // TODO(ben,jie): proper way to restore input tensor name?
-    auto input_tensor_name = node_name;
-    if (output_idx != 0) {
-      input_tensor_name = StrCat(node_name, ":", output_idx);
-    }
-
-    input_names.push_back(input_tensor_name);
-    nvinfer1::ITensor* input_tensor = converter.network()->addInput(
-        input_tensor_name.c_str(), dtype, input_dim_pseudo_chw);
-
-    if (!input_tensor)
-      return tensorflow::errors::InvalidArgument(
-          "Failed to create Input layer");
-    VLOG(2) << "Input tensor name :" << input_tensor_name;
-
-    if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
-      return tensorflow::errors::AlreadyExists(
-          "Output tensor already exists for op: " + input_tensor_name);
-  }
-
-  VLOG(2) << "Finished sorting";
-
-  for (const tensorflow::Node* node : order) {
-    const tensorflow::NodeDef& node_def = node->def();
-    VLOG(2) << "Converting node: " << node_def.name() << " , " << node_def.op();
-    TF_RETURN_IF_ERROR(converter.convert_node(node_def));
-  }
-
-  VLOG(2) << "Finished conversion";
-
-  // Gather output metadata
   std::vector<string> output_names;
   std::vector<tensorflow::DataType> output_dtypes;
-  int trt_engine_op_output_idx = 0;
-  for (const std::pair<int, int>& output : s.output_inds) {
-    int node_id = output.first;
-    int output_idx = output.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
-    string op_name = node->name();
-    string tensor_name = op_name;
-
-    s.output_edge_map->insert(
-        {trt_engine_op_output_idx == 0
-             ? engine_name
-             : StrCat(engine_name, ":", trt_engine_op_output_idx),
-         {output_idx, tensor_name}});
-    trt_engine_op_output_idx++;
-    if (output_idx != 0)
-      tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
-    VLOG(2) << "Output tensor name: " << tensor_name;
-    output_names.push_back(tensor_name);
-    auto tensor_or_weights = converter.get_tensor(tensor_name);
-    if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument("Output node '" + tensor_name +
-                                                 "' is weights not tensor");
-    }
-    nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
-    if (!tensor) {
-      return tensorflow::errors::NotFound("Output tensor not found: " +
-                                          tensor_name);
-    }
-    converter.network()->markOutput(*tensor);
-    tensorflow::DataType tf_dtype = node->output_type(output_idx);
-    output_dtypes.push_back(tf_dtype);
-    nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
-    TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
-    tensor->setType(trt_dtype);
-  }
+  TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names,
+                                     &input_dtypes, &output_names,
+                                     &output_dtypes, engine_name));
 
   VLOG(2) << "Finished output";
 
@@ -2693,20 +2632,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 
   // Build the TRT op
   tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
-  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  VLOG(2) << "input edge size: " << input_names.size();
-  for (size_t i = 0; i < input_names.size(); ++i) {
-    VLOG(2) << "input edges: " << i << " " << input_names.at(i);
-    int output_idx = s.input_inds.at(i).second;
-    // we wired up the input here already, it is redundant to do it again in
-    //  ConvertSubGraphToTensorRT(convert_graph.cc)
-    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
-        input_names.at(i), output_idx, input_dtypes.at(i));
-    income_edges.push_back(incoming_edge);
-  }
-  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
-      income_edges);
-  op_builder.Input(input_list);
+  SetInputList(s, &op_builder, &input_names, &input_dtypes);
 
   VLOG(0) << "Finished op preparation";
 
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a89cf3ab8bfaecc74fc5890ccb7e7a7147278182
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
@@ -0,0 +1,118 @@
+# Description:
+#   Example for plugin support in TensorRT(http://developer.nvidia.com/tensorrt)
+#   through TensorFlow integration. Targeting TensorRT 3.0.4
+#   APIs are meant to change while upgrading TRT.
+#   add init_py into pip package BUILD dependency to install it.
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_custom_op_library_additional_deps",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
+)
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["inc_op"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "inc_op",
+    deps = [":inc_op_op_lib"],
+)
+
+tf_custom_op_library(
+    name = "_inc_op.so",
+    srcs = [
+        "inc_op_kernel.h",
+        "inc_op_plugin.cc",
+        "inc_op_plugin.h",
+        "ops/inc_op.cc",
+    ],
+    gpu_srcs = [
+        "inc_op_kernel.h",
+        "inc_op_kernel.cu.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/tensorrt:trt_plugins",
+        "//tensorflow/core:framework_lite",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+tf_kernel_library(
+    name = "inc_op_plugin_kernel",
+    srcs = ["inc_op_plugin.cc"],
+    hdrs = [
+        "inc_op_kernel.h",
+        "inc_op_plugin.h",
+    ],
+    gpu_srcs = [
+        "inc_op_kernel.h",
+        "inc_op_kernel.cu.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/tensorrt:trt_plugins",
+        "//tensorflow/core:stream_executor_headers_lib",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]) + tf_custom_op_library_additional_deps(),
+)
+
+tf_custom_op_py_library(
+    name = "inc_op_loader",
+    srcs = ["inc_op.py"],
+    dso = [
+        ":_inc_op.so",
+    ],
+    kernels = [
+        ":inc_op_op_lib",
+        ":inc_op_plugin_kernel",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:resources",
+    ],
+)
+
+py_library(
+    name = "init_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":inc_op",
+        ":inc_op_loader",
+    ],
+)
+
+cuda_py_test(
+    name = "plugin_test",
+    size = "small",
+    srcs = ["plugin_test.py"],
+    additional_deps = [
+        ":init_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/contrib/tensorrt:init_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:tf_optimizer",
+    ],
+    tags = [
+        "manual",
+        "noguitar",
+        "notap",
+    ],
+)
diff --git a/tensorflow/python/keras/datasets/cifar100/__init__.py b/tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py
similarity index 66%
rename from tensorflow/python/keras/datasets/cifar100/__init__.py
rename to tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py
index ca93742673341660ba69712feb59c5dd32ea3252..363edab2e80ada5c5d52ae7ff66ff4af678b251d 100644
--- a/tensorflow/python/keras/datasets/cifar100/__init__.py
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,15 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""CIFAR100 small image classification dataset."""
+# =============================================================================
+"""Import custom op for plugin and register it in plugin factory registry."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.datasets.cifar100 import load_data
+from tensorflow.contrib.tensorrt.custom_plugin_examples import inc_op as import_inc_op_so
+from tensorflow.contrib.tensorrt.custom_plugin_examples.ops import gen_inc_op
 
-del absolute_import
-del division
-del print_function
+inc_op = gen_inc_op.inc_plugin_trt
diff --git a/tensorflow/python/keras/applications/nasnet/__init__.py b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op.py
similarity index 64%
rename from tensorflow/python/keras/applications/nasnet/__init__.py
rename to tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op.py
index 94eb145b85b85b2e52ca37e7aebc681c1f054e16..a007c3f54e208b7623db128f4069c2343d0283c8 100644
--- a/tensorflow/python/keras/applications/nasnet/__init__.py
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op.py
@@ -11,18 +11,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""NASNet Keras applications."""
+# =============================================================================
+"""Loader for the custom inc_op."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.applications.nasnet import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetLarge
-from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetMobile
-from tensorflow.python.keras._impl.keras.applications.nasnet import preprocess_input
+import platform
 
-del absolute_import
-del division
-del print_function
+if platform.system() != "Windows":
+  # pylint: disable=g-import-not-at-top
+  from tensorflow.contrib.util import loader
+  from tensorflow.python.platform import resource_loader
+  # pylint: enable=g-import-not-at-top
+
+  _inc_op = loader.load_op_library(
+      resource_loader.get_path_to_datafile("_inc_op.so"))
+else:
+  raise RuntimeError("Windows not supported")
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..988b35f74f3989481f59c52c6320623a26704327
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorflow/core/platform/stream_executor.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+__global__ void VecInc(const float* vec, float inc, float* dest, int n) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < n) dest[i] = vec[i] + inc;
+}
+
+void IncrementKernel(const float* d_input, float inc, float* d_output,
+                     int count, cudaStream_t stream) {
+  int threads_per_block = 256;
+  int blocks_per_grid = (count + threads_per_block - 1) / threads_per_block;
+
+  VecInc<<<threads_per_block, blocks_per_grid, 0, stream>>>(d_input, inc,
+                                                            d_output, count);
+}
+
+// Note: this kernel definition is not needed in the plugin_test rule, but it is
+// required for correctness of the TF program, i.e. if not using plugin or when
+// run with trt optimization pass, the test should work.
+class IncPluginTRT : public OpKernel {
+ public:
+  explicit IncPluginTRT(OpKernelConstruction* context) : OpKernel(context) {
+    std::vector<float> inc_list;
+    OP_REQUIRES_OK(context, context->GetAttr("inc", &inc_list));
+    OP_REQUIRES(context, inc_list.size() == 1,
+                errors::InvalidArgument(
+                    "The increment list should contain single element."));
+    inc_ = inc_list[0];
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_tensor = context->input(0);
+    const TensorShape& input_shape = input_tensor.shape();
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input_shape, &output_tensor));
+    const cudaStream_t* stream = CHECK_NOTNULL(
+        reinterpret_cast<const cudaStream_t*>(context->op_device_context()
+                                                  ->stream()
+                                                  ->implementation()
+                                                  ->CudaStreamMemberHack()));
+    IncrementKernel(input_tensor.flat<float>().data(), inc_,
+                    output_tensor->flat<float>().data(),
+                    input_shape.num_elements(), *stream);
+  }
+
+ private:
+  float inc_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("IncPluginTRT").Device(DEVICE_GPU), IncPluginTRT);
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..c35955e105798b20f93f650624eac24f378beb0b
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_KERNEL_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_KERNEL_H_
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+void IncrementKernel(const float* d_input, float inc, float* d_output,
+                     int count, cudaStream_t stream);
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_KERNEL_H_
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d4c893af56689185da72398919e2241d451594b
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h"
+
+#include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+const char* kPluginName = "IncPluginTRT";
+
+IncOpPlugin* CreateIncPlugin() { return new IncOpPlugin(); }
+
+IncOpPlugin* CreateIncPluginDeserialize(const void* buffer, size_t length) {
+  return new IncOpPlugin(buffer, length);
+}
+
+REGISTER_TRT_PLUGIN(kPluginName, CreateIncPluginDeserialize, CreateIncPlugin);
+
+IncOpPlugin::IncOpPlugin() : plugin_name_(kPluginName) {}
+
+IncOpPlugin::IncOpPlugin(const void* serialized_data, size_t length)
+    : PluginTensorRT(serialized_data, length), plugin_name_(kPluginName) {
+  // account for the consumed pointer.
+  size_t consumed_data = PluginTensorRT::getSerializationSize();
+  assert(length - consumed_data >= sizeof(float));
+  const char* buffer = reinterpret_cast<const char*>(serialized_data);
+  SetAttribute("inc", buffer + consumed_data, sizeof(float));
+}
+
+bool IncOpPlugin::SetAttribute(const string& key, const void* ptr,
+                               const size_t size) {
+  if (strcmp(key.c_str(), "inc") == 0 && size == sizeof(float)) {
+    StoreAttribute(key, ptr, size);  // save the attribute to own the data;
+    inc_ = *static_cast<const float*>(ptr);
+    return true;
+  }
+  return false;
+}
+
+bool IncOpPlugin::GetAttribute(const string& key, const void** ptr,
+                               size_t* size) const {
+  const auto& iter = attr_map_.find(key);
+  if (iter != attr_map_.end()) {
+    *ptr = iter->second.data();
+    *size = iter->second.size();
+    return true;
+  }
+  return false;
+}
+
+int IncOpPlugin::enqueue(int batch_size, const void* const* inputs,
+                         void** outputs, void*, cudaStream_t stream) {
+  int count = 1;
+  for (int i = 0; i < input_dim_list_[0].nbDims; i++) {
+    count *= input_dim_list_[0].d[i];
+  }
+  count *= batch_size;
+  const float* input = reinterpret_cast<const float*>(inputs[0]);
+  float* output = reinterpret_cast<float*>(outputs[0]);
+  IncrementKernel(input, inc_, output, count, stream);
+  return 0;
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
new file mode 100644
index 0000000000000000000000000000000000000000..189e9c939b9ffd4450f7ba95fe1abdbbc049b430
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_PLUGIN_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_PLUGIN_H_
+
+#include <cassert>
+#include <cstring>
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+class IncOpPlugin : public PluginTensorRT {
+ public:
+  IncOpPlugin();
+
+  IncOpPlugin(const void* serialized_data, size_t length);
+
+  const string& GetPluginName() const override { return plugin_name_; };
+
+  bool Finalize() override { return true; };
+
+  bool SetAttribute(const string& key, const void* ptr,
+                    const size_t size) override;
+
+  bool GetAttribute(const string& key, const void** ptr,
+                    size_t* size) const override;
+
+  int getNbOutputs() const override { return 1; }
+
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
+                                     int num_input_dims) override {
+    assert(index == 0);
+    assert(num_input_dims == 1);
+    return inputs[0];
+  }
+
+  // use configure to setup input dimensions
+  void configure(const nvinfer1::Dims* inputs, int num_inputs,
+                 const nvinfer1::Dims* outputs, int num_outputs,
+                 int max_batch_size) override {
+    assert(num_inputs == 1);
+    PluginTensorRT::configure(inputs, num_inputs, outputs, num_outputs,
+                              max_batch_size);
+  }
+
+  int initialize() override { return 0; }
+
+  void terminate() override {}
+
+  size_t getWorkspaceSize(int max_batch_size) const override { return 0; }
+
+  int enqueue(int batch_size, const void* const* inputs, void** outputs,
+              void* workspace, cudaStream_t stream) override;
+
+  size_t getSerializationSize() override {
+    return PluginTensorRT::getSerializationSize() + sizeof(float);
+  }
+
+  void serialize(void* buffer) override {
+    // Serialize parent data.
+    PluginTensorRT::serialize(buffer);
+    // Incremented buffer after parent serialization.
+    buffer =
+        static_cast<char*>(buffer) + PluginTensorRT::getSerializationSize();
+    std::memcpy(buffer, &inc_, sizeof(float));
+    buffer = static_cast<char*>(buffer) + sizeof(float);
+  }
+
+ protected:
+  float inc_;
+  nvinfer1::Dims dim_;
+
+ private:
+  const string plugin_name_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_PLUGIN_H_
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d0eb0d299dd61dcc5c889e61994e6430340cdb1d
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+
+REGISTER_OP("IncPluginTRT")
+    .Attr("inc: list(float)")
+    .Input("input: float32")
+    .Output("output: float32")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py b/tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc4d270bec4fb83d8ea067fca3a750270755a659
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py
@@ -0,0 +1,95 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to show usage of TensorRT custom op & plugin."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib import tensorrt
+from tensorflow.contrib.tensorrt import custom_plugin_examples
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+class TrtPluginTest(test_util.TensorFlowTestCase):
+
+  def _get_plugin_graph_def(self):
+    """Create a simple graph and return its graph_def."""
+    g = ops.Graph()
+    with g.as_default():
+      a = array_ops.placeholder(
+          dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
+      relu = nn.relu(a, "relu")
+      v = nn_ops.max_pool(
+          relu, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+
+      # insert custom_op in the graph
+      v = custom_plugin_examples.inc_op(v, inc=[16.5], name="plugin_test")
+
+      v *= 2.0
+      v = nn.relu(v)
+      v = nn.relu(v)
+      array_ops.squeeze(v, name="output")
+    return g.as_graph_def()
+
+  def _run_graph(self, gdef, dumm_inp):
+    """Run given graphdef once."""
+    gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+    ops.reset_default_graph()
+    g = ops.Graph()
+    with g.as_default():
+      inp, out = importer.import_graph_def(
+          graph_def=gdef, return_elements=["input", "output"])
+      inp = inp.outputs[0]
+      out = out.outputs[0]
+
+    with session.Session(
+        config=config_pb2.ConfigProto(gpu_options=gpu_options),
+        graph=g) as sess:
+      val = sess.run(out, {inp: dumm_inp})
+    return val
+
+  def testIncOpPlugin(self):
+    inp_dims = (5, 24, 24, 2)
+    dummy_input = numpy.ones(inp_dims).astype(numpy.float32)
+    orig_graph = self._get_plugin_graph_def()  # graph with plugin node
+
+    # trigger conversion.
+    # plugin nodes have been registered during import, converter will be able to
+    # create corresponding plugin layer during conversion.
+    trt_graph = tensorrt.create_inference_graph(
+        input_graph_def=orig_graph,
+        outputs=["output"],
+        max_batch_size=inp_dims[0],
+        max_workspace_size_bytes=1 << 25,
+        precision_mode="FP32",
+        minimum_segment_size=2)
+    o2 = self._run_graph(trt_graph, dummy_input)
+    self.assertEqual(35, o2.reshape([-1])[0])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 5c5b2e3c073d5fc38a8505e9f2c7ddf117cb4ffd..9ac8047944874181de228a6cc58e2dafe46abe50 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
 
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
@@ -59,7 +60,8 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
     infer->setGpuAllocator(allocator_.get());
 #endif
     trt_engine_ptr_.reset(infer->deserializeCudaEngine(
-        serialized_engine_.c_str(), serialized_engine_.size(), nullptr));
+        serialized_engine_.c_str(), serialized_engine_.size(),
+        PluginFactoryTensorRT::GetInstance()));
     trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
     // Runtime is safe to delete after engine creation
     infer->destroy();
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index 7f3544f8cfda8dce13881e1f8f4388b640e315f4..96ccacb791e40143c5c4d9d691bb353702f9a28b 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -28,7 +28,7 @@ namespace tensorrt {
 // Logger for GIE info/warning/errors
 class Logger : public nvinfer1::ILogger {
  public:
-  Logger(string name = "DefaultLogger") : name_(name){};
+  Logger(string name = "DefaultLogger") : name_(name) {}
   void log(nvinfer1::ILogger::Severity severity, const char* msg) override;
 
  private:
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
new file mode 100644
index 0000000000000000000000000000000000000000..062f86e8bb4dc753925e4e2baf0bc80a5312a94f
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
@@ -0,0 +1,106 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include <cassert>
+#include <cstring>
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+PluginTensorRT::PluginTensorRT(const void* serialized_data, size_t length) {
+  const char* buffer = static_cast<const char*>(serialized_data);
+  size_t op_name_char_count = *reinterpret_cast<const size_t*>(buffer);
+  buffer += sizeof(size_t);
+  buffer += op_name_char_count;
+
+  size_t count = *reinterpret_cast<const size_t*>(buffer);
+  buffer += sizeof(size_t);
+
+  for (int i = 0; i < count; i++) {
+    nvinfer1::Dims dim;
+    std::memcpy(&(dim.nbDims), buffer, sizeof(dim.nbDims));
+    buffer += sizeof(dim.nbDims);
+    std::memcpy(dim.d, buffer, sizeof(dim.d));
+    buffer += sizeof(dim.d);
+    std::memcpy(dim.type, buffer, sizeof(dim.type));
+    buffer += sizeof(dim.type);
+    input_dim_list_.emplace_back(dim);
+  }
+}
+
+void PluginTensorRT::configure(const nvinfer1::Dims* inputs, int num_inputs,
+                               const nvinfer1::Dims* outputs, int num_outputs,
+                               int max_batch_size) {
+  for (int index = 0; index < num_inputs; index++) {
+    nvinfer1::Dims dim;
+    dim.nbDims = inputs[index].nbDims;
+    for (int i = 0; i < dim.nbDims; i++) {
+      dim.d[i] = inputs[index].d[i];
+      dim.type[i] = inputs[index].type[i];
+    }
+    input_dim_list_.emplace_back(dim);
+  }
+}
+
+size_t PluginTensorRT::getSerializationSize() {
+  nvinfer1::Dims dim;
+  return sizeof(size_t) + GetPluginName().size() +
+         sizeof(input_dim_list_.size()) + sizeof(dim.nbDims) + sizeof(dim.d) +
+         sizeof(dim.type);
+}
+
+void PluginTensorRT::serialize(void* serialized_data) {
+  size_t op_name_size = GetPluginName().size();
+  char* buffer = static_cast<char*>(serialized_data);
+  std::memcpy(buffer, &op_name_size, sizeof(size_t));
+  buffer += sizeof(size_t);
+
+  std::memcpy(buffer, GetPluginName().data(), op_name_size);
+  buffer += op_name_size;
+
+  auto list_size = input_dim_list_.size();
+  std::memcpy(buffer, &list_size, sizeof(input_dim_list_.size()));
+  buffer += sizeof(input_dim_list_.size());
+
+  for (int i = 0; i < input_dim_list_.size(); i++) {
+    auto dim = input_dim_list_[i];
+    std::memcpy(buffer, &(dim.nbDims), sizeof(dim.nbDims));
+    buffer += sizeof(dim.nbDims);
+    std::memcpy(buffer, dim.d, sizeof(dim.d));
+    buffer += sizeof(dim.d);
+    std::memcpy(buffer, dim.type, sizeof(dim.type));
+    buffer += sizeof(dim.type);
+  }
+}
+
+bool PluginTensorRT::StoreAttribute(const string& key, const void* ptr,
+                                    const size_t size) {
+  if (attr_map_.count(key) != 0) return false;
+
+  attr_map_.emplace(key, std::vector<char>(size));
+  std::memcpy(attr_map_[key].data(), ptr, size);
+  return true;
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
new file mode 100644
index 0000000000000000000000000000000000000000..754920b60ca7439513a91ad0354833a2482b29c1
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+
+#include <iostream>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+// A wrapper class for TensorRT plugin
+// User application should inherit from this class to write custom kernels.
+// Allows user to insert custom op in TensorRT engine
+// To register plugin in converter, user should also register custom
+// PluginDeserializeFunc & PluginConstructFunc through PluginFactoryTensorRT
+class PluginTensorRT : public nvinfer1::IPlugin {
+ public:
+  PluginTensorRT() {}
+  PluginTensorRT(const void* serialized_data, size_t length);
+
+  virtual const string& GetPluginName() const = 0;
+
+  virtual bool Finalize() = 0;
+
+  virtual bool SetAttribute(const string& key, const void* ptr,
+                            const size_t size) = 0;
+  virtual bool GetAttribute(const string& key, const void** ptr,
+                            size_t* size) const = 0;
+
+  void configure(const nvinfer1::Dims* inputs, int num_inputs,
+                 const nvinfer1::Dims* outputs, int num_outputs,
+                 int max_batch_size) override;
+
+  virtual bool StoreAttribute(const string& key, const void* ptr,
+                              const size_t size);
+
+  size_t getSerializationSize() override;
+
+  void serialize(void* buffer) override;
+
+ protected:
+  std::unordered_map<string, std::vector<char> > attr_map_;
+
+  std::vector<nvinfer1::Dims> input_dim_list_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2bc591484dcaf5b35c39f3d0523dd89dcd152e6a
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
@@ -0,0 +1,78 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
+                                                    const void* serial_data,
+                                                    size_t serial_length) {
+  size_t parsed_byte = 0;
+  // extract op_name from serial_data
+  string encoded_op_name =
+      ExtractOpName(serial_data, serial_length, &parsed_byte);
+
+  if (!IsPlugin(encoded_op_name)) {
+    return nullptr;
+  }
+
+  tensorflow::mutex_lock lock(instance_m_);
+  auto plugin_ptr =
+      plugin_registry_[encoded_op_name].first(serial_data, serial_length);
+  owned_plugins_.emplace_back(plugin_ptr);
+
+  return plugin_ptr;
+}
+
+PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(const string& op_name) {
+  if (!IsPlugin(op_name)) return nullptr;
+
+  tensorflow::mutex_lock lock(instance_m_);
+  auto plugin_ptr = plugin_registry_[op_name].second();
+  owned_plugins_.emplace_back(plugin_ptr);
+
+  return plugin_ptr;
+}
+
+bool PluginFactoryTensorRT::RegisterPlugin(
+    const string& op_name, PluginDeserializeFunc deserialize_func,
+    PluginConstructFunc construct_func) {
+  if (IsPlugin(op_name)) return false;
+
+  tensorflow::mutex_lock lock(instance_m_);
+  auto ret = plugin_registry_.emplace(
+      op_name, std::make_pair(deserialize_func, construct_func));
+
+  return ret.second;
+}
+
+void PluginFactoryTensorRT::DestroyPlugins() {
+  tensorflow::mutex_lock lock(instance_m_);
+  for (auto& owned_plugin_ptr : owned_plugins_) {
+    owned_plugin_ptr.release();
+  }
+  owned_plugins_.clear();
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbae9fb65c22cf69d2e7954436fd04dd16f7f6c8
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+
+#include <memory>
+#include <unordered_map>
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+class PluginFactoryTensorRT : public nvinfer1::IPluginFactory {
+ public:
+  // TODO(aaroey): this static method has to be inlined to make the singleton a
+  // unique global symbol. Find a way to fix it.
+  static PluginFactoryTensorRT* GetInstance() {
+    static PluginFactoryTensorRT* factory_instance =
+        new PluginFactoryTensorRT();
+    return factory_instance;
+  }
+
+  // Deserialization method
+  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
+                               size_t serial_length) override;
+
+  // Plugin construction, PluginFactoryTensorRT owns the plugin.
+  PluginTensorRT* CreatePlugin(const string& op_name);
+
+  bool RegisterPlugin(const string& op_name,
+                      PluginDeserializeFunc deserialize_func,
+                      PluginConstructFunc construct_func);
+
+  bool IsPlugin(const string& op_name) {
+    return plugin_registry_.find(op_name) != plugin_registry_.end();
+  }
+
+  size_t CountOwnedPlugins() { return owned_plugins_.size(); }
+
+  void DestroyPlugins();
+
+ protected:
+  std::unordered_map<string,
+                     std::pair<PluginDeserializeFunc, PluginConstructFunc>>
+      plugin_registry_;
+
+  // TODO(jie): Owned plugin should be associated with different sessions;
+  //            should really hand ownership of plugins to resource management;
+  std::vector<std::unique_ptr<PluginTensorRT>> owned_plugins_;
+  tensorflow::mutex instance_m_;
+};
+
+class TrtPluginRegistrar {
+ public:
+  TrtPluginRegistrar(const string& name, PluginDeserializeFunc deserialize_func,
+                     PluginConstructFunc construct_func) {
+    auto factory = PluginFactoryTensorRT::GetInstance();
+    QCHECK(factory->RegisterPlugin(name, deserialize_func, construct_func))
+        << "Failed to register plugin: " << name;
+  }
+};
+
+#define REGISTER_TRT_PLUGIN(name, deserialize_func, construct_func)    \
+  REGISTER_TRT_PLUGIN_UNIQ_HELPER(__COUNTER__, name, deserialize_func, \
+                                  construct_func)
+#define REGISTER_TRT_PLUGIN_UNIQ_HELPER(ctr, name, deserialize_func, \
+                                        construct_func)              \
+  REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func)
+#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func) \
+  static ::tensorflow::tensorrt::TrtPluginRegistrar trt_plugin_registrar##ctr \
+      TF_ATTRIBUTE_UNUSED = ::tensorflow::tensorrt::TrtPluginRegistrar(       \
+          name, deserialize_func, construct_func)
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..129bdcdbc2f8d9d5215f45f381bcadf35e4fa75e
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
@@ -0,0 +1,125 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace test {
+
+class StubPlugin : public PluginTensorRT {
+ public:
+  static const char* kPluginName;
+
+  StubPlugin() : plugin_name_(kPluginName) {}
+
+  StubPlugin(const void* serialized_data, size_t length)
+      : PluginTensorRT(serialized_data, length) {}
+
+  const string& GetPluginName() const override { return plugin_name_; }
+
+  bool Finalize() override { return true; }
+
+  bool SetAttribute(const string& key, const void* ptr,
+                    const size_t size) override {
+    return true;
+  }
+
+  bool GetAttribute(const string& key, const void** ptr,
+                    size_t* size) const override {
+    return true;
+  }
+
+  int getNbOutputs() const override { return 1; }
+
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
+                                     int nbInputDims) override {
+    return inputs[0];
+  }
+
+  int initialize() override { return 0; }
+
+  void terminate() override {}
+
+  size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }
+
+  int enqueue(int batch_size, const void* const* inputs, void** outputs,
+              void* workspace, cudaStream_t stream) override {
+    return 0;
+  }
+
+ private:
+  const string plugin_name_;
+};
+
+const char* StubPlugin::kPluginName = "StubPlugin";
+
+StubPlugin* CreateStubPlugin() { return new StubPlugin(); }
+
+StubPlugin* CreateStubPluginDeserialize(const void* serialized_data,
+                                        size_t length) {
+  return new StubPlugin(serialized_data, length);
+}
+
+class TrtPluginFactoryTest : public ::testing::Test {
+ public:
+  bool RegisterStubPlugin() {
+    if (PluginFactoryTensorRT::GetInstance()->IsPlugin(
+            StubPlugin::kPluginName)) {
+      return true;
+    }
+    return PluginFactoryTensorRT::GetInstance()->RegisterPlugin(
+        StubPlugin::kPluginName, CreateStubPluginDeserialize, CreateStubPlugin);
+  }
+};
+
+TEST_F(TrtPluginFactoryTest, Registration) {
+  EXPECT_FALSE(
+      PluginFactoryTensorRT::GetInstance()->IsPlugin(StubPlugin::kPluginName));
+  EXPECT_TRUE(RegisterStubPlugin());
+
+  ASSERT_TRUE(
+      PluginFactoryTensorRT::GetInstance()->IsPlugin(StubPlugin::kPluginName));
+}
+
+TEST_F(TrtPluginFactoryTest, CreationDeletion) {
+  EXPECT_TRUE(RegisterStubPlugin());
+  ASSERT_TRUE(
+      PluginFactoryTensorRT::GetInstance()->IsPlugin(StubPlugin::kPluginName));
+
+  PluginFactoryTensorRT::GetInstance()->DestroyPlugins();
+  ASSERT_TRUE(PluginFactoryTensorRT::GetInstance()->CreatePlugin(
+      StubPlugin::kPluginName));
+  ASSERT_EQ(1, PluginFactoryTensorRT::GetInstance()->CountOwnedPlugins());
+  PluginFactoryTensorRT::GetInstance()->DestroyPlugins();
+  ASSERT_EQ(0, PluginFactoryTensorRT::GetInstance()->CountOwnedPlugins());
+}
+
+}  // namespace test
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a8f60886c03c174a612e7a135b6eb7bb7cb9997a
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include <cassert>
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+string ExtractOpName(const void* serial_data, size_t serial_length,
+                     size_t* incremental) {
+  size_t op_name_char_count = *static_cast<const size_t*>(serial_data);
+  *incremental = sizeof(size_t) + op_name_char_count;
+
+  assert(serial_length >= *incremental);
+
+  const char* buffer = static_cast<const char*>(serial_data) + sizeof(size_t);
+  string op_name(buffer, op_name_char_count);
+
+  return op_name;
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..274ce42fec9283c643004d45fba461879fc5f2dc
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+
+#include <functional>
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+typedef std::function<PluginTensorRT*(const void*, size_t)>
+    PluginDeserializeFunc;
+
+typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
+
+// TODO(jie): work on error handling here
+string ExtractOpName(const void* serial_data, size_t serial_length,
+                     size_t* incremental);
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index 8038085a060dc92c3a046c7efe7d7a08ca97973a..2de3923b06a8ddf89c7e6f922138a85f55a618d6 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index 8b475177bc670ddae2b26b6a494f758eba20b2c3..f36495f6b69ecb2f2a8d730b9ae4919fea3c04b8 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #include <string>
 #include <vector>
@@ -33,7 +34,8 @@ tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) {
   TF_RETURN_IF_ERROR(context->GetAttr("serialized_engine", &serialized_engine));
   nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
   nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
-      serialized_engine.c_str(), serialized_engine.size(), nullptr);
+      serialized_engine.c_str(), serialized_engine.size(),
+      tensorrt::PluginFactoryTensorRT::GetInstance());
 
   int num_batch = -1;
   std::vector<::tensorflow::DataType> input_type;
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
index a5c00dd6333183c87a41aa7effc32f814a2299e7..0403b652d72877196c3537a3181529aeeb997395 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
@@ -34,7 +34,6 @@ from tensorflow.python.ops import nn_ops as nn_ops
 from tensorflow.python.platform import googletest
 
 
-@test_util.with_c_api
 class IntegrationTest(test_util.TensorFlowTestCase):
   """Class to test Tensorflow-TensorRT integration."""
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index d2746032a04946cdfab4b5ac968ea3add5f6b51d..e4963596d38dbe8aea98fddbc67dbbf761c215c8 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -110,6 +110,7 @@ py_test(
         "no_pip_gpu",  # b/63391119
         "nomsan",  # Takes too long to run.
         "notsan",  # b/67865658
+        "optonly",  # Takes too long to run without optimization.
     ],
     deps = [
         ":ar_model",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
index ce96180c9271b95991826c2527cec526c1397ae5..d8089453340e894db6af9fc3a3b360c9512207eb 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
@@ -30,9 +30,9 @@ from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.keras._impl.keras.engine import sequential
-from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
index 706742ca287a7d8a172472cd944c906e3eda335c..983455f63db07903a9b2996706c6dba731d5e2b8 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -68,15 +68,16 @@ class TimeSeriesRegressorTest(test.TestCase):
     eval_input_fn = input_pipeline.RandomWindowInputFn(
         input_pipeline.NumpyReader(features), shuffle_seed=3, num_threads=1,
         batch_size=16, window_size=16)
-    first_estimator.train(input_fn=train_input_fn, steps=5)
+    first_estimator.train(input_fn=train_input_fn, steps=1)
     first_loss_before_fit = first_estimator.evaluate(
         input_fn=eval_input_fn, steps=1)["loss"]
-    first_estimator.train(input_fn=train_input_fn, steps=50)
+    self.assertAllEqual([], first_loss_before_fit.shape)
+    first_estimator.train(input_fn=train_input_fn, steps=1)
     first_loss_after_fit = first_estimator.evaluate(
         input_fn=eval_input_fn, steps=1)["loss"]
-    self.assertLess(first_loss_after_fit, first_loss_before_fit)
+    self.assertAllEqual([], first_loss_after_fit.shape)
     second_estimator = estimator_fn(model_dir, exogenous_feature_columns)
-    second_estimator.train(input_fn=train_input_fn, steps=2)
+    second_estimator.train(input_fn=train_input_fn, steps=1)
     whole_dataset_input_fn = input_pipeline.WholeDatasetInputFn(
         input_pipeline.NumpyReader(features))
     whole_dataset_evaluation = second_estimator.evaluate(
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 0bdf6f64c9eeef83198e28c6b9279bde24bd0ead..f84ff1bfe9b014733205a8e51b43f79c63b227cb 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -181,6 +181,7 @@ py_library(
         ":datasets",
         ":profiler",
         ":tpu_py",
+        "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
         "//tensorflow/contrib/tpu/proto:topology_proto_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/contrib/tpu/ops/replication_ops.cc
index 3bdf7c2f83b037984a45cea99910df87c967aa40..ab2a7a0d4bec48d6b3b459bb3144e8ddae614ca0 100644
--- a/tensorflow/contrib/tpu/ops/replication_ops.cc
+++ b/tensorflow/contrib/tpu/ops/replication_ops.cc
@@ -25,6 +25,7 @@ using shape_inference::ShapeHandle;
 REGISTER_OP("TPUReplicateMetadata")
     .Attr("num_replicas: int >= 0")
     .Attr("topology: string = \"\"")
+    .Attr("use_tpu: bool = true")
     .Attr("device_assignment: list(int) = []")
     .Attr("computation_shape: list(int) = []")
     .Attr("host_compute_core: list(string) = []")
@@ -64,10 +65,15 @@ REGISTER_OP("TPUReplicatedOutput")
         "Operator that connects the output of an N-way replicated TPU "
         "computation to N separate outputs.");
 
+REGISTER_OP("TPUCompilationResult")
+    .Output("output: string")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("TPUReplicate")
     .Attr("computation: func")
     .Attr("num_replicas: int >= 1")
     .Attr("topology: string = \"\"")
+    .Attr("use_tpu: bool = true")
     .Attr("device_assignment: list(int) = []")
     .Attr("host_compute_core: list(string) = []")
     .Attr("computation_shape: list(int) = []")
@@ -89,6 +95,9 @@ computation: a function containing the computation to run.
 num_replicas: the number of replicas of the computation to run.
 topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
 topology.
+use_tpu: a bool indicating if this computation will run on TPU or CPU/GPU.
+Currently, only supports a default placement (computation is placed on GPU
+if one is available, and on CPU if not).
 computation_shape: a [mesh_dimension] array describing the shape of each
   computation replica in numbers of cores in the TPU mesh.
 device_assignment: a flattened array with shape
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index 5e85a967ad4ea373e213fa90c3640e9ab1f92d25..73d941e5e99801d239441e438e9d640478c4e6b6 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/contrib/tpu/profiler/op_profile.pb.h"
 #include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
 #include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h"
-#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/compression.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -30,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/protobuf/config.pb.h"
-#include "tensorflow/core/util/event.pb.h"
 #include "tensorflow/core/util/events_writer.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/tpu/profiler/op_profile.proto b/tensorflow/contrib/tpu/profiler/op_profile.proto
index 840a43913ba0f159d3c495553ebdff79c0448e73..1f249de314a54067ffbe7193e3135912a091b10a 100644
--- a/tensorflow/contrib/tpu/profiler/op_profile.proto
+++ b/tensorflow/contrib/tpu/profiler/op_profile.proto
@@ -60,6 +60,11 @@ message Metrics {
   //  - it does not reveal the peak core FLOPS of the hardware
   double flops = 2;
 
+  // The VMEM bandwidth used to load operands from HBM, as a fraction of
+  // thereotical VMEM bandwidth on the specific hardware.
+  double memory_bandwidth = 3;
+
   double raw_time = 11;   // Elapsed core-time in picoseconds.
   double raw_flops = 12;  // Total floating-point operations performed.
+  double raw_bytes_accessed = 13;  // Total bytes accessed (include read/write).
 }
diff --git a/tensorflow/contrib/tpu/proto/BUILD b/tensorflow/contrib/tpu/proto/BUILD
index fcfbbe1a213b6959b82c20beff02df48517b5e98..7ecb36852c53bb74d70ed0f8c70ca1ce860a037a 100644
--- a/tensorflow/contrib/tpu/proto/BUILD
+++ b/tensorflow/contrib/tpu/proto/BUILD
@@ -21,3 +21,13 @@ tf_proto_library(
     cc_api_version = 2,
     visibility = ["//visibility:public"],
 )
+
+tf_proto_library(
+    name = "compilation_result_proto",
+    srcs = [
+        "compilation_result.proto",
+    ],
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/core:protos_all"],
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/tpu/proto/compilation_result.proto b/tensorflow/contrib/tpu/proto/compilation_result.proto
new file mode 100644
index 0000000000000000000000000000000000000000..cf52897de3d0fefa55e68a6b889ae9af7b45864a
--- /dev/null
+++ b/tensorflow/contrib/tpu/proto/compilation_result.proto
@@ -0,0 +1,13 @@
+syntax = "proto3";
+
+option cc_enable_arenas = true;
+package tensorflow.tpu;
+
+import "tensorflow/core/lib/core/error_codes.proto";
+
+// Describes the result of a TPU compilation.
+message CompilationResultProto {
+  // The error message, if any, returned during compilation.
+  error.Code status_code = 1;
+  string status_error_message = 2;
+}
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index 2e472a2805f98b15505f56af403aa6223e28c667..d879170b6875b3088d284459b70dc91567e33bab 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -166,11 +166,21 @@ def StreamingFilesDataset(files,
     return remote_iterator.get_next()
 
   def MapFn(unused_input):
-    return functional_ops.remote_call(
+    if isinstance(source_dataset.output_types, dtypes.DType):
+      output_types = [source_dataset.output_types]
+    elif isinstance(source_dataset.output_types, (list, tuple)):
+      output_types = source_dataset.output_types
+    else:
+      raise ValueError('source dataset has invalid output types')
+    remote_calls = functional_ops.remote_call(
         args=[source_handle],
-        Tout=[dtypes.string],
+        Tout=output_types,
         f=LoadingFunc,
-        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0]
+        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
+    if len(remote_calls) == 1:
+      return remote_calls[0]
+    else:
+      return remote_calls
 
   with ops.device('/job:%s' % worker_job):
     output_dataset = dataset_ops.Dataset.range(2).repeat().map(
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
index 918cf0ed8e513de0d4207f7d2aac61ad886c8288..b58d05eac56f3586e183333f7c1a3867ee57456c 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
@@ -26,6 +26,8 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
@@ -162,6 +164,30 @@ class DatasetsTest(test.TestCase):
 
     self.assertEqual(set(all_contents), set(retrieved_values))
 
+  def testArbitraryReaderFuncFromDatasetGenerator(self):
+
+    def my_generator():
+      yield (1, [1] * 10)
+
+    def gen_dataset(dummy):
+      return dataset_ops.Dataset.from_generator(
+          my_generator, (dtypes.int64, dtypes.int64),
+          (tensor_shape.TensorShape([]), tensor_shape.TensorShape([10])))
+
+    dataset = datasets.StreamingFilesDataset(
+        dataset_ops.Dataset.range(10), filetype=gen_dataset)
+
+    iterator = dataset.make_initializable_iterator()
+    self._sess.run(iterator.initializer)
+    get_next = iterator.get_next()
+
+    retrieved_values = self._sess.run(get_next)
+
+    self.assertIsInstance(retrieved_values, (list, tuple))
+    self.assertEqual(len(retrieved_values), 2)
+    self.assertEqual(retrieved_values[0], 1)
+    self.assertItemsEqual(retrieved_values[1], [1] * 10)
+
   def testUnexpectedFiletypeString(self):
     with self.assertRaises(ValueError):
       datasets.StreamingFilesDataset(
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index b1d8d38a9a0e68719380d062a2357930b2f5f167..f1a11fa6548b87d6222a97c72b8db5442c8ef774 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -49,20 +49,23 @@ from __future__ import print_function
 
 import collections
 import re
+import time
 
 from tensorflow.contrib.framework.python.framework import experimental
+from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
+from tensorflow.contrib.tpu.python.tpu import tpu_optimizer
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as tf_session
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import layers
-from tensorflow.python.keras._impl.keras import models
-from tensorflow.python.keras._impl.keras import optimizers as keras_optimizers
-from tensorflow.python.keras._impl.keras.layers import embeddings
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import layers
+from tensorflow.python.keras import models
+from tensorflow.python.keras import optimizers as keras_optimizers
+from tensorflow.python.keras.layers import embeddings
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
@@ -75,9 +78,6 @@ class TPUEmbedding(embeddings.Embedding):
   replacement: it has the same behavior and will work on CPU and GPU devices.
   """
 
-  def __init__(self, *args, **kw):
-    super(TPUEmbedding, self).__init__(*args, **kw)
-
   def build(self, input_shape):
     if input_shape[0] is None:
       raise ValueError(
@@ -92,10 +92,11 @@ class TPUEmbedding(embeddings.Embedding):
     return math_ops.tensordot(inputs, self.embeddings, 1)
 
 
-class CompiledTPUOp(
+class TPUModelOp(
     collections.namedtuple(
-        'CompiledTPUOp',
-        ['tpu_execute_op', 'infeed_tensors', 'infeed_op', 'outfeed_op'])):
+        'TPUModelOp',
+        ['compile_op', 'execute_op', 'infeed_tensors', 'infeed_op',
+         'outfeed_op'])):
   pass
 
 
@@ -104,6 +105,15 @@ def _valid_name(tensor_name):
   return re.sub('[^a-zA-Z0-9_-]+', '', tensor_name)
 
 
+def _replicated_optimizer(opt, num_replicas):
+  """Wrap the optimizer `opt` with CrossShardOptimizer if applicable."""
+  if num_replicas == 1:
+    return opt
+  return keras_optimizers.TFOptimizer(
+      optimizer=tpu_optimizer.CrossShardOptimizer(opt.optimizer)
+  )
+
+
 class TPUFunction(object):
   """K.function compatible interface for invoking a TPU compiled function.
 
@@ -116,10 +126,11 @@ class TPUFunction(object):
   instead of being injected as `feed_dict` items or fetches.
   """
 
-  def __init__(self, model, execution_mode):
+  def __init__(self, model, execution_mode, num_replicas=1):
     self.model = model
     self.execution_mode = execution_mode
     self._compilation_cache = {}
+    self.num_replicas = num_replicas
 
   def _specialize_model(self, input_specs):
     """Specialize `self.model` (a Keras model) for the given input shapes."""
@@ -165,9 +176,11 @@ class TPUFunction(object):
       # Call our model with our infeed inputs (re-using the weights).
       model_outputs = self.model(tpu_inputs)
       child_model = models.Model(inputs=tpu_inputs, outputs=model_outputs)
+
       if is_training or is_test:
         child_model.compile(
-            optimizer=self.model.optimizer,
+            optimizer=_replicated_optimizer(self.model.optimizer,
+                                            self.num_replicas),
             loss=self.model.loss,
             loss_weights=self.model.loss_weights,
             metrics=self.model.metrics,
@@ -185,7 +198,8 @@ class TPUFunction(object):
         return [
             child_model.train_function.updates_op,
             tpu_ops.outfeed_enqueue_tuple(
-                child_model.train_function.outputs, name='oufeed-enqueue-train')
+                child_model.train_function.outputs,
+                name='outfeed-enqueue-train')
         ]
       elif is_test:
         child_model._make_test_function()
@@ -195,7 +209,8 @@ class TPUFunction(object):
         ]
         return [
             tpu_ops.outfeed_enqueue_tuple(
-                child_model.test_function.outputs, name='outfeed-enqueue-test')
+                child_model.test_function.outputs,
+                name='outfeed-enqueue-test')
         ]
       elif is_predict:
         child_model._make_predict_function()
@@ -215,29 +230,85 @@ class TPUFunction(object):
     # Capture outfeed metadata computed during the rewrite.
     self._outfeed_spec = None
 
-    tpu_execute_op = tpu.rewrite(_model_fn)
+    # Generate out TPU operations using `tpu.split_compile_and_replicate`.
+    # `compile_op` can be used to test the TPU model compiles before execution.
+    # `execute op` replicates `_model_fn` `num_replicas` times, with each shard
+    # running on a different logical core.
+    compile_op, execute_op = tpu.split_compile_and_replicate(
+        _model_fn, inputs=[[]] * self.num_replicas)
 
     # Generate CPU side operations to enqueue features/labels and dequeue
     # outputs from the model call.
-    with ops.device('/device:TPU:0'):
-      infeed_tensors = []
-      for spec in input_specs:
-        infeed_tensors.append(
-            array_ops.placeholder(
-                dtype=spec.dtype,
-                shape=spec.shape,
-                name='infeed-enqueue-%s' % spec.name))
-
-      infeed_op = tpu_ops.infeed_enqueue_tuple(
-          infeed_tensors, [spec.shape for spec in input_specs],
-          name='infeed-enqueue-%s' % self.execution_mode)
-
-      outfeed_op = tpu_ops.outfeed_dequeue_tuple(
-          dtypes=[spec.dtype for spec in self._outfeed_spec],
-          shapes=[spec.shape for spec in self._outfeed_spec],
-          name='outfeed-dequeue-%s' % self.execution_mode)
-
-    return CompiledTPUOp(tpu_execute_op, infeed_tensors, infeed_op, outfeed_op)
+    infeed_op = []
+    outfeed_op = []
+    shard_infeed_tensors = []
+
+    for shard_id in range(self.num_replicas):
+      with ops.device('/device:TPU:%d' % shard_id):
+        infeed_tensors = []
+        for spec in input_specs:
+          infeed_tensors.append(
+              array_ops.placeholder(
+                  dtype=spec.dtype,
+                  shape=spec.shape,
+                  name='infeed-enqueue-%s-%d' % (spec.name, shard_id)))
+        shard_infeed_tensors.append(infeed_tensors)
+
+        infeed_op.append(tpu_ops.infeed_enqueue_tuple(
+            infeed_tensors, [spec.shape for spec in input_specs],
+            name='infeed-enqueue-%s-%d' % (self.execution_mode, shard_id)))
+
+        outfeed_op.extend(tpu_ops.outfeed_dequeue_tuple(
+            dtypes=[spec.dtype for spec in self._outfeed_spec],
+            shapes=[spec.shape for spec in self._outfeed_spec],
+            name='outfeed-dequeue-%s-%d' % (self.execution_mode, shard_id)))
+
+    return TPUModelOp(
+        compile_op, execute_op, infeed_tensors=shard_infeed_tensors,
+        infeed_op=infeed_op, outfeed_op=outfeed_op)
+
+  def _test_model_compiles(self, tpu_model_ops):
+    """Verifies that the given TPUModelOp can be compiled via XLA."""
+    session = K.get_session()
+
+    logging.info('Started compiling')
+    start_time = time.clock()
+
+    result = session.run(tpu_model_ops.compile_op)
+    proto = tpu_compilation_result.CompilationResultProto()
+    proto.ParseFromString(result)
+    if proto.status_error_message:
+      raise RuntimeError(
+          'Compilation failed: {}'.format(proto.status_error_message))
+
+    end_time = time.clock()
+    logging.info('Finished compiling. Time elapsed: %s secs',
+                 end_time - start_time)
+
+  def _split_tensors(self, inputs):
+    """Split input data across shards.
+
+    Each input is sliced along the batch axis.
+
+    Args:
+      inputs: List of Numpy arrays to run on the TPU.
+
+    Returns:
+      List of lists containing the input to feed to each TPU shard.
+    """
+    if self.num_replicas == 1:
+      return [inputs]
+
+    batch_size = inputs[0].shape[0]
+    assert batch_size % self.num_replicas == 0, (
+        'batch_size must be divisible by num_replicas')
+    shard_size = batch_size // self.num_replicas
+    input_list = []
+    for index in range(self.num_replicas):
+      shard_inputs = [x[index * shard_size:(index + 1) * shard_size]
+                      for x in inputs]
+      input_list.append(shard_inputs)
+    return input_list
 
   def __call__(self, inputs):
     assert isinstance(inputs, list)
@@ -250,12 +321,18 @@ class TPUFunction(object):
     else:
       input_tensors = self.model._feed_inputs
 
+    shard_inputs = self._split_tensors(inputs)
+    del inputs  # To avoid accident usage.
+
     # Compute an input specification (used to generate infeed enqueue and
     # dequeue operations).  We use the shape from our input array and the
     # dtype from our model.  A user may pass in a float64 for a float32
     # input: for model compatibility we still must generate a float32 infeed.
     input_specs = []
-    for tensor, ary in zip(input_tensors, inputs):
+
+    # We use the shape and dtype from the first shard to compute the input
+    # metadata (`input_specs`); all replicas have the same type and shape.
+    for tensor, ary in zip(input_tensors, shard_inputs[0]):
       input_specs.append(
           tensor_spec.TensorSpec(ary.shape, tensor.dtype,
                                  _valid_name(tensor.name)))
@@ -268,21 +345,26 @@ class TPUFunction(object):
     if shape_key not in self._compilation_cache:
       logging.info('New input shapes; (re-)compiling: mode=%s, %s',
                    self.execution_mode, input_specs)
-      self._compilation_cache[shape_key] = self._specialize_model(input_specs)
+      new_tpu_model_ops = self._specialize_model(input_specs)
+      self._compilation_cache[shape_key] = new_tpu_model_ops
+      self._test_model_compiles(new_tpu_model_ops)
 
-    compiled_model = self._compilation_cache[shape_key]
+    tpu_model_ops = self._compilation_cache[shape_key]
 
     infeed_dict = {}
-    for tensor, value in zip(compiled_model.infeed_tensors, inputs):
-      infeed_dict[tensor] = value
+    for infeed_tensors, inputs in zip(tpu_model_ops.infeed_tensors,
+                                      shard_inputs):
+      for tensor, value in zip(infeed_tensors, inputs):
+        infeed_dict[tensor] = value
 
     session = K.get_session()
     _, _, outfeed_outputs = session.run([
-        compiled_model.infeed_op, compiled_model.tpu_execute_op,
-        compiled_model.outfeed_op
+        tpu_model_ops.infeed_op, tpu_model_ops.execute_op,
+        tpu_model_ops.outfeed_op
     ], infeed_dict)
 
-    return outfeed_outputs
+    # TODO(xiejw): Decide how to reduce outputs, or just discard all but first.
+    return outfeed_outputs[:len(outfeed_outputs) // self.num_replicas]
 
 
 @experimental
@@ -317,8 +399,8 @@ def shutdown_tpu_session(session=None):
 class KerasTPUModel(models.Model):
   """TPU compatible Keras model wrapper."""
 
-  def __init__(self, inputs, outputs, name=None):
-    super(models.Model, self).__init__(
+  def __init__(self, inputs, outputs, name, replicas=1):
+    super(models.Model, self).__init__(  # pylint: disable=bad-super-call
         inputs=inputs,
         outputs=outputs,
         name=name,
@@ -326,6 +408,7 @@ class KerasTPUModel(models.Model):
     self.predict_function = None
     self.test_function = None
     self.train_function = None
+    self.replicas = replicas
 
   def compile(self,
               optimizer,
@@ -354,7 +437,8 @@ class KerasTPUModel(models.Model):
 
   def _make_train_function(self):
     if not self.train_function:
-      self.train_function = TPUFunction(self, model_fn_lib.ModeKeys.TRAIN)
+      self.train_function = TPUFunction(self, model_fn_lib.ModeKeys.TRAIN,
+                                        num_replicas=self.replicas)
 
     return self.train_function
 
@@ -420,7 +504,53 @@ Output shape: %(output_shape)s
 
 
 @experimental
-def tpu_model(model):
+def tpu_model(model, replicas=None):
+  """Runs a model on TPU(s).
+
+  Usage:
+  ```
+  a = Input(shape=(32,))
+  b = Dense(32)(a)
+  model = Model(inputs=a, outputs=b)
+
+  model = keras_support.tpu_model(model)
+  model.compile(
+      optimizer=tf.train.GradientDescentOptimizer(learning_rate=1.0),
+      ...)
+  ```
+
+  If `replicas` is set, replicates the model computation on all TPU cores. The
+  model computation is replicated `num_replicas` times; each shard will run on a
+  different TPU core.
+
+  Limitation: Currently, replication is only supported for training.
+
+  Usage:
+  ```
+  a = Input(shape=(32,))
+  b = Dense(32)(a)
+  model = Model(inputs=a, outputs=b)
+
+  model = keras_support.tpu_model(model, replicas=2)
+  model.compile(
+      optimizer=tf.train.GradientDescentOptimizer(learning_rate=1.0),
+      ...)
+  ```
+
+  Args:
+    model: A `KerasTPUModel`.
+    replicas: (Optional) Int, number of TPU cores which to create model
+        replicas. If `None`, the model runs on single core only, i.e., no
+        replication.
+
+  Returns:
+    A new `KerasTPUModel` instance.
+  """
   _validate_shapes(model)
+  # TODO(xiejw): Validate TPU model. TPUModel only?
+  # TODO(xiejw): Validate replicas. Full or 1. Shall we allow subset?
+  # TODO(xiejw): Adds reduction option.
+  replicas = 1 if replicas is None else replicas
   return KerasTPUModel(
-      inputs=model.inputs, outputs=model.outputs, name=model.name)
+      inputs=model.inputs, outputs=model.outputs, name=model.name,
+      replicas=replicas)
diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
index 3455e0b4a674060eabcf98cbbe7550016afcda43..3e91e2df32e6f18b7f74c1d81f64776e59d09c2a 100644
--- a/tensorflow/contrib/tpu/python/tpu/session_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -28,6 +28,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.util import event_pb2
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
@@ -78,6 +79,15 @@ class WorkerHeartbeatManager(object):
     return WorkerHeartbeatManager(session, devices, heartbeat_ops,
                                   request_placeholder)
 
+  def heartbeat_supported(self):
+    """Returns True if heartbeat operations are supported on all workers."""
+    try:
+      # Send ping to verify worker has heartbeat support.
+      self.ping()
+      return True
+    except errors.InvalidArgumentError as _:
+      return False
+
   def configure(self, message):
     """Configure heartbeat manager for all devices.
 
@@ -106,7 +116,7 @@ class WorkerHeartbeatManager(object):
         event_pb2.WorkerHeartbeatResponse.FromString(res_pb)
         for res_pb in results
     ]
-    logging.info('Results: %s', parsed_results)
+    logging.debug('Ping results: %s', parsed_results)
     return parsed_results
 
   def lame_workers(self):
@@ -189,7 +199,9 @@ class WatchdogManager(threading.Thread):
     self._running = False
     self._graph = ops.Graph()
     self._session = session_lib.Session(
-        target=session.sess_str, graph=self._graph)
+        target=session.sess_str,
+        graph=self._graph,
+    )
 
     with self._graph.as_default():
       if devices is None:
@@ -249,6 +261,7 @@ class GracefulShutdownHook(session_run_hook.SessionRunHook):
     self._graph = ops.Graph()
     self._workers = None
     self._session = None
+    self._heartbeat_supported = False
 
   def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
     # N.B. We have to pull the global step here to avoid it being unavailable
@@ -264,28 +277,44 @@ class GracefulShutdownHook(session_run_hook.SessionRunHook):
           target=training_session.sess_str, graph=self._graph)
       self._workers = WorkerHeartbeatManager.from_devices(
           self._session, all_worker_devices(self._session))
-
-      self._workers.configure(
-          event_pb2.WorkerHeartbeatRequest(
-              shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
+      self._heartbeat_supported = self._workers.heartbeat_supported()
+      if self._heartbeat_supported:
+        self._workers.configure(
+            event_pb2.WorkerHeartbeatRequest(
+                shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
+      else:
+        logging.warn(
+            'Worker heartbeats not supported by all workers.  No failure '
+            'handling will be enabled.'
+        )
 
   def saver(self):
     if self._saver:
       return self._saver
 
-    savers = ops.get_collection(ops.GraphKeys.SAVERS)[0]
+    savers = ops.get_collection(ops.GraphKeys.SAVERS)
     if not savers:
       return None
 
     if not isinstance(savers, list):
       return savers
 
-    assert len(savers) == 1, 'Only one saver supported.'
+    if len(savers) > 1:
+      logging.error(
+          'Multiple savers in the SAVERS collection.  On-demand checkpointing '
+          'will be disabled. Pass an explicit `saver` to the constructor to '
+          'override this behavior.'
+      )
+      return None
+
     return savers[0]
 
   def after_run(self, run_context, run_values):
     del run_values
 
+    if not self._heartbeat_supported:
+      return
+
     lame_workers = self._workers.lame_workers()
     if lame_workers:
       logging.info('ShutdownHook: lame workers found: %s', lame_workers)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 7b8786304ccf7e707712f178838cafb8346d2941..e2f57ce9c503f454d2bf1eae895922a4a4d26ced 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -58,6 +58,7 @@ _NOT_IMPLEMENTED_OPS = set([
 _MAX_WARNING_LINES = 5
 
 _TPU_REPLICATE_ATTR = "_tpu_replicate"
+_TPU_COMPILATION_STATUS_ATTR = "_tpu_compilation_status"
 _OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
 
 
@@ -385,6 +386,49 @@ def replicate(computation,
     ValueError: If the number of inputs per replica does not match
       the number of formal parameters to `computation`.
   """
+  return split_compile_and_replicate(computation, inputs, infeed_queue,
+                                     device_assignment, name)[1]
+
+
+def split_compile_and_replicate(computation,
+                                inputs=None,
+                                infeed_queue=None,
+                                device_assignment=None,
+                                name=None,
+                                use_tpu=True):
+  """Builds graph operators that runs compilation and replicated computation.
+
+  This is a lower level interface than replicate that returns a separate compile
+  and execute output tensor. In the generated graph the compile op feeds into
+  the execute op and no additional compilation is incurred when running the
+  compile op before the execute op. The compile op returns additional
+  information about the compilation but does not return the compiled program.
+
+  Args:
+    computation: A Python function that builds the computation to replicate.
+    inputs: A list of lists of input tensors or `None` (equivalent to
+      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
+      have the same number of inputs.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to computation.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each replica of the computation uses
+      only one core, and there is either only one replica, or the number of
+      replicas is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+    use_tpu: When false, the input `computation` is executed on the XLA CPU/GPU
+      backends. Currently, only supports a default placement (computation is
+      placed on GPU if one is available, and on CPU if not).
+  Returns:
+    A list of lists with the first list corresponding to the compile op and the
+    second a list of output tensors, indexed by `[replica_num][output_num]`.
+  Raises:
+    ValueError: If all replicas do not have equal numbers of input tensors.
+    ValueError: If the number of inputs per replica does not match
+      the number of formal parameters to `computation`.
+  """
   del name
   inputs = [[]] if inputs is None else inputs
 
@@ -456,13 +500,13 @@ def replicate(computation,
     computation_inputs.append(
         tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
 
-  context = TPUReplicateContext(
-      name=graph.unique_name("cluster"), num_replicas=num_replicas)
+  cluster_name = graph.unique_name("cluster")
+  context = TPUReplicateContext(name=cluster_name, num_replicas=num_replicas)
   try:
     context.Enter()
 
     metadata = tpu_ops.tpu_replicate_metadata(
-        num_replicas=num_replicas, **metadata_kwargs)
+        num_replicas=num_replicas, use_tpu=use_tpu, **metadata_kwargs)
 
     with tpu_function.tpu_shard_context(
         num_replicas), ops.control_dependencies([metadata]):
@@ -516,8 +560,7 @@ def replicate(computation,
 
     # Separates the returned Operations and Tensors.
     output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-    output_tensors = [o for o in outputs
-                      if not isinstance(o, ops.Operation)]
+    output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
 
     if outputs != output_tensors + output_operations:
       raise ValueError(
@@ -550,22 +593,36 @@ def replicate(computation,
                                            name="output{}".format(i))
              for i in xrange(output_arity)]
 
+  with ops.control_dependencies([metadata]):
+    if use_tpu:
+      compile_status = tpu_ops.tpu_compilation_result()
+      op = compile_status.op
+      attr_value = attr_value_pb2.AttrValue(s=compat.as_bytes(cluster_name))
+      op._set_attr(_TPU_COMPILATION_STATUS_ATTR, attr_value)  # pylint: disable=protected-access
+    else:
+      compile_status = control_flow_ops.no_op(name="compilation_status")
+
   with ops.control_dependencies(output_operations):
     if output_arity == 0:
       # Returns a list of NoOps dependent on the replication Op, indexed by
       # [replica_num].
       return [
-          control_flow_ops.no_op(name="shard_%d" % i)
-          for i in range(num_replicas)
+          compile_status, [
+              control_flow_ops.no_op(name="shard_%d" % i)
+              for i in range(num_replicas)
+          ]
       ]
     else:
       # Wraps the outputs in identity operators so the names of any possible
       # `fetch` nodes are preserved by the replication rewrite.
       return [
-          [array_ops.identity(outputs[out][replica],
-                              name="output_%d_shard_%d" % (out, replica))
-           for out in xrange(output_arity)]
-          for replica in xrange(num_replicas)
+          compile_status, [[
+              array_ops.identity(
+                  outputs[out][replica],
+                  name="output_%d_shard_%d" % (out, replica))
+              for out in xrange(output_arity)
+          ]
+                           for replica in xrange(num_replicas)]
       ]
 
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 4d7bc6a5a65eaae9810b7f01bdb96b3537ba9896..5b9aeaa8797b92b4cc596744812f440607054dce 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -35,7 +35,98 @@ _DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
 _LOCAL_MASTERS = ('', 'local')
 
 
-class _TPUContext(object):
+class TPUContext(object):
+  """The context of current input_fn invocation."""
+
+  def __init__(self, internal_ctx, input_device=None, invocation_index=None):
+    self._internal_ctx = internal_ctx
+    self._input_device = input_device
+    self._invocation_index = invocation_index
+
+  def current_input_fn_deployment(self):
+    """The configuration of the current input_fn invocation.
+
+    The configuration depends on `TPUConfig.per_host_input_for_training`. See
+    `TPUConfig` for details.
+
+    Only set in params dict of input_fn
+
+    Returns:
+      A tuple of
+        1. Device spec string: String, is the current CPU host where the
+           input_fn is invoked.
+        2. Current invocation index: Int, 0-based index of the input_fn
+           invocation. See next item for details.
+        3. Total invocation count: Int, the total number of times to invoke the
+           input_fn on all CPU hosts. Each invocation will be passed with a new
+           `TPUContext` instance with current invocation index set properly.
+        4. Total number of replicas consumed by current_invocation: Int, the
+           number of replicas fed by the data returned by current input_fn. For
+           example, for per_core input pipeline deployment
+           and non-model-parallelism, total invocation count is equal to
+           the number of cores in the system and num replicas consumed by
+           current invocation is 1. For per-host v2 input pipeline deployment,
+           total invocation count is equal to the number of hosts in the system
+           and num replicas consumed by current invocation is equal to number of
+           cores per host.
+    """
+    if self._internal_ctx.is_input_sharded_per_core():
+      total_invocation_count = (self._internal_ctx.num_hosts
+                                * self._internal_ctx.num_of_replicas_per_host)
+      replicas_consumed = 1
+    else:
+      total_invocation_count = self._internal_ctx.num_hosts
+      replicas_consumed = self._internal_ctx.num_of_replicas_per_host
+    return (self._input_device, self._invocation_index,
+            total_invocation_count, replicas_consumed)
+
+  @property
+  def num_replicas(self):
+    """The total number of replicas.
+
+    For non-model-parallelism, num_replicas should be the total num of TPU
+    cores in the system.
+
+    Returns:
+      The number of replicas.
+    """
+    return self._internal_ctx.num_replicas
+
+  def device_for_replica(self, replica_id):
+    """Returns the tuple of (CPU device and device ordinal) for replica.
+
+    This should be used for full replicate for non-model-parallelism.
+
+    Args:
+       replica_id: Int, the replica index.
+
+    Returns:
+       A tuple of device spec for CPU device and int device ordinal.
+    """
+    # Note that: For the non-model parallelism, the mapping could be
+    # a random permutation. The order should not matter in most cases
+    # as far as model is replicated to all cores in the system.
+
+    # If the precise replica_id to device mapping is required, please
+    # set the computation_shape as [1,1,1] in TPUConfig to enable
+    # the model parallelism.
+    if self._internal_ctx.model_parallelism_enabled:
+      return RuntimeError(
+          'device_for_replica is not yet implemented for model parallelism. '
+          'b/79689078.')
+
+    master = self._internal_ctx.master_job
+    job_device = '' if master is None else ('/job:%s' % master)
+
+    num_of_replicas_per_host = self._internal_ctx.num_of_replicas_per_host
+    host_id = replica_id / num_of_replicas_per_host
+    ordinal_id = replica_id % num_of_replicas_per_host
+
+    host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
+    return (host_device, ordinal_id)
+
+
+class _InternalTPUContext(object):
   """A context holds immutable states of TPU computation.
 
   This immutable object holds TPUEstimator config, train/eval batch size, and
@@ -44,9 +135,13 @@ class _TPUContext(object):
   information commonly required by TPU computation, such as TPU device names,
   TPU hosts, shard batch size, etc.
 
+  if eval_on_tpu is False, then execution of eval on TPU is disabled.
+  if eval_on_tpu is True, but use_tpu is False, a warning is issued,
+  and TPU execution is disabled for all modes.
+
   N.B. As `mode` is not immutable state in Estimator, but essential to
   distinguish between TPU training and evaluation, a common usage for
-  _TPUContext with `mode` is as follows:
+  _InternalTPUContext with `mode` is as follows:
   ```
   with _ctx.with_mode(mode) as ctx:
     if ctx.is_running_on_cpu():
@@ -55,12 +150,17 @@ class _TPUContext(object):
   """
 
   def __init__(self, config, train_batch_size, eval_batch_size,
-               predict_batch_size, use_tpu):
+               predict_batch_size, use_tpu, eval_on_tpu=True):
     self._config = config
     self._train_batch_size = train_batch_size
     self._eval_batch_size = eval_batch_size
     self._predict_batch_size = predict_batch_size
     self._use_tpu = use_tpu
+    logging.info('_TPUContext: eval_on_tpu %s', eval_on_tpu)
+    if not use_tpu and eval_on_tpu:
+      logging.warning('eval_on_tpu ignored because use_tpu is False.')
+
+    self._eval_on_tpu = eval_on_tpu
     self._model_parallelism_enabled = (
         use_tpu and config.tpu_config.computation_shape)
     self._mode = None
@@ -246,6 +346,10 @@ class _TPUContext(object):
     if not self._use_tpu:
       return True
 
+    if mode == model_fn_lib.ModeKeys.EVAL and not self._eval_on_tpu:
+      logging.info('_is_running_on_cpu: eval_on_tpu disabled')
+      return True
+
     if mode != model_fn_lib.ModeKeys.PREDICT:
       return False
 
@@ -345,6 +449,7 @@ class _TPUContext(object):
   @property
   def tpu_host_placement_function(self):
     """Returns the TPU host place function."""
+
     master = self.master_job
 
     def _placement_function(_sentinal=None, core_id=None, host_id=None):  # pylint: disable=invalid-name
@@ -473,8 +578,8 @@ class _TPUContext(object):
     self._lazy_validation_dict[mode] = True
 
 
-class _OneCoreTPUContext(_TPUContext):
-  """Special _TPUContext for one core usage."""
+class _OneCoreTPUContext(_InternalTPUContext):
+  """Special _InternalTPUContext for one core usage."""
 
   def __init__(self, config, train_batch_size, eval_batch_size,
                predict_batch_size, use_tpu):
@@ -503,8 +608,8 @@ class _OneCoreTPUContext(_TPUContext):
 
 
 def _get_tpu_context(config, train_batch_size, eval_batch_size,
-                     predict_batch_size, use_tpu):
-  """Returns an instance of `_TPUContext`."""
+                     predict_batch_size, use_tpu, eval_on_tpu):
+  """Returns an instance of `_InternalTPUContext`."""
 
   if (config.tpu_config.num_shards == 1 and
       config.tpu_config.computation_shape is None):
@@ -514,5 +619,5 @@ def _get_tpu_context(config, train_batch_size, eval_batch_size,
     return _OneCoreTPUContext(config, train_batch_size, eval_batch_size,
                               predict_batch_size, use_tpu)
 
-  return _TPUContext(config, train_batch_size, eval_batch_size,
-                     predict_batch_size, use_tpu)
+  return _InternalTPUContext(config, train_batch_size, eval_batch_size,
+                             predict_batch_size, use_tpu, eval_on_tpu)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index a69bfa9a20bed731f687b8b6fb57ae524fd620d3..77d117ba78276aa35e55eb3a524575acc1cf607b 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -46,7 +46,6 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator import util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -68,6 +67,7 @@ from tensorflow.python.training import evaluation
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
+from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
@@ -76,12 +76,13 @@ _ZERO_LOSS = 0.
 _TPU_ESTIMATOR = 'tpu_estimator'
 _ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
 _BATCH_SIZE_KEY = 'batch_size'
+_CTX_KEY = 'context'
 _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
 _ONE_GIGABYTE = 1024 * 1024 * 1024
 _TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
 _TPU_TRAIN_OP = '_tpu_train_op'
 
-_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
+_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY]
 
 
 # TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
@@ -175,17 +176,7 @@ class _SIGNAL(object):
   STOP = -2
 
 
-class TPUEstimatorSpec(
-    collections.namedtuple('TPUEstimatorSpec', [
-        'mode',
-        'predictions',
-        'loss',
-        'train_op',
-        'eval_metrics',
-        'export_outputs',
-        'scaffold_fn',
-        'host_call'
-    ])):
+class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
   """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
 
   See `EstimatorSpec` for `mode`, 'predictions, 'loss', 'train_op', and
@@ -636,8 +627,8 @@ class _StoppingPredictHook(session_run_hook.SessionRunHook):
       raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
 
 
-def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn,
-                                              inputs_structure_recorder):
+def generate_per_core_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, host_device, host_id):
   """Generates infeed enqueue ops for per-core input_fn on a single host."""
   captured_infeed_queue = _CapturedObject()
 
@@ -647,7 +638,12 @@ def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn,
     per_host_sharded_inputs = []
     for core_ordinal in range(num_cores_per_host):
       with ops.name_scope('ordinal_%d' % (core_ordinal)):
-        inputs = _Inputs.from_input_fn(input_fn())
+        user_context = tpu_context.TPUContext(
+            internal_ctx=ctx,
+            input_device=host_device,
+            invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal
+        )
+        inputs = _Inputs.from_input_fn(input_fn(user_context))
         if inputs.is_dataset:
           raise TypeError(
               '`input_fn` returning `Dataset`  is not yet supported in '
@@ -684,7 +680,11 @@ def generate_per_host_enqueue_ops_fn_for_host(
   hooks = []
 
   with ops.device(device):
-    inputs = _Inputs.from_input_fn(input_fn())
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx,
+        input_device=device,
+        invocation_index=host_id)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
 
     is_dataset = inputs.is_dataset
     if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
@@ -702,7 +702,7 @@ def generate_per_host_enqueue_ops_fn_for_host(
       hooks.append(inputs.dataset_initializer_hook())
 
   # TODO(ylc): Refactoring the code to merge the tpu ordinal logic here and the
-  # _TPUContext.tpu_ordinal_function. We should either introduce another
+  # _InternalTPUContext.tpu_ordinal_function. We should either introduce another
   # abstraction or a different helper method.
   def _tpu_ordinal_function_impl(shard_index_in_host):
     # We put both enqueue/dequeue op at tpu.core(0) in each replica.
@@ -755,12 +755,15 @@ def generate_per_host_enqueue_ops_fn_for_host(
 def generate_per_host_v2_enqueue_ops_fn_for_host(
     ctx, input_fn, inputs_structure_recorder, device, host_id):
   """Generates infeed enqueue ops for per-host input_fn on a single host."""
-  del host_id  # unused
   captured_infeed_queue = _CapturedObject()
   hooks = []
 
   with ops.device(device):
-    inputs = _Inputs.from_input_fn(input_fn())
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx,
+        input_device=device,
+        invocation_index=host_id)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
 
     is_dataset = inputs.is_dataset
     if not is_dataset:
@@ -811,13 +814,14 @@ class _InputPipeline(object):
   """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
 
   `_InputPipeline` abstracts the per-core/per-host `input_fn` invocation from
-  call site.  To be precise, based on the configuration in `_TPUContext`,  it
-  invokes `input_fn` for all cores (usually multi-host TPU training) or for one
-  host (usually for single-host TPU evaluation), and sends all `features` and
-  `labels` returned by `input_fn` to TPU infeed. For per-core invocation,
-  `features` and `labels` are piped to infeed directly, one tuple for each
-  core. For per-host invocation,  `features` and `labels` are split at host
-  (with respect to `batch_axis`) and piped to all cores accordingly.
+  call site.  To be precise, based on the configuration in
+  `_InternalTPUContext`,  it invokes `input_fn` for all cores (usually
+  multi-host TPU training) or for one host (usually for single-host TPU
+  evaluation), and sends all `features` and `labels` returned by `input_fn` to
+  TPU infeed. For per-core invocation, `features` and `labels` are piped to
+  infeed directly, one tuple for each core. For per-host invocation,  `features`
+  and `labels` are split at host (with respect to `batch_axis`) and piped to all
+  cores accordingly.
 
   In addition, flatten/unflatten are handled by `_InputPipeline` also.  Model
   inputs returned by the `input_fn` can have one of the following forms:
@@ -970,7 +974,7 @@ class _InputPipeline(object):
       batch_axis: A python tuple of int values describing how each tensor
         produced by the Estimator `input_fn` should be split across the TPU
         compute shards.
-      ctx: A `_TPUContext` instance with mode.
+      ctx: A `_InternalTPUContext` instance with mode.
 
     Raises:
       ValueError: If both `sharded_features` and `num_cores` are `None`.
@@ -1025,7 +1029,8 @@ class _InputPipeline(object):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
             enqueue_ops_fn, captured_infeed_queue = (
                 generate_per_core_enqueue_ops_fn_for_host(
-                    self._ctx, self._input_fn, self._inputs_structure_recorder))
+                    self._ctx, self._input_fn, self._inputs_structure_recorder,
+                    host_device, host_id))
 
             if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
               run_infeed_loop_on_coordinator = False
@@ -1156,7 +1161,7 @@ class _ModelFnWrapper(object):
           self._call_model_fn(features, labels))
       loss, train_op = estimator_spec.loss, estimator_spec.train_op
 
-      if isinstance(estimator_spec, TPUEstimatorSpec):
+      if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
         captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
       else:
         captured_scaffold_fn.capture(None)
@@ -1165,8 +1170,8 @@ class _ModelFnWrapper(object):
       # outfeed.
       with ops.control_dependencies([train_op]):
         host_call_outfeed_ops = []
-        if (isinstance(estimator_spec, TPUEstimatorSpec) and
-            estimator_spec.host_call is not None):
+        if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
+            and estimator_spec.host_call is not None):
           host_call.record({'host_call': estimator_spec.host_call})
           host_call_outfeed_ops = host_call.create_enqueue_op()
         with ops.control_dependencies(host_call_outfeed_ops):
@@ -1209,7 +1214,7 @@ class _ModelFnWrapper(object):
       features, labels = inputs.features_and_labels()
 
       tpu_estimator_spec = self._call_model_fn(features, labels)
-      if not isinstance(tpu_estimator_spec, TPUEstimatorSpec):
+      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
         raise RuntimeError(
             'estimator_spec used by TPU evaluation must have type'
             '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
@@ -1254,7 +1259,7 @@ class _ModelFnWrapper(object):
 
       tpu_estimator_spec = self._call_model_fn(
           features, labels, is_export_mode=False)
-      if not isinstance(tpu_estimator_spec, TPUEstimatorSpec):
+      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
         raise RuntimeError(
             'estimator_spec used by TPU prediction must have type'
             '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
@@ -1279,7 +1284,7 @@ class _ModelFnWrapper(object):
 
   def _call_model_fn(self, features, labels, is_export_mode=False):
     """Calls the model_fn with required parameters."""
-    model_fn_args = util.fn_args(self._model_fn)
+    model_fn_args = function_utils.fn_args(self._model_fn)
     kwargs = {}
 
     # Makes deep copy with `config` and params` in case user mutates them.
@@ -1309,14 +1314,11 @@ class _ModelFnWrapper(object):
       batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
 
     if batch_size_for_model_fn is not None:
-      if isinstance(params, hparam.HParams):
-        params.add_hparam(_BATCH_SIZE_KEY, batch_size_for_model_fn)
-      else:
-        params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
+      _add_item_to_params(params, _BATCH_SIZE_KEY, batch_size_for_model_fn)
 
     estimator_spec = self._model_fn(features=features, **kwargs)
     if (self._ctx.is_running_on_cpu(is_export_mode) and
-        isinstance(estimator_spec, TPUEstimatorSpec)):
+        isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)):  # pylint: disable=protected-access
       # The estimator_spec will be passed to `Estimator` directly, which expects
       # type `EstimatorSpec`.
       return estimator_spec.as_estimator_spec()
@@ -1325,7 +1327,7 @@ class _ModelFnWrapper(object):
 
   def _verify_estimator_spec(self, estimator_spec):
     """Validates the estimator_spec."""
-    if isinstance(estimator_spec, TPUEstimatorSpec):
+    if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
       return estimator_spec
 
     err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
@@ -1371,7 +1373,7 @@ class _OutfeedHostCall(object):
 
       if isinstance(host_call[1], (tuple, list)):
         fullargspec = tf_inspect.getfullargspec(host_call[0])
-        fn_args = util.fn_args(host_call[0])
+        fn_args = function_utils.fn_args(host_call[0])
         # wrapped_hostcall_with_global_step uses varargs, so we allow that.
         if fullargspec.varargs is None and len(host_call[1]) != len(fn_args):
           raise RuntimeError(
@@ -1622,7 +1624,9 @@ class TPUEstimator(estimator_lib.Estimator):
   ==========
 
   `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
-  for TPU evaluation.
+  for TPU evaluation. However, if eval_on_tpu is False, `model_fn` must return
+  `EstimatorSpec` and the evaluation will execute on CPU or GPU; in this case
+  the following discussion on TPU evaluation does not apply.
 
   `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
   `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. (See
@@ -1769,7 +1773,9 @@ class TPUEstimator(estimator_lib.Estimator):
                train_batch_size=None,
                eval_batch_size=None,
                predict_batch_size=None,
-               batch_axis=None):
+               batch_axis=None,
+               eval_on_tpu=True,
+               warm_start_from=None):
     """Constructs an `TPUEstimator` instance.
 
     Args:
@@ -1787,7 +1793,8 @@ class TPUEstimator(estimator_lib.Estimator):
         basic python types. There are reserved keys for `TPUEstimator`,
         including 'batch_size'.
       use_tpu: A bool indicating whether TPU support is enabled. Currently,
-        - TPU training and evaluation respect this bit.
+        - TPU training and evaluation respect this bit, but eval_on_tpu can
+          override execution of eval. See below.
         - Predict still happens on CPU.
       train_batch_size: An int representing the global training batch size.
         TPUEstimator transforms this global batch size to a per-shard batch
@@ -1808,6 +1815,14 @@ class TPUEstimator(estimator_lib.Estimator):
         and per_host_input_for_training is True, batches will be sharded based
         on the major dimension. If tpu_config.per_host_input_for_training is
         False or `PER_HOST_V2`, batch_axis is ignored.
+      eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the
+        model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`.
+      warm_start_from: Optional string filepath to a checkpoint or SavedModel to
+                       warm-start from, or a `tf.estimator.WarmStartSettings`
+                       object to fully configure warm-starting.  If the string
+                       filepath is provided instead of a `WarmStartSettings`,
+                       then all variables are warm-started, and it is assumed
+                       that vocabularies and Tensor names are unchanged.
 
     Raises:
       ValueError: `params` has reserved keys already.
@@ -1822,7 +1837,7 @@ class TPUEstimator(estimator_lib.Estimator):
 
     if use_tpu:
       # Perform some very basic validations. More validations will be found in
-      # _TPUContext.
+      # _InternalTPUContext.
       if train_batch_size is None:
         raise ValueError('`train_batch_size` cannot be `None`')
       util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
@@ -1860,16 +1875,18 @@ class TPUEstimator(estimator_lib.Estimator):
         model_fn=model_function,
         model_dir=model_dir,
         config=config,
-        params=params)
+        params=params,
+        warm_start_from=warm_start_from)
     self._iterations_per_training_loop = (
         self._config.tpu_config.iterations_per_loop)
 
-    # All properties passed to _TPUContext are immutable.
+    # All properties passed to _InternalTPUContext are immutable.
     # pylint: disable=protected-access
     self._ctx = tpu_context._get_tpu_context(
         self._config, train_batch_size,
         eval_batch_size, predict_batch_size,
-        use_tpu)
+        use_tpu,
+        eval_on_tpu)
 
     self._is_input_fn_invoked = None
 
@@ -1940,7 +1957,7 @@ class TPUEstimator(estimator_lib.Estimator):
     Raises:
       ValueError: if input_fn takes invalid arguments or does not have `params`.
     """
-    input_fn_args = util.fn_args(input_fn)
+    input_fn_args = function_utils.fn_args(input_fn)
     config = self.config  # a deep copy.
     kwargs = {}
     if 'params' in input_fn_args:
@@ -1963,10 +1980,8 @@ class TPUEstimator(estimator_lib.Estimator):
       # input_fn for use_tpu=True/False.
       batch_size_for_input_fn = ctx.batch_size_for_input_fn
       if batch_size_for_input_fn is not None:
-        if isinstance(kwargs['params'], hparam.HParams):
-          kwargs['params'].add_hparam(_BATCH_SIZE_KEY, batch_size_for_input_fn)
-        else:
-          kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn
+        _add_item_to_params(kwargs['params'],
+                            _BATCH_SIZE_KEY, batch_size_for_input_fn)
 
       # For export_savedmodel, input_fn is never passed to Estimator. So,
       # `is_export_mode` must be False.
@@ -1984,7 +1999,8 @@ class TPUEstimator(estimator_lib.Estimator):
       # tf.while_loop also. So, we either pass input_fn to model_fn or pass
       # dequeue_fn to model_fn. Here, `input_fn` is passed directly as
       # `features` in `model_fn` signature.
-      def _input_fn():
+      def _input_fn(ctx):
+        _add_item_to_params(kwargs['params'], _CTX_KEY, ctx)
         return input_fn(**kwargs)
 
       return _input_fn
@@ -2802,3 +2818,17 @@ def _verify_cross_hosts_transfer_size(tensor_dict, message):
         '{}'.format(message, '\n'.join([
             ' -- Key: {}, Shape: {}'.format(k, v)
             for k, v in tensor_structure.items()])))
+
+
+def _add_item_to_params(params, key, value):
+  """Adds a new item into `params`."""
+  if isinstance(params, hparam.HParams):
+    # For HParams, we need to use special API.
+    if key in params:
+      params.key = value
+    else:
+      params.add_hparam(key, value)
+  else:
+    # Now params is Python dict.
+    params[key] = value
+
diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index f305197c190b67355338c407a7895a0507941ddb..df07ff44ee68230cd06723d87c2f60407120e8dc 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -27,7 +27,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -506,19 +505,6 @@ class BatchSequencesWithStatesTest(test.TestCase):
         expected_seq4_batch2=expected_seq4_batch2)
 
 
-class BatchSequencesWithStatesTestWithCApi(BatchSequencesWithStatesTest):
-
-  def setUp(self):
-    self._prev_value = ops._USE_C_API
-    ops._USE_C_API = True
-    super(BatchSequencesWithStatesTestWithCApi, self).setUp()
-
-  def tearDown(self):
-    super(BatchSequencesWithStatesTestWithCApi, self).tearDown()
-    ops._USE_C_API = self._prev_value
-
-
-@test_util.with_c_api
 class PaddingTest(test.TestCase):
 
   def testPaddingInvalidLengths(self):
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index f0418f04ba2c5c12c882d0b678f182058f25a94f..3beb7bfe3048a8f0294f7e9149b5a07b5fcc7d17 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -34,7 +34,7 @@ from tensorflow.python.util import deprecation
 # where <rhs> is either a single token or [] enclosed list of tokens.
 # For example:  "var[1] = a" or "x = [1,2,3]"
 PARAM_RE = re.compile(r"""
-  (?P<name>[a-zA-Z][\w]*)      # variable name: "var" or "x"
+  (?P<name>[a-zA-Z][\w\.]*)      # variable name: "var" or "x"
   (\[\s*(?P<index>\d+)\s*\])?  # (optional) index: "1" or None
   \s*=\s*
   ((?P<val>[^,\[]*)            # single value: "a" or None
@@ -200,6 +200,13 @@ def parse_values(values, type_map):
   If a hyperparameter name in both an index assignment and scalar assignment,
   a ValueError is raised.  (e.g. 'a=[1,2,3],a[0] = 1').
 
+  The hyperparameter name may contain '.' symbols, which will result in an
+  attribute name that is only accessible through the getattr and setattr
+  functions.  (And must be first explicit added through add_hparam.)
+
+  WARNING: Use of '.' in your variable names is allowed, but is not well
+  supported and not recommended.
+
   The `value` in `name=value` must follows the syntax according to the
   type of the parameter:
 
diff --git a/tensorflow/contrib/training/python/training/hparam_test.py b/tensorflow/contrib/training/python/training/hparam_test.py
index 11fd15b5275a3c00b85bf986b2ff1ba0e2638aed..660c97f25e8458c345c8914bcaf98f37d047e50e 100644
--- a/tensorflow/contrib/training/python/training/hparam_test.py
+++ b/tensorflow/contrib/training/python/training/hparam_test.py
@@ -118,6 +118,21 @@ class HParamsTest(test.TestCase):
     self.assertEqual('2.3"', hparams2.c_c)
     self.assertEqual('/a=b/c/d', hparams2.d)
 
+  def testWithPeriodInVariableName(self):
+    hparams = hparam.HParams()
+    hparams.add_hparam(name='a.b', value=0.0)
+    hparams.parse('a.b=1.0')
+    self.assertEqual(1.0, getattr(hparams, 'a.b'))
+    hparams.add_hparam(name='c.d', value=0.0)
+    with self.assertRaisesRegexp(ValueError, 'Could not parse'):
+      hparams.parse('c.d=abc')
+    hparams.add_hparam(name='e.f', value='')
+    hparams.parse('e.f=abc')
+    self.assertEqual('abc', getattr(hparams, 'e.f'))
+    hparams.add_hparam(name='d..', value=0.0)
+    hparams.parse('d..=10.0')
+    self.assertEqual(10.0, getattr(hparams, 'd..'))
+
   def testSetFromMap(self):
     hparams = hparam.HParams(a=1, b=2.0, c='tanh')
     hparams.override_from_dict({'a': -2, 'c': 'identity'})
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 4b86d6ef4752b191d04114a5ee5b547c644debcb..a576e360973381d6df3ed399b207a987006d58c7 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -94,6 +94,7 @@ load(
 load("//tensorflow:tensorflow.bzl", "tf_cc_test_mkl")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu")
 load("//tensorflow:tensorflow.bzl", "tf_cc_tests_gpu")
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_version_info_genrule")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_only_cc_test")
 
@@ -225,6 +226,7 @@ ADDITIONAL_CORE_PROTO_SRCS = [
     "protobuf/named_tensor.proto",
     "protobuf/saved_model.proto",
     "protobuf/tensorflow_server.proto",
+    "protobuf/transport_options.proto",
     "util/test_log.proto",
 ]
 
@@ -267,6 +269,12 @@ proto_library(
     visibility = ["//visibility:public"],
 )
 
+closure_proto_library(
+    name = "example_protos_closure",
+    visibility = ["//visibility:public"],
+    deps = [":example_protos"],
+)
+
 exports_files([
     "framework/types.proto",
 ])
@@ -303,6 +311,7 @@ PLATFORM_OTHER_HDRS = [
     "platform/cpu_info.h",
     "platform/cpu_feature_guard.h",
     "platform/dynamic_annotations.h",
+    "platform/error.h",
     "platform/env.h",
     "platform/file_system.h",
     "platform/file_system_helper.h",
@@ -336,6 +345,8 @@ cc_library(
     ],
     hdrs = PLATFORM_BASE_HDRS,
     copts = tf_copts(),
+    # TODO(ahentz): remove use of this library so we can move it into 'platform'
+    tags = ["avoid_dep"],
     deps = [
         ":lib_platform",
         "//tensorflow/core/platform/default/build_config:base",
@@ -578,6 +589,7 @@ tf_cuda_library(
         "framework/types.h",
         "public/version.h",
         "util/activation_mode.h",
+        "util/batch_util.h",
         "util/bcast.h",
         "util/cuda_kernel_helper.h",
         "util/device_name_utils.h",
@@ -801,8 +813,6 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor",
         "//tensorflow/core/kernels:bounds_check_lib",
-        "//third_party/eigen3",
-        "@farmhash_archive//:farmhash",
     ],
     alwayslink = 1,
 )
@@ -1518,6 +1528,13 @@ tf_pyclif_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_pyclif_proto_library(
+    name = "framework/cost_graph_pyclif",
+    proto_lib = ":protos_all_cc",
+    proto_srcfile = "framework/cost_graph.proto",
+    visibility = ["//visibility:public"],
+)
+
 tf_pyclif_proto_library(
     name = "framework/tensor_pyclif",
     proto_lib = ":protos_all_cc",
@@ -1655,13 +1672,6 @@ LIB_INTERNAL_PRIVATE_HEADERS = ["framework/resource_handle.h"] + glob(
         "platform/**/cuda.h",
         "platform/**/stream_executor.h",
     ],
-) + tf_additional_lib_srcs(
-    exclude = [
-        "**/*.cc",
-        "**/*test*",
-        "platform/**/cuda.h",
-        "platform/**/stream_executor.h",
-    ],
 )
 
 LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [
@@ -1961,6 +1971,7 @@ tf_proto_library(
     j2objc_api_version = 1,
     java_api_version = 2,
     js_api_version = 2,
+    provide_cc_alias = True,
 )
 
 tf_generate_proto_text_sources(
@@ -2359,6 +2370,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/executor.h",
     "common_runtime/graph_optimizer.h",
     "common_runtime/local_device.h",
+    "common_runtime/lower_if_op.h",
     "common_runtime/memory_types.h",
     "common_runtime/mkl_cpu_allocator.h",
     "common_runtime/optimization_registry.h",
@@ -2409,6 +2421,7 @@ tf_cuda_library(
         "common_runtime/graph_optimizer.cc",
         "common_runtime/graph_runner.cc",
         "common_runtime/local_device.cc",
+        "common_runtime/lower_if_op.cc",
         "common_runtime/memory_types.cc",
         "common_runtime/mkl_cpu_allocator.cc",
         "common_runtime/optimization_registry.cc",
@@ -2566,6 +2579,7 @@ tf_cuda_library(
     ],
     copts = tf_copts(),
     cuda_deps = tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps(),
+    visibility = ["//visibility:private"],
     deps = [
         ":core_cpu_internal",
         ":lib",
@@ -3277,7 +3291,10 @@ tf_cc_tests_gpu(
 tf_cc_test_mkl(
     name = "mkl_runtime_tests",
     size = "small",
-    srcs = ["common_runtime/mkl_cpu_allocator_test.cc"],
+    srcs = [
+        "common_runtime/mkl_cpu_allocator_test.cc",
+        "common_runtime/mkl_threadpool_device_test.cc",
+    ],
     linkstatic = 1,
     deps = [
         ":core",
@@ -3379,6 +3396,37 @@ tf_cc_tests_gpu(
     ],
 )
 
+tf_cuda_cc_test(
+    name = "gpu_device_unified_memory_test",
+    size = "small",
+    srcs = [
+        "common_runtime/gpu/gpu_device_test.cc",
+    ],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    # Runs test on a Guitar cluster that uses P100s to test unified memory
+    # allocations.
+    tags = tf_cuda_tests_tags() + [
+        "guitar",
+        "multi_gpu",
+    ],
+    deps = [
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session",
+        ":framework",
+        ":framework_internal",
+        ":gpu_id",
+        ":lib",
+        ":lib_internal",
+        ":protos_all_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
+
 tf_cc_test_gpu(
     name = "cuda_libdevice_path_test",
     size = "small",
@@ -3402,7 +3450,11 @@ tf_cuda_only_cc_test(
         ":test",
         ":test_main",
         "//third_party/eigen3",
-    ],
+    ] + if_mkl(
+        [
+            "//third_party/mkl:intel_binary_blob",
+        ],
+    ),
 )
 
 tf_cc_test_gpu(
@@ -4068,6 +4120,29 @@ tf_cc_test_gpu(
     ],
 )
 
+tf_cc_tests(
+    name = "common_runtime_lower_if_op_test",
+    size = "small",
+    srcs = ["common_runtime/lower_if_op_test.cc"],
+    deps = [
+        ":all_kernels",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session",
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":test",
+        ":test_main",
+        ":testlib",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:ops",
+    ],
+)
+
 # Test data
 filegroup(
     name = "image_testdata",
@@ -4161,9 +4236,3 @@ alias(
     actual = ":mobile_srcs",
     visibility = ["//visibility:public"],
 )
-
-closure_proto_library(
-    name = "example_protos_closure",
-    visibility = ["//visibility:public"],
-    deps = [":example_protos"],
-)
diff --git a/tensorflow/core/api_def/base_api/api_def_CropAndResize.pbtxt b/tensorflow/core/api_def/base_api/api_def_CropAndResize.pbtxt
index 629f575d0a25ceb97dedb4e94f84e3204450da6a..e6609a16e125ada8474cd5d805895d4b48b30374 100644
--- a/tensorflow/core/api_def/base_api/api_def_CropAndResize.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_CropAndResize.pbtxt
@@ -47,8 +47,9 @@ END
   attr {
     name: "method"
     description: <<END
-A string specifying the interpolation method. Only 'bilinear' is
-supported for now.
+A string specifying the sampling method for resizing. It can be either
+`"bilinear"` or `"nearest"` and default to `"bilinear"`. Currently two sampling
+methods are supported: Bilinear and Nearest Neighbor.
 END
   }
   attr {
@@ -57,18 +58,22 @@ END
 Value used for extrapolation, when applicable.
 END
   }
-  summary: "Extracts crops from the input image tensor and bilinearly resizes them (possibly"
+  summary: "Extracts crops from the input image tensor and resizes them."
   description: <<END
-with aspect ratio change) to a common output size specified by `crop_size`. This
-is more general than the `crop_to_bounding_box` op which extracts a fixed size
-slice from the input image and does not allow resizing or aspect ratio change.
+Extracts crops from the input image tensor and resizes them using bilinear
+sampling or nearest neighbor sampling (possibly with aspect ratio change) to a
+common output size specified by `crop_size`. This is more general than the
+`crop_to_bounding_box` op which extracts a fixed size slice from the input image
+and does not allow resizing or aspect ratio change.
 
 Returns a tensor with `crops` from the input `image` at positions defined at the
 bounding box locations in `boxes`. The cropped boxes are all resized (with
-bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
-result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
-resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
-method will give identical results to using `tf.image.resize_bilinear()`
-with `align_corners=True`.
+bilinear or nearest neighbor interpolation) to a fixed
+`size = [crop_height, crop_width]`. The result is a 4-D tensor
+`[num_boxes, crop_height, crop_width, depth]`. The resizing is corner aligned.
+In particular, if `boxes = [[0, 0, 1, 1]]`, the method will give identical
+results to using `tf.image.resize_bilinear()` or
+`tf.image.resize_nearest_neighbor()`(depends on the `method` argument) with
+`align_corners=True`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt
index bf544703de559900317f71eaf5936221ac62080e..e230c51edfe9355b556812b0946b3a4879f160bc 100644
--- a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt
@@ -1,5 +1,19 @@
 op {
   graph_op_name: "MapAndBatchDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  in_arg {
+    name: "other_arguments"
+    description: <<END
+A list of tensors, typically values that were captured when building a closure
+for `f`.
+END
+  }
   in_arg {
     name: "batch_size"
     description: <<END
@@ -11,13 +25,26 @@ END
   in_arg {
     name: "num_parallel_batches"
     description: <<END
-A scalar representing the number of batches to create in
-parallel. Processing multiple batches in parallel benefits workloads prone to
-stragglers.
+A scalar representing the number of batches to create in parallel. Processing
+multiple batches in parallel benefits workloads prone to stragglers.
+END
+  }
+  in_arg {
+    name: "drop_remainder"
+    description: <<END
+A scalar representing whether the last batch should be dropped in case its size
+is smaller than desired.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+A function to apply to the outputs of `input_dataset`.
 END
   }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset` and then"
+  summary: "Creates a dataset that fuses mapping with batching."
   description: <<END
+Creates a dataset that applies `f` to the outputs of `input_dataset` and then
 batches `batch_size` of them.
 
 Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
diff --git a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81ef92cae0c95c765a82c993f58f261509c47d71
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "MapAndBatchDatasetV2"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  in_arg {
+    name: "other_arguments"
+    description: <<END
+A list of tensors, typically values that were captured when building a closure
+for `f`.
+END
+  }
+  in_arg {
+    name: "batch_size"
+    description: <<END
+A scalar representing the number of elements to accumulate in a
+batch. It determines the number of concurrent invocations of `f` that process
+elements from `input_dataset` in parallel.
+END
+  }
+  in_arg {
+    name: "num_parallel_calls"
+    description: <<END
+A scalar representing the maximum number of parallel invocations of the `map_fn`
+function. Applying the `map_fn` on consecutive input elements in parallel has
+the potential to improve input pipeline throughput.
+END
+  }
+  in_arg {
+    name: "drop_remainder"
+    description: <<END
+A scalar representing whether the last batch should be dropped in case its size
+is smaller than desired.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+A function to apply to the outputs of `input_dataset`.
+END
+  }
+  summary: "Creates a dataset that fuses mapping with batching."
+  description: <<END
+Creates a dataset that applies `f` to the outputs of `input_dataset` and then
+batches `batch_size` of them.
+
+Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
+to `batch_size * num_parallel_batches` copies of `f` in parallel.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NonMaxSuppressionV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_NonMaxSuppressionV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25ec87eeca27c7aaeae77d3f0e5bf78d67fec78c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NonMaxSuppressionV3.pbtxt
@@ -0,0 +1,64 @@
+op {
+  graph_op_name: "NonMaxSuppressionV3"
+  in_arg {
+    name: "boxes"
+    description: <<END
+A 2-D float tensor of shape `[num_boxes, 4]`.
+END
+  }
+  in_arg {
+    name: "scores"
+    description: <<END
+A 1-D float tensor of shape `[num_boxes]` representing a single
+score corresponding to each box (each row of boxes).
+END
+  }
+  in_arg {
+    name: "max_output_size"
+    description: <<END
+A scalar integer tensor representing the maximum number of
+boxes to be selected by non max suppression.
+END
+  }
+  in_arg {
+    name: "iou_threshold"
+    description: <<END
+A 0-D float tensor representing the threshold for deciding whether
+boxes overlap too much with respect to IOU.
+END
+  }
+  in_arg {
+    name: "score_threshold"
+    description: <<END
+A 0-D float tensor representing the threshold for deciding when to remove
+boxes based on score.
+END
+  }
+  out_arg {
+    name: "selected_indices"
+    description: <<END
+A 1-D integer tensor of shape `[M]` representing the selected
+indices from the boxes tensor, where `M <= max_output_size`.
+END
+  }
+  summary: "Greedily selects a subset of bounding boxes in descending order of score,"
+  description: <<END
+pruning away boxes that have high intersection-over-union (IOU) overlap
+with previously selected boxes.  Bounding boxes with score less than
+`score_threshold` are removed.  Bounding boxes are supplied as
+[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+diagonal pair of box corners and the coordinates can be provided as normalized
+(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+is agnostic to where the origin is in the coordinate system and more
+generally is invariant to orthogonal transformations and translations
+of the coordinate system; thus translating or reflections of the coordinate
+system result in the same boxes being selected by the algorithm.
+The output of this operation is a set of integers indexing into the input
+collection of bounding boxes representing the selected boxes.  The bounding
+box coordinates corresponding to the selected indices can then be obtained
+using the `tf.gather operation`.  For example:
+  selected_indices = tf.image.non_max_suppression_v2(
+      boxes, scores, max_output_size, iou_threshold, score_threshold)
+  selected_boxes = tf.gather(boxes, selected_indices)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RegexFullMatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_RegexFullMatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cef243aee3a9bc050af95ab03958a844731da46
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RegexFullMatch.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "RegexFullMatch"
+  in_arg {
+    name: "input"
+    description: <<END
+A string tensor of the text to be processed.
+END
+  }
+  in_arg {
+    name: "pattern"
+    description: <<END
+A 1-D string tensor of the regular expression to match the input.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A bool tensor with the same shape as `input`.
+END
+  }
+  summary: "Check if the input matches the regex pattern."
+  description: <<END
+The input is a string tensor of any shape. The pattern is a scalar
+string tensor which is applied to every element of the input tensor.
+The boolean values (True or False) of the output tensor indicate
+if the input matches the regex pattern provided.
+
+The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_NonMaxSuppressionV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_NonMaxSuppressionV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..263cba14eb930795760b609f8921d379cc1cb4a2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_NonMaxSuppressionV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NonMaxSuppressionV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RegexFullMatch.pbtxt b/tensorflow/core/api_def/python_api/api_def_RegexFullMatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec310c8aebdee9500b0344f465fc1f18f812021f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RegexFullMatch.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RegexFullMatch"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 8f2a4197563af57af7f82a3da55f58a0b390bbe1..9cda17867bafd6d33afdf020f0f56d00cd72328f 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -86,7 +86,7 @@ BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) {
   return &(chunks_[h]);
 }
 
-bool BFCAllocator::Extend(size_t rounded_bytes) {
+bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
   size_t available_bytes = memory_limit_ - total_region_allocated_bytes_;
   // Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
   available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;
@@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t rounded_bytes) {
 
   // Try allocating.
   size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
-  void* mem_addr = suballocator_->Alloc(32, bytes);
+  void* mem_addr = suballocator_->Alloc(alignment, bytes);
   if (mem_addr == nullptr && !started_backpedal_) {
     // Only backpedal once.
     started_backpedal_ = true;
@@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t rounded_bytes) {
     while (mem_addr == nullptr) {
       bytes = RoundedBytes(bytes * kBackpedalFactor);
       if (bytes < rounded_bytes) break;
-      mem_addr = suballocator_->Alloc(32, bytes);
+      mem_addr = suballocator_->Alloc(alignment, bytes);
     }
   }
 
@@ -261,7 +261,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
   }
 
   // Try to extend
-  if (Extend(rounded_bytes)) {
+  if (Extend(unused_alignment, rounded_bytes)) {
     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
     if (ptr != nullptr) {
       return ptr;
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index ba5a3eea3ac07115b2f3ea57528c7baed2008dfe..52aedb1e9c2865418bd93af8d75053d1c2280b2e 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -305,7 +305,8 @@ class BFCAllocator : public VisitableAllocator {
   // Try to add a new memory region that can satisfy an allocation of
   // 'rounded_bytes' bytes.  Returns true on success and false on
   // failure.
-  bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  bool Extend(size_t alignment, size_t rounded_bytes)
+      EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   // Returns a pointer to an underlying allocated chunk of size
   // 'rounded_bytes'.
diff --git a/tensorflow/core/common_runtime/broadcaster.cc b/tensorflow/core/common_runtime/broadcaster.cc
index e42d3f6b92b98b361e5835825880548ea212d45f..9646a0856ed60bf63a2e26d75b52ddd48ed61b2e 100644
--- a/tensorflow/core/common_runtime/broadcaster.cc
+++ b/tensorflow/core/common_runtime/broadcaster.cc
@@ -134,7 +134,7 @@ void Broadcaster::TreeSendTo(const CollectiveParams& cp,
 // Execute a tree broadcast, i.e. each non-source device receives from
 // one other and sends to up-to two others.
 void Broadcaster::RunTree() {
-  mutex mu;
+  mutex mu;               // also guards status_ while callbacks are pending
   int pending_count = 0;  // GUARDED_BY(mu)
   condition_variable all_done;
   std::vector<int> send_to_ranks;
@@ -162,15 +162,13 @@ void Broadcaster::RunTree() {
         ++pending_count;
       }
       DispatchSend(
-          target_rank, output_,
+          target_rank, (is_source_ ? &ctx_->input(0) : output_),
           [this, target_rank, &mu, &pending_count, &all_done](const Status& s) {
+            mutex_lock l(mu);
             status_.Update(s);
-            {
-              mutex_lock l(mu);
-              --pending_count;
-              if (pending_count == 0) {
-                all_done.notify_all();
-              }
+            --pending_count;
+            if (pending_count == 0) {
+              all_done.notify_all();
             }
           });
     }
@@ -191,13 +189,11 @@ void Broadcaster::RunTree() {
           op_dev_ctx, op_dev_ctx, device_, device_, ctx_->input_alloc_attr(0),
           ctx_->output_alloc_attr(0), input, output_,
           [this, &mu, &pending_count, &all_done](const Status& s) {
+            mutex_lock l(mu);
             status_.Update(s);
-            {
-              mutex_lock l(mu);
-              --pending_count;
-              if (0 == pending_count) {
-                all_done.notify_all();
-              }
+            --pending_count;
+            if (0 == pending_count) {
+              all_done.notify_all();
             }
           });
     }
diff --git a/tensorflow/core/common_runtime/broadcaster_test.cc b/tensorflow/core/common_runtime/broadcaster_test.cc
index 89d39144b3d212f267846e4d48a075ff383e4e47..959b93d56e7fd4bfa0885a3e8dc9f20f8d523fb7 100644
--- a/tensorflow/core/common_runtime/broadcaster_test.cc
+++ b/tensorflow/core/common_runtime/broadcaster_test.cc
@@ -314,11 +314,11 @@ class BroadcasterTest : public ::testing::Test {
 
   typedef std::function<void(Tensor*)> InitFunc;
 
-  void Broadcast() {
+  void Broadcast(bool forward_input) {
     std::atomic<int> done(0);
     for (auto di : instances_) {
-      SchedClosure([di, &done] {
-        di->DoBroadcast();
+      SchedClosure([di, forward_input, &done] {
+        di->DoBroadcast(forward_input);
         ++done;
       });
     }
@@ -380,7 +380,8 @@ class BroadcasterTest : public ::testing::Test {
 
   template <typename T>
   void RunTest(DataType dtype, const DeviceType& device_type, int num_workers,
-               int num_devices, int tensor_len, int fail_after) {
+               int num_devices, int tensor_len, int fail_after,
+               bool forward_input) {
     Init(num_workers, num_devices, dtype, device_type, fail_after);
 
     // Initialize each instance tensor with distinct values.
@@ -423,7 +424,7 @@ class BroadcasterTest : public ::testing::Test {
       expected[i] = t->flat<T>()(i);
     }
 
-    Broadcast();
+    Broadcast(forward_input);
 
     // At this point all of the ops have terminated.
     for (int di = 0; di < instances_.size(); ++di) {
@@ -573,7 +574,7 @@ class BroadcasterTest : public ::testing::Test {
       }
     }
 
-    void DoBroadcast() {
+    void DoBroadcast(bool forward_input) {
       // Prepare an OpKernelContext.
       OpKernelContext::Params op_params;
       op_params.step_id = parent_->step_id_;
@@ -596,7 +597,8 @@ class BroadcasterTest : public ::testing::Test {
       input_dc.push_back(dev_ctx);
       op_params.input_device_contexts = &input_dc;
       op_params.op_device_context = dev_ctx;
-      int forward_from[] = {0};
+      int forward_from[] = {OpKernelContext::Params::kNeverForward};
+      if (forward_input) forward_from[0] = 0;
       if (col_params_.is_source) {
         op_params.forward_from_array = &forward_from[0];
       }
@@ -680,61 +682,61 @@ class BroadcasterTest : public ::testing::Test {
 // D = number of devices per worker
 // L = tensor length
 // A = abort after count
-#define DEF_TEST(B, T, W, D, L, A)                                 \
-  TEST_F(BroadcasterTest,                                          \
-         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Len##L##_Abt##A) { \
-    DataType dtype = DT_##B;                                       \
-    switch (dtype) {                                               \
-      case DT_FLOAT: {                                             \
-        RunTest<float>(dtype, DEVICE_##T, W, D, L, A);             \
-      } break;                                                     \
-      case DT_DOUBLE: {                                            \
-        RunTest<double>(dtype, DEVICE_##T, W, D, L, A);            \
-      } break;                                                     \
-      case DT_INT32: {                                             \
-        RunTest<int32>(dtype, DEVICE_##T, W, D, L, A);             \
-      } break;                                                     \
-      case DT_INT64: {                                             \
-        RunTest<int64>(dtype, DEVICE_##T, W, D, L, A);             \
-      } break;                                                     \
-      default:                                                     \
-        LOG(FATAL) << "Unimplemented";                             \
-    }                                                              \
+#define DEF_TEST(B, T, W, D, L, A, F)                                      \
+  TEST_F(BroadcasterTest,                                                  \
+         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Len##L##_Abt##A##_Fw##F) { \
+    DataType dtype = DT_##B;                                               \
+    switch (dtype) {                                                       \
+      case DT_FLOAT: {                                                     \
+        RunTest<float>(dtype, DEVICE_##T, W, D, L, A, F);                  \
+      } break;                                                             \
+      case DT_DOUBLE: {                                                    \
+        RunTest<double>(dtype, DEVICE_##T, W, D, L, A, F);                 \
+      } break;                                                             \
+      case DT_INT32: {                                                     \
+        RunTest<int32>(dtype, DEVICE_##T, W, D, L, A, F);                  \
+      } break;                                                             \
+      case DT_INT64: {                                                     \
+        RunTest<int64>(dtype, DEVICE_##T, W, D, L, A, F);                  \
+      } break;                                                             \
+      default:                                                             \
+        LOG(FATAL) << "Unimplemented";                                     \
+    }                                                                      \
   }
 
 #ifndef GOOGLE_CUDA
-//       B      T    W  D  L  A
-DEF_TEST(FLOAT, CPU, 1, 2, 1, 0)
-DEF_TEST(FLOAT, CPU, 1, 2, 1001, 0)
-DEF_TEST(FLOAT, CPU, 2, 1, 128, 0)
-DEF_TEST(FLOAT, CPU, 2, 4, 128, 0)
-DEF_TEST(FLOAT, CPU, 2, 8, 4095, 0)
-DEF_TEST(FLOAT, CPU, 4, 4, 1045991, 0)
-
-DEF_TEST(DOUBLE, CPU, 2, 4, 128, 0)
-DEF_TEST(INT32, CPU, 2, 4, 128, 0)
-DEF_TEST(INT64, CPU, 2, 4, 128, 0)
+//       B      T    W  D  L  A  F
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 0, false)
+DEF_TEST(FLOAT, CPU, 1, 2, 1001, 0, true)
+DEF_TEST(FLOAT, CPU, 2, 1, 128, 0, false)
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 0, true)
+DEF_TEST(FLOAT, CPU, 2, 8, 4095, 0, false)
+DEF_TEST(FLOAT, CPU, 4, 4, 1045991, 0, true)
+
+DEF_TEST(DOUBLE, CPU, 2, 4, 128, 0, false)
+DEF_TEST(INT32, CPU, 2, 4, 128, 0, true)
+DEF_TEST(INT64, CPU, 2, 4, 128, 0, false)
 
 // Failure cases
-DEF_TEST(FLOAT, CPU, 2, 4, 128, 1)
-DEF_TEST(FLOAT, CPU, 2, 4, 128, 5)
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 1, true)
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 5, false)
 #endif
 
 #ifdef GOOGLE_CUDA
 // Can only set W=1 for GPU tests.
-//       B      T    W  D  L  A
-DEF_TEST(FLOAT, GPU, 1, 2, 1, 0)
-DEF_TEST(FLOAT, GPU, 1, 2, 33, 0)
-DEF_TEST(FLOAT, GPU, 1, 3, 64, 0)
-DEF_TEST(FLOAT, GPU, 1, 8, 1001, 0)
-DEF_TEST(FLOAT, GPU, 1, 8, 4095, 0)
-DEF_TEST(FLOAT, GPU, 1, 8, 1045991, 0)
+//       B      T    W  D  L  A  F
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 0, true)
+DEF_TEST(FLOAT, GPU, 1, 2, 33, 0, false)
+DEF_TEST(FLOAT, GPU, 1, 3, 64, 0, true)
+DEF_TEST(FLOAT, GPU, 1, 8, 1001, 0, false)
+DEF_TEST(FLOAT, GPU, 1, 8, 4095, 0, true)
+DEF_TEST(FLOAT, GPU, 1, 8, 1045991, 0, false)
 
-DEF_TEST(DOUBLE, GPU, 1, 8, 1001, 0)
-DEF_TEST(INT64, GPU, 1, 8, 1001, 0)
+DEF_TEST(DOUBLE, GPU, 1, 8, 1001, 0, true)
+DEF_TEST(INT64, GPU, 1, 8, 1001, 0, false)
 
 // Failure cases
-DEF_TEST(FLOAT, GPU, 1, 8, 128, 6)
+DEF_TEST(FLOAT, GPU, 1, 8, 128, 6, true)
 #endif
 
 }  // namespace
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index bdddf927d89152481e0d952ba1e08d68c7b38a09..8b2e0d1e0a47c323cb742b7a0c6e30b76a71bfa3 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -34,7 +34,8 @@ void CollectiveParamResolverLocal::CompleteGroupAsync(
 
 void CollectiveParamResolverLocal::CompleteGroupLocal(
     const string& device, CollectiveParams* cp, const GroupRecCallback& done) {
-  VLOG(1) << "CompleteGroupLocal " << cp << ": " << cp->ToString();
+  VLOG(1) << "CompleteGroupLocal device=" << device << " cp: " << cp << ": "
+          << cp->ToString();
   std::vector<StatusCallback> to_be_called;
   GroupRec* gr = nullptr;
   {
@@ -317,9 +318,6 @@ void SortDevicesAndTasks(CollectiveParams* cp) {
 // ring order implicit in the device order.
 void GenerateSubdivPerms(const string& device, int source_rank,
                          CollectiveParams* cp) {
-  CHECK_GT(cp->instance.impl_details.subdiv_offsets.size(), 0);
-  cp->instance.impl_details.subdiv_permutations.resize(
-      cp->instance.impl_details.subdiv_offsets.size());
   // Each subdiv permutation is a ring formed by rotating each
   // single-task subsequence of devices by an offset.  This makes most
   // sense when each task has the same number of devices but we can't
@@ -434,8 +432,9 @@ void CollectiveParamResolverLocal::SetDefaultRank(const string& device,
   }
 }
 
-Status CollectiveParamResolverLocal::InitInstanceSharedParams(
-    const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir) {
+void CollectiveParamResolverLocal::InitInstanceSharedParams(
+    const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
+    const StatusCallback& done) {
   VLOG(1) << "InitInstanceSharedParams " << ir;
   ir->shared.instance = cp->instance;
   {
@@ -461,19 +460,19 @@ Status CollectiveParamResolverLocal::InitInstanceSharedParams(
   // called by a derived class, some of the devices may be non-local and
   // GetDeviceLocalitiesAsync will use those fields to launch RPCs.
   CompleteTaskIsLocal(task_name_, &ir->shared);
-  std::vector<DeviceLocality> localities;
-  Notification note;
-  Status status;
-  dev_resolver_->GetDeviceLocalitiesAsync(ir->shared.instance, &localities,
-                                          [&note, &status](const Status& s) {
-                                            status = s;
-                                            note.Notify();
-                                          });
-  note.WaitForNotification();
-  if (status.ok()) {
-    CompleteDefaultRanking(gr, cp, ir, localities);
-  }
-  return status;
+  std::vector<DeviceLocality>* localities = new std::vector<DeviceLocality>;
+  dev_resolver_->GetDeviceLocalitiesAsync(
+      ir->shared.instance, localities,
+      [this, gr, cp, ir, localities, done](const Status& s)
+          EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu) {
+            if (s.ok()) {
+              CompleteDefaultRanking(gr, cp, ir, *localities);
+              done(Status::OK());
+            } else {
+              done(s);
+            }
+            delete localities;
+          });
 }
 
 void CollectiveParamResolverLocal::CompleteDefaultRanking(
@@ -548,28 +547,50 @@ void CollectiveParamResolverLocal::FindInstanceRec(
     CallbackWithStatus(done, irec);
     return;
   }
-  // Initialize the new InstanceRec while holding out_mu.
-  {
-    mutex_lock il(irec->out_mu);
-    irec->known.resize(cp->group.group_size, false);
-    irec->status = InitInstanceSharedParams(gr, cp, irec);
-  }
-  // Prepare to invoke any waiters that accumlated during initialization.
-  std::vector<IRConsumer> init_waiters;
-  {
-    mutex_lock tl(instance_mu_);
-    {
-      mutex_lock l(irec->in_mu);
-      irec->is_init = true;
-      if (!irec->init_waiters.empty()) {
-        std::swap(init_waiters, irec->init_waiters);
-      }
-    }
-  }
-  CallbackWithStatus(done, irec);
-  for (auto& f : init_waiters) {
-    f(irec);
-  }
+
+  CallInitInstanceSharedParams(gr, cp, irec, done);
+}
+
+void CollectiveParamResolverLocal::CallInitInstanceSharedParams(
+    const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
+    const InstanceRecCallback& done) NO_THREAD_SAFETY_ANALYSIS {
+  // This function serves merely to make a function call that should
+  // be thread/mutex safe but violates the simple model applied by
+  // static analysis, so we turn off analysis only within this
+  // function body.
+  //
+  // A lock on ir->out_mu must be held throughout the _bodies_ of the
+  // chain of function calls initiated here, each of which calls
+  // another as its last action, but it will be dropped within the
+  // callback defined below, which means that the lock can be dropped
+  // before all the function stack frames pop. The static analysis will
+  // not allow that.
+  ir->out_mu.lock();
+  ir->known.resize(cp->group.group_size, false);
+  InitInstanceSharedParams(
+      gr, cp, ir,
+      [this, ir, done](const Status& s) UNLOCK_FUNCTION(ir->out_mu) {
+        DCHECK(!ir->out_mu.try_lock());
+        ir->status.Update(s);
+        ir->out_mu.unlock();
+        // Prepare to invoke any waiters that accumlated during
+        // initialization.
+        std::vector<IRConsumer> init_waiters;
+        {
+          mutex_lock tl(instance_mu_);
+          {
+            mutex_lock l(ir->in_mu);
+            ir->is_init = true;
+            if (!ir->init_waiters.empty()) {
+              std::swap(init_waiters, ir->init_waiters);
+            }
+          }
+        }
+        CallbackWithStatus(done, ir);
+        for (auto& f : init_waiters) {
+          f(ir);
+        }
+      });
 }
 
 void CollectiveParamResolverLocal::CompleteParamsAsync(
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
index 7b2946e9368518f7e19a9507eeda913d5e1202a9..3a871f962dfbefd0e06d0e594be6dca9170a2089 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.h
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -145,10 +145,15 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   //
   // Preconditions:
   //  cp is populated with all DeviceLocalities
-  Status InitInstanceSharedParams(const GroupRec* gr,
-                                  const CollectiveParams* cp, InstanceRec* ir)
+  void InitInstanceSharedParams(const GroupRec* gr, const CollectiveParams* cp,
+                                InstanceRec* ir, const StatusCallback& done)
       EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu) LOCKS_EXCLUDED(gr->mu);
 
+  void CallInitInstanceSharedParams(const GroupRec* gr,
+                                    const CollectiveParams* cp, InstanceRec* ir,
+                                    const InstanceRecCallback& done)
+      LOCKS_EXCLUDED(ir->out_mu, gr->mu);
+
   // Establishes the final order of ir->shared.instance.device_names and
   // ir->shared.instance.task_names by considering localities of all devices.
   void CompleteDefaultRanking(const GroupRec* gr, const CollectiveParams* cp,
diff --git a/tensorflow/core/common_runtime/collective_rma_local.cc b/tensorflow/core/common_runtime/collective_rma_local.cc
index ad9b32ce3514dcfb29662d781ca6f1febd406c89..69f1a9f24cde888adafb1c09285f98e1549202f6 100644
--- a/tensorflow/core/common_runtime/collective_rma_local.cc
+++ b/tensorflow/core/common_runtime/collective_rma_local.cc
@@ -54,9 +54,13 @@ void CollectiveRemoteAccessLocal::RecvFromPeer(
                       hook->prod_value,  // src Tensor*
                       to_tensor,         // dst Tensor*
                       [hook, done](const Status& s) {
+                        // This callback may be executing in the GPUEventMgr
+                        // pool in which case it must be very short duration
+                        // and non-blocking (except e.g. for queue insertion).
+                        // It would be safer, though expensive, to transfer
+                        // to another thread here.
                         done(s);
-                        hook->prod_cb(s);
-                        delete hook;
+                        BufRendezvous::DoneWithHook(hook);
                       });
         }
       });
@@ -91,6 +95,21 @@ void CollectiveRemoteAccessLocal::MemCpyAsync(
       dst_attr.on_host() ? DEVICE_CPU : dst_dev->attributes().device_type());
   const bool non_cpu_src = src_device_type != DeviceType(DEVICE_CPU);
   const bool non_cpu_dst = dst_device_type != DeviceType(DEVICE_CPU);
+  // For GPU devices when only one compute stream is used (the default)
+  // the OpKernelContext does not supply a DeviceContext.  It's assumed
+  // that all nodes use the default context.
+  if (src_dev_ctx == nullptr && src_device_type == DEVICE_GPU) {
+    const DeviceBase::GpuDeviceInfo* dev_info =
+        src_dev->tensorflow_gpu_device_info();
+    CHECK(dev_info);
+    src_dev_ctx = dev_info->default_context;
+  }
+  if (dst_dev_ctx == nullptr && dst_device_type == DEVICE_GPU) {
+    const DeviceBase::GpuDeviceInfo* dev_info =
+        src_dev->tensorflow_gpu_device_info();
+    CHECK(dev_info);
+    dst_dev_ctx = dev_info->default_context;
+  }
   if (non_cpu_src) CHECK(src_dev_ctx);
   if (non_cpu_dst) CHECK(dst_dev_ctx);
   if (non_cpu_src || non_cpu_dst) {
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index 5918cd9bbf35a7e277ec8d7e17f9008400e1eea3..b537666492ce29da5913d7b7fafbfc639395d0cd 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -51,6 +51,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+class DeviceMgr;
+
 class Device : public DeviceBase {
  public:
   Device(Env* env, const DeviceAttributes& device_attributes);
@@ -133,6 +135,10 @@ class Device : public DeviceBase {
   // Returns the resource manager associated w/ this device.
   virtual ResourceMgr* resource_manager() { return rmgr_; }
 
+  // Returns the device manager that owns this device, or nullptr if this Device
+  // is not owned by a device manager.
+  DeviceMgr* device_mgr() const { return device_mgr_; }
+
   // Summarizes the status of this Device, for debugging.
   string DebugString() const { return ProtoDebugString(device_attributes_); }
 
@@ -158,6 +164,11 @@ class Device : public DeviceBase {
   }
 
  private:
+  friend class DeviceMgr;
+
+  // Pointer to the device manager that owns this device. Not owned.
+  DeviceMgr* device_mgr_ = nullptr;
+
   const DeviceAttributes device_attributes_;
   DeviceNameUtils::ParsedName parsed_name_;
 
diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc
index a77601ba79bf29998e1bb78b5e39dbdbd48434e9..470abc1431292820dec747110a60c08246470c3c 100644
--- a/tensorflow/core/common_runtime/device_mgr.cc
+++ b/tensorflow/core/common_runtime/device_mgr.cc
@@ -27,6 +27,9 @@ namespace tensorflow {
 DeviceMgr::DeviceMgr(const std::vector<Device*>& devices)
     : name_backing_store_(128) {
   for (Device* d : devices) {
+    CHECK(d->device_mgr_ == nullptr);
+    d->device_mgr_ = this;
+
     devices_.push_back(d);
 
     // Register under the (1) full name and (2) canonical name.
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 695423b2cb19935a40a3fa9472f14efc88821b74..9028e6298c503531d53626d6f3c19388e1215464 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -102,9 +102,25 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
         EXPECT_EQ(2, shape.dim(0).size());
         EXPECT_EQ(1, shape.dim(1).size());
         if (node->name() == y->name()) {
-          EXPECT_EQ(9, cm->AllocationId(node, 0));
+#ifdef INTEL_MKL
+          // if MKL is used, it goes through various additional 
+          // graph rewrite pass. In TF, everytime a graph pass 
+          // happens, "constant" nodes are allocated
+          // and deallocated. Each allocation calls the
+          // (FindChunkPtr of BFCAllocator),
+          // which increments the value of AllocationId. 
+          // Thus AllocationId becomes more than 3 and 4 if 
+          // MKL is used. Now they are 9 and 10 for MKL. 
+          EXPECT_EQ(19, cm->AllocationId(node, 0));
+#else
+          EXPECT_EQ(21, cm->AllocationId(node, 0));
+#endif 
         } else {
-          EXPECT_EQ(10, cm->AllocationId(node, 0));
+#ifdef INTEL_MKL
+          EXPECT_EQ(20, cm->AllocationId(node, 0));
+#else
+          EXPECT_EQ(22, cm->AllocationId(node, 0));
+#endif 
         }
       }
       EXPECT_LE(0, cm->MaxExecutionTime(node));
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 13d6b021b549e88c49ca5d38f1e3d3cbeff95e56..b5120f2872996228435dfc58200d44fba316eb6b 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -51,6 +51,9 @@ tf_cuda_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:session_options",
+        "//tensorflow/core/distributed_runtime:worker_session",
+        "//tensorflow/core/distributed_runtime/eager:eager_client",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
     ],
 )
 
@@ -64,9 +67,9 @@ tf_cuda_library(
     ],
     visibility = ["//tensorflow:internal"],
     deps = [
+        ":attr_builder",
         ":context",
         ":tensor_handle",
-        "//tensorflow/c/eager:runtime",
     ],
 )
 
@@ -136,8 +139,8 @@ tf_cc_test(
     name = "kernel_and_device_test",
     srcs = ["kernel_and_device_test.cc"],
     deps = [
+        ":attr_builder",
         ":kernel_and_device",
-        "//tensorflow/c/eager:runtime",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:client_session",
         "//tensorflow/cc:ops",
@@ -168,6 +171,49 @@ cc_library(
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/distributed_runtime/eager:eager_client",
+        "//tensorflow/core/distributed_runtime/eager:remote_execute_node",
+    ],
+)
+
+tf_cuda_library(
+    name = "attr_builder",
+    srcs = ["attr_builder.cc"],
+    hdrs = ["attr_builder.h"],
+    visibility = ["//tensorflow:internal"],
+    deps = select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            ":kernel_and_device",
+            "//tensorflow/c:c_api",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:core_cpu_internal",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:lib_internal",
+            "//tensorflow/core:protos_all_cc",
+        ],
+    }),
+)
+
+tf_cc_test(
+    name = "attr_builder_test",
+    srcs = ["attr_builder_test.cc"],
+    deps = [
+        ":attr_builder",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
     ],
 )
diff --git a/tensorflow/c/eager/runtime.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
similarity index 99%
rename from tensorflow/c/eager/runtime.cc
rename to tensorflow/core/common_runtime/eager/attr_builder.cc
index e6c51ab17a867a0697f15d7683d8ca52c062035d..92307d78f2d0b3f21c9d166eed3264e7fae42ff5 100644
--- a/tensorflow/c/eager/runtime.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/c/eager/runtime.h"
+#include "tensorflow/core/common_runtime/eager/attr_builder.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
diff --git a/tensorflow/c/eager/runtime.h b/tensorflow/core/common_runtime/eager/attr_builder.h
similarity index 100%
rename from tensorflow/c/eager/runtime.h
rename to tensorflow/core/common_runtime/eager/attr_builder.h
diff --git a/tensorflow/c/eager/runtime_test.cc b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
similarity index 97%
rename from tensorflow/c/eager/runtime_test.cc
rename to tensorflow/core/common_runtime/eager/attr_builder_test.cc
index 27ebeb0508844ee1ee89e0733b66f6ed129b7757..79b094f2e008786661b0236bc7bcdb3f37a23946 100644
--- a/tensorflow/c/eager/runtime_test.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/c/eager/runtime.h"
+#include "tensorflow/core/common_runtime/eager/attr_builder.h"
 
 #include <memory>
 #include <vector>
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index d3fe6a7edeabecdaba0d894cca700608b896026d..8381cb58d23e4b7dfc718fee2655bb85f5bb9bf3 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/context.h"
 
 #include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
 
 namespace tensorflow {
 
@@ -24,15 +25,44 @@ EagerContext::EagerContext(const SessionOptions& opts,
                            bool async, std::unique_ptr<DeviceMgr> device_mgr,
                            Rendezvous* rendezvous)
     : policy_(default_policy),
-      device_manager_(std::move(device_mgr)),
-      devices_(device_manager_->ListDevices()),
+      local_device_manager_(std::move(device_mgr)),
+      local_unowned_device_manager_(nullptr),
+      devices_(local_device_manager_->ListDevices()),
       rendezvous_(rendezvous),
       thread_pool_(NewThreadPoolFromSessionOptions(opts)),
       pflr_(new ProcessFunctionLibraryRuntime(
-          device_manager_.get(), opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_,
-          {}, thread_pool_.get())),
+          local_device_manager_.get(), opts.env, TF_GRAPH_DEF_VERSION,
+          &func_lib_def_, {}, thread_pool_.get())),
       log_device_placement_(opts.config.log_device_placement()),
       async_default_(async) {
+  InitDeviceMapAndAsync();
+}
+
+EagerContext::EagerContext(
+    const SessionOptions& opts, ContextDevicePlacementPolicy default_policy,
+    bool async, DeviceMgr* local_device_mgr, Rendezvous* rendezvous,
+    std::unique_ptr<GrpcServer> server,
+    std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
+    std::unique_ptr<DeviceMgr> remote_device_manager,
+    const gtl::FlatMap<string, uint64>& remote_contexts)
+    : policy_(default_policy),
+      local_unowned_device_manager_(local_device_mgr),
+      devices_(local_unowned_device_manager_->ListDevices()),
+      rendezvous_(rendezvous),
+      thread_pool_(NewThreadPoolFromSessionOptions(opts)),
+      pflr_(new ProcessFunctionLibraryRuntime(
+          local_unowned_device_manager_, opts.env, TF_GRAPH_DEF_VERSION,
+          &func_lib_def_, {}, thread_pool_.get())),
+      log_device_placement_(opts.config.log_device_placement()),
+      async_default_(async),
+      server_(std::move(server)),
+      remote_eager_workers_(std::move(remote_eager_workers)),
+      remote_device_manager_(std::move(remote_device_manager)),
+      remote_contexts_(remote_contexts) {
+  InitDeviceMapAndAsync();
+}
+
+void EagerContext::InitDeviceMapAndAsync() {
   if (async_default_) {
     executor_.EnableAsync();
   }
@@ -40,6 +70,15 @@ EagerContext::EagerContext(const SessionOptions& opts,
   for (auto* device : devices_) {
     devices_map_[device->name()] = device;
   }
+
+  if (remote_device_manager_ != nullptr) {
+    for (auto* device : remote_device_manager_->ListDevices()) {
+      if (devices_map_.find(device->name()) == devices_map_.end()) {
+        devices_map_[device->name()] = device;
+        devices_.push_back(device);
+      }
+    }
+  }
 }
 
 bool EagerContext::Async() const {
@@ -86,6 +125,40 @@ ContextDevicePlacementPolicy EagerContext::GetDevicePlacementPolicy() {
 }
 
 EagerContext::~EagerContext() {
+  if (server_) {
+    // TODO(nareshmodi): Fix this.
+    LOG(WARNING) << "Unable to destroy server_ object, so releasing instead. "
+                    "GrpcServer doesn't support clean shutdown.";
+    server_.release();
+  }
+
+  // Close all remote contexts.
+  std::vector<eager::CloseContextRequest> requests(remote_contexts_.size());
+  std::vector<eager::CloseContextResponse> responses(remote_contexts_.size());
+  BlockingCounter counter(static_cast<int>(remote_contexts_.size()));
+
+  int i = 0;
+  for (const auto& worker_and_context_id : remote_contexts_) {
+    auto* client =
+        remote_eager_workers_->GetClient(worker_and_context_id.first);
+
+    requests[i].set_context_id(worker_and_context_id.second);
+    client->CloseContextAsync(
+        &requests[i], &responses[i],
+        [&worker_and_context_id, &counter](const Status& s) {
+          if (!s.ok()) {
+            LOG(ERROR) << "Unable to close remote context with ID "
+                       << worker_and_context_id.second
+                       << " for worker: " << worker_and_context_id.first
+                       << " due to " << s.error_message();
+          }
+          counter.DecrementCount();
+        });
+    i++;
+  }
+
+  counter.Wait();
+
   executor_.WaitForAllPendingNodes().IgnoreError();
   ClearCaches();
   rendezvous_->Unref();
@@ -140,4 +213,45 @@ void EagerContext::SetShouldStoreMetadata(bool value) {
   }
 }
 
+namespace {
+Status GetTaskName(Device* d, string* task_name) {
+  string ignored;
+  if (!DeviceNameUtils::SplitDeviceName(d->name(), task_name, &ignored)) {
+    return errors::InvalidArgument("Unable to parse device name: ", d->name());
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+Status EagerContext::GetClientAndContextID(Device* device,
+                                           eager::EagerClient** client,
+                                           uint64* context_id) {
+  auto it = device_to_client_cache_.find(device);
+  if (it != device_to_client_cache_.end()) {
+    *client = it->second.first;
+    *context_id = it->second.second;
+  }
+  string device_task_name;
+  TF_RETURN_IF_ERROR(GetTaskName(device, &device_task_name));
+
+  *client = remote_eager_workers_->GetClient(device_task_name);
+
+  if (*client == nullptr) {
+    return errors::InvalidArgument(
+        "Unable to find eager client corresponding to device ", device->name());
+  }
+
+  auto context_iterator = remote_contexts_.find(device_task_name);
+  if (context_iterator == remote_contexts_.end()) {
+    return errors::Internal("Unable to find a context for handle on task: ",
+                            device_task_name, ". This should not be possible");
+  }
+  *context_id = context_iterator->second;
+
+  device_to_client_cache_.insert({device, {*client, *context_id}});
+
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 6665df27d09a73d4c30756cf01e383834fcae339..096ed3112e8443b3cc4509afd0d1532a120262bf 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -24,13 +24,17 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/eager/eager_executor.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/eager/eager_client.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
@@ -63,6 +67,29 @@ class EagerContext {
                         std::unique_ptr<DeviceMgr> device_mgr,
                         Rendezvous* rendezvous);
 
+  // TODO(nareshmodi): Split this into 2 classes and hide functionality behind
+  // an interface. Alternatively, encapsulate remote state into a separate
+  // class/struct.
+  //
+  // Constructs an eager context that is able to communicate with remote
+  // workers.
+  //
+  // Additional remote-specific args are:
+  //  - server: A GrpcServer that exports the tensorflow.WorkerService. Note
+  //  that this class expects the server to already have been started.
+  //  - remote_eager_workers: A cache from which we can get "EagerClient"s to
+  //  communicate with remote eager services.
+  //  - remote_device_mgr: A DeviceMgr* which contains all remote devices
+  //  (should contain no local devices).
+  //  - remote_contexts: A map containing task name to remote context ID.
+  explicit EagerContext(
+      const SessionOptions& opts, ContextDevicePlacementPolicy default_policy,
+      bool async, DeviceMgr* local_device_mgr, Rendezvous* rendezvous,
+      std::unique_ptr<GrpcServer> server,
+      std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
+      std::unique_ptr<DeviceMgr> remote_device_manager,
+      const gtl::FlatMap<string, uint64>& remote_contexts);
+
   ~EagerContext();
 
   // Returns the function library runtime for the given device.
@@ -128,10 +155,16 @@ class EagerContext {
 
   mutex* FunctionsMu() { return &functions_mu_; }
 
-  tensorflow::DeviceMgr* device_mgr() { return device_manager_.get(); }
+  const tensorflow::DeviceMgr* local_device_mgr() const {
+    return (local_device_manager_ != nullptr) ? local_device_manager_.get()
+                                              : local_unowned_device_manager_;
+  }
+  const tensorflow::DeviceMgr* remote_device_mgr() {
+    return remote_device_manager_.get();
+  }
 
   // TODO(apassos) remove the need for this
-  void ReleaseDeviceMgr() { device_manager_.release(); }
+  void ReleaseDeviceMgr() { local_device_manager_.release(); }
 
   // TODO(apassos) clean up RunMetadata storage.
   mutex* MetadataMu() { return &metadata_mu_; }
@@ -141,7 +174,12 @@ class EagerContext {
 
   FunctionLibraryDefinition* FuncLibDef() { return &func_lib_def_; }
 
+  Status GetClientAndContextID(Device* device, eager::EagerClient** client,
+                               uint64* context_id);
+
  private:
+  void InitDeviceMapAndAsync();
+
   const ContextDevicePlacementPolicy policy_;
 
   // Note: we cannot use C++11 thread_local here as there is no concept of a
@@ -150,7 +188,10 @@ class EagerContext {
   std::unordered_map<std::thread::id, ContextDevicePlacementPolicy>
       thread_local_policies_ GUARDED_BY(policy_map_mu_);
 
-  std::unique_ptr<DeviceMgr> device_manager_;
+  // Only one of the below is set.
+  std::unique_ptr<DeviceMgr> local_device_manager_;
+  const DeviceMgr* local_unowned_device_manager_;
+
   // Devices owned by device_manager
   std::vector<Device*> devices_;
   // All devices are not owned.
@@ -186,6 +227,17 @@ class EagerContext {
   mutable mutex async_map_mu_;
   std::unordered_map<std::thread::id, bool> thread_local_async_
       GUARDED_BY(async_map_mu_);
+
+  // The server_ is not const since we release it when the context is destroyed.
+  // Therefore the server_ object is not marked as const (even though it should
+  // be).
+  std::unique_ptr<GrpcServer> server_;
+  const std::unique_ptr<eager::EagerClientCache> remote_eager_workers_;
+  const std::unique_ptr<DeviceMgr> remote_device_manager_;
+
+  const gtl::FlatMap<string, uint64> remote_contexts_;
+  gtl::FlatMap<Device*, std::pair<eager::EagerClient*, uint64>>
+      device_to_client_cache_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
index 6b6e53da87acb88ab8662fbc89024480108e97e0..fcf62c7715320466a49c707e31cf7a5045f16b8e 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.h
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -15,7 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
 
-#include "tensorflow/c/eager/runtime.h"
+#include "tensorflow/core/common_runtime/eager/attr_builder.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index a514f81e146f20c8d2fe828ae15bd650f60c5e40..ce989f4b4eb9b42ae26f5b9fb2e0ac0a32d37c00 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -24,11 +24,14 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/distributed_runtime/eager/eager_client.h"
+#include "tensorflow/core/distributed_runtime/eager/remote_execute_node.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 
@@ -183,14 +186,14 @@ Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
 // primitive op (e.g. matmul).
 //
 // The wrapper function conforms to the function signature expected by
-// _XlaLaunchOp, with input params ordered by <constants, (variable) args and
+// XlaLaunch, with input params ordered by <constants, (variable) args and
 // resources>. For example, if the op has input params <Const1, Arg2, Const3,
 // Resource4, Arg5>, they will be reordered to <Const1, Const3, Arg2, Arg5,
 // Resource4> as the input params to the synthesized function.
 //
 // It populates `const_input_types`, `arg_input_types` and
 // `op_input_to_func_input` based on the reordering results, that the caller can
-// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets
+// use them to build an XlaLaunch. On error, it returns NULL, and sets
 // `status` accordingly.
 const FunctionDef* OpToFunction(TFE_Op* op,
                                 std::vector<TF_DataType>* const_input_types,
@@ -308,12 +311,12 @@ const FunctionDef* OpToFunction(TFE_Op* op,
   return ret;
 }
 
-// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
+// Builds an XlaLaunch as a wrapper over 'op', so that 'op' can be executed
 // via XLA.
 std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
-  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->operation.Name();
+  VLOG(1) << "Creating XlaLaunch for TFE_Op " << op->operation.Name();
   auto launch_op = std::unique_ptr<TFE_Op>(
-      TFE_NewOp(op->operation.ctx, "_XlaLaunch", status));
+      TFE_NewOp(op->operation.ctx, "XlaLaunch", status));
   if (TF_GetCode(status) != TF_OK) return nullptr;
   if (op->operation.device) {
     TFE_OpSetDevice(launch_op.get(), op->operation.device->name().c_str(),
@@ -328,7 +331,7 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
   gtl::FlatMap<int, int> op_input_to_func_input;
   if (fdef == nullptr) {
     // See if this is a primitive op, and if so create a function for it, so
-    // that _XlaLaunchOp can access it.
+    // that XlaLaunch can access it.
     fdef = OpToFunction(op, &const_input_types, &arg_input_types,
                         &op_input_to_func_input, status);
     if (!status.ok()) return nullptr;
@@ -392,17 +395,35 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
 }
 #endif  // TENSORFLOW_EAGER_USE_XLA
 
+Status GetOutputDTypes(EagerOperation* op, DataTypeVector* output_dtypes) {
+  const auto& node_def = op->MutableAttrs()->BuildNodeDef();
+  const OpDef* op_def = nullptr;
+
+  TF_RETURN_IF_ERROR(OpDefForOp(op->Name().c_str(), &op_def));
+
+  TF_RETURN_IF_ERROR(OutputTypesForNode(node_def, *op_def, output_dtypes));
+
+  return Status::OK();
+}
+
 }  // namespace
 
-Status EagerExecute(EagerOperation* op,
-                    gtl::InlinedVector<TensorHandle*, 2>* retvals,
-                    int* num_retvals) {
+namespace {
+bool IsLocal(EagerContext* ctx, tensorflow::Device* d) {
+  if (d == nullptr || ctx->remote_device_mgr() == nullptr) return true;
+  tensorflow::Device* tmp;
+  return ctx->local_device_mgr()->LookupDevice(d->name(), &tmp).ok();
+}
+
+Status EagerLocalExecute(EagerOperation* op,
+                         gtl::InlinedVector<TensorHandle*, 2>* retvals,
+                         int* num_retvals) {
   EagerContext* ctx = op->EagerContext();
   auto status = ctx->GetStatus();
   if (!status.ok()) return status;
 #ifdef TENSORFLOW_EAGER_USE_XLA
   std::unique_ptr<TFE_Op> xla_launch_op;
-  if (op->UseXla() && op->Name() != "_XlaLaunch") {
+  if (op->UseXla() && op->Name() != "XlaLaunch") {
     xla_launch_op = BuildXlaLaunch(op, status);
     if (!status.ok()) return status;
     op = xla_launch_op.get();
@@ -521,6 +542,127 @@ Status EagerExecute(EagerOperation* op,
   return status;
 }
 
+Status EagerRemoteExecute(EagerOperation* op, eager::EagerClient* eager_client,
+                          uint64 context_id, TensorHandle** retvals,
+                          int* num_retvals) {
+  // All tensors must be on the same device.
+  // TODO(nareshmodi): handle silent copies
+  eager::EnqueueRequest request;
+  eager::EnqueueResponse response;
+
+  auto* remote_op = request.add_queue()->mutable_operation();
+
+  for (auto* input : op->Inputs()) {
+    tensorflow::Device* input_device;
+    TF_RETURN_IF_ERROR(input->Device(&input_device));
+    if (op->Device() != input_device) {
+      return tensorflow::errors::InvalidArgument(
+          "Ops and inputs are not on the same device. Use "
+          "TFE_TensorHandleCopyToDevice to get ops on the same "
+          "device. Expected device: ",
+          op->Device()->name(), ", Actual device: ", input_device->name());
+    }
+
+    tensorflow::uint64 op_id;
+    int32 output_num;
+    TF_RETURN_IF_ERROR(input->RemoteAddress(&op_id, &output_num));
+
+    auto* remote_op_input = remote_op->add_inputs();
+    remote_op_input->set_op_id(op_id);
+    remote_op_input->set_output_num(output_num);
+  }
+
+  remote_op->set_id(op->EagerContext()->NextId());
+  remote_op->set_name(op->Name());
+  // Inputs set above.
+  op->Attrs().FillAttrValueMap(remote_op->mutable_attrs());
+  remote_op->set_device(op->Device()->name());
+
+  request.set_context_id(context_id);
+
+  if (op->EagerContext()->Async()) {
+    tensorflow::uint64 id = op->EagerContext()->NextId();
+    auto* node = new eager::RemoteExecuteNode(id, request, eager_client);
+    op->EagerContext()->ExecutorAdd(node);
+  } else {
+    Notification n;
+    Status status;
+    eager_client->EnqueueAsync(&request, &response,
+                               [&n, &status](const Status& s) {
+                                 status = s;
+                                 n.Notify();
+                               });
+    n.WaitForNotification();
+    if (!status.ok()) return status;
+  }
+
+  DataTypeVector output_dtypes;
+  TF_RETURN_IF_ERROR(GetOutputDTypes(op, &output_dtypes));
+
+  if (*num_retvals != output_dtypes.size()) {
+    return errors::InvalidArgument(
+        "num_retvals does not match expected output dtypes");
+  }
+
+  tensorflow::Device* op_device = op->Device();
+  EagerContext* ctx = op->EagerContext();
+
+  const tensorflow::uint64 id = remote_op->id();
+  for (int i = 0; i < *num_retvals; i++) {
+    // TODO(nareshmodi): Change the callback to instead add the decref to a list
+    // of pending decrefs that we can send as a batch with the next execute.
+    std::function<void()> callback = [ctx, eager_client, context_id, id, i]() {
+      eager::EnqueueRequest request;
+      request.set_context_id(context_id);
+
+      auto* handle_to_decref = request.add_queue()->mutable_handle_to_decref();
+      handle_to_decref->set_op_id(id);
+      handle_to_decref->set_output_num(i);
+
+      if (ctx->Async()) {
+        tensorflow::uint64 id = ctx->NextId();
+        auto* node = new eager::RemoteExecuteNode(id, request, eager_client);
+        ctx->ExecutorAdd(node);
+      } else {
+        Notification n;
+        eager::EnqueueResponse response;
+        eager_client->EnqueueAsync(
+            &request, &response,
+            [&n](const tensorflow::Status& s) { n.Notify(); });
+        n.WaitForNotification();
+      }
+
+      return tensorflow::Status::OK();
+    };
+    retvals[i] = new TensorHandle(remote_op->id(), i, output_dtypes[i],
+                                  std::move(callback), op_device, op_device,
+                                  op->EagerContext());
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+Status EagerExecute(EagerOperation* op,
+                    gtl::InlinedVector<TensorHandle*, 2>* retvals,
+                    int* num_retvals) {
+  bool op_is_local = IsLocal(op->EagerContext(), op->Device());
+
+  if (op_is_local) {
+    return EagerLocalExecute(op, retvals, num_retvals);
+  }
+
+  auto* ctx = op->EagerContext();
+
+  tensorflow::eager::EagerClient* eager_client;
+  tensorflow::uint64 context_id;
+  TF_RETURN_IF_ERROR(
+      ctx->GetClientAndContextID(op->Device(), &eager_client, &context_id));
+
+  return EagerRemoteExecute(op, eager_client, context_id, retvals->data(),
+                            num_retvals);
+}
+
 Status EagerExecute(EagerContext* ctx, Device* device,
                     const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
                     KernelAndDevice* kernel, NodeExecStats* maybe_stats,
@@ -593,13 +735,11 @@ Status EagerExecute(EagerContext* ctx, Device* device,
   return Status::OK();
 }
 
-Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
-                         const char* device_name, TensorHandle** result) {
+namespace {
+
+Status LocalEagerCopyToDevice(TensorHandle* h, EagerContext* ctx, Device* dstd,
+                              TensorHandle** result) {
   TF_RETURN_IF_ERROR(ctx->GetStatus());
-  Device* dstd = ctx->HostCPU();
-  if (device_name != nullptr && strlen(device_name) > 0) {
-    TF_RETURN_IF_ERROR(ctx->device_mgr()->LookupDevice(device_name, &dstd));
-  }
   if (ctx->Async()) {
     // Note that `h` may not be currently ready. However execution order will
     // make sure that `h` is ready before the copy is actually done.
@@ -616,4 +756,118 @@ Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
   }
 }
 
+Status FindDeviceFromName(EagerContext* ctx, const char* device_name,
+                          Device** device) {
+  *device = ctx->HostCPU();
+  if (device_name == nullptr || strlen(device_name) == 0) {
+    return Status::OK();
+  }
+
+  auto status = ctx->local_device_mgr()->LookupDevice(device_name, device);
+  if (status.ok()) {
+    return status;
+  }
+
+  if (ctx->remote_device_mgr() != nullptr) {
+    return ctx->remote_device_mgr()->LookupDevice(device_name, device);
+  }
+
+  return status;
+}
+
+Status ExecuteSend(EagerContext* ctx, tensorflow::Device* device,
+                   TensorHandle* h, StringPiece wire_id,
+                   const string& recv_device) {
+  const tensorflow::AttrTypeMap* types;
+  TF_RETURN_IF_ERROR(tensorflow::AttrTypeMapForOp("_Send", &types));
+  tensorflow::EagerOperation op(ctx, "_Send", types);
+
+  op.AddInput(h);
+
+  op.SetDevice(device);
+
+  op.MutableAttrs()->Set("tensor_name", wire_id);
+  op.MutableAttrs()->Set("send_device", device->name());
+  op.MutableAttrs()->Set(
+      "send_device_incarnation",
+      static_cast<int64>(device->attributes().incarnation()));
+  op.MutableAttrs()->Set("recv_device", recv_device);
+  op.MutableAttrs()->Set("client_terminated", false);
+
+  op.MutableAttrs()->Set("T", h->dtype);
+
+  int num_outputs = 0;
+  gtl::InlinedVector<TensorHandle*, 2> retvals;
+
+  return EagerExecute(&op, &retvals, &num_outputs);
+}
+
+Status ExecuteRecv(EagerContext* ctx, tensorflow::Device* device,
+                   DataType dtype, StringPiece wire_id,
+                   const string& send_device, int64 send_device_incarnation,
+                   TensorHandle** result) {
+  const tensorflow::AttrTypeMap* types;
+  TF_RETURN_IF_ERROR(tensorflow::AttrTypeMapForOp("_Recv", &types));
+  tensorflow::EagerOperation op(ctx, "_Recv", types);
+
+  op.SetDevice(device);
+
+  op.MutableAttrs()->Set("tensor_name", wire_id);
+  op.MutableAttrs()->Set("send_device", send_device);
+  op.MutableAttrs()->Set("send_device_incarnation", send_device_incarnation);
+  op.MutableAttrs()->Set("recv_device", device->name());
+  op.MutableAttrs()->Set("client_terminated", false);
+
+  op.MutableAttrs()->Set("tensor_type", dtype);
+
+  int num_outputs = 1;
+  gtl::InlinedVector<TensorHandle*, 2> retvals(num_outputs);
+
+  TF_RETURN_IF_ERROR(EagerExecute(&op, &retvals, &num_outputs));
+
+  *result = retvals.at(0);
+
+  return Status::OK();
+}
+
+// This gets a unique wire ID. We add a random identifier so that if the worker
+// has other clients that it is servicing, we don't have any collision.
+string GetUniqueWireID() {
+  static tensorflow::uint64 random_seed = random::New64();
+  static tensorflow::mutex wireid_mutex(tensorflow::LINKER_INITIALIZED);
+  static tensorflow::int64 wireid GUARDED_BY(wireid_mutex) = 0;
+  tensorflow::mutex_lock l(wireid_mutex);
+  return strings::StrCat(random_seed, "_", wireid++);
+}
+
+}  // namespace
+
+Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
+                         const char* device_name, TensorHandle** result) {
+  tensorflow::Device* send_device;
+  TF_RETURN_IF_ERROR(h->Device(&send_device));
+
+  if (send_device == nullptr) {
+    send_device = ctx->HostCPU();
+  }
+
+  bool sender_is_local = IsLocal(ctx, send_device);
+
+  tensorflow::Device* recv_device;
+  TF_RETURN_IF_ERROR(FindDeviceFromName(ctx, device_name, &recv_device));
+
+  bool recver_is_local = IsLocal(ctx, recv_device);
+
+  if (sender_is_local && recver_is_local) {
+    return LocalEagerCopyToDevice(h, ctx, recv_device, result);
+  } else {
+    string wire_id = GetUniqueWireID();
+
+    TF_RETURN_IF_ERROR(
+        ExecuteSend(ctx, send_device, h, wire_id, recv_device->name()));
+
+    return ExecuteRecv(ctx, recv_device, h->dtype, wire_id, send_device->name(),
+                       send_device->attributes().incarnation(), result);
+  }
+}
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
index 7c8d7e164d0b8061c9f8380fb238f17bcd6d74d3..f4f84980fb90a02f031f9651ef02d2c796a30934 100644
--- a/tensorflow/core/common_runtime/eager/execute.h
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -27,6 +27,15 @@ limitations under the License.
 namespace tensorflow {
 
 // Utility function that executes a fully constructed EagerOperation.
+// There are a few possible different combinations of how things can be
+// executed:
+//  - Async (the op context is configured to schedule asynchronously)
+//    Eager execute should return quickly after scheduling this operation to
+//    execute.
+//  - Remote (the op device is on a remote task)
+//    Eager execute will send an RPC to execute the op on a remote device.
+//  Note that in the Async + Remote case, EagerExecute should still return
+//  quickly, but it will schedule the op to be executed remotely.
 Status EagerExecute(
     EagerOperation* op,
     tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* retvals,
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
index dd055c3c3eb6df3eb440a78b7a8d3e72ff9335bd..b4349e1dee72626812fb3ef3b88dbb05424a1e65 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
@@ -18,13 +18,13 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/c/eager/runtime.h"
 #include "tensorflow/cc/client/client_session.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/eager/attr_builder.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
index 8e11f7b7104ce22cc585e2b03fcfd914e0eb80aa..1a811aa8df872868534717382dcc0cbb81bead70 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -50,6 +50,10 @@ bool TensorHandle::IsReady() {
   return is_ready_;
 }
 
+bool TensorHandle::IsRemote() {
+  return remote_op_id_ >= 0 && remote_output_num_ >= 0;
+}
+
 Status TensorHandle::WaitReady() {
   if (node_id == 0) return Status::OK();
   EagerExecutor* executor = nullptr;
@@ -62,6 +66,11 @@ Status TensorHandle::WaitReady() {
 }
 
 Status TensorHandle::Tensor(const tensorflow::Tensor** t) {
+  if (IsRemote()) {
+    return errors::Unavailable(
+        "Unable to get a tensor for a remote device. Please copy the tensor "
+        "handle to a local device using TFE_TensorHandleCopyToDevice");
+  }
   TF_RETURN_IF_ERROR(WaitReady());
   DCHECK(IsReady());
   *t = &tensor_;
@@ -85,6 +94,11 @@ Status TensorHandle::OpDevice(tensorflow::Device** d) {
 Status TensorHandle::TensorAndDevice(const tensorflow::Tensor** tensor,
                                      tensorflow::Device** device,
                                      tensorflow::Device** op_device) {
+  if (IsRemote()) {
+    return errors::Unavailable(
+        "Unable to get a tensor for a remote device. Please copy the tensor "
+        "handle to a local device using TFE_TensorHandleCopyToDevice");
+  }
   TF_RETURN_IF_ERROR(WaitReady());
   DCHECK(IsReady());
   *tensor = &tensor_;
@@ -93,6 +107,17 @@ Status TensorHandle::TensorAndDevice(const tensorflow::Tensor** tensor,
   return Status::OK();
 }
 
+Status TensorHandle::RemoteAddress(uint64* op_id, int32* output_num) {
+  if (!IsRemote()) {
+    return errors::FailedPrecondition(
+        "This TensorHandle refers to a local tensor handle");
+  }
+  *op_id = remote_op_id_;
+  *output_num = remote_output_num_;
+
+  return Status::OK();
+}
+
 void TensorHandle::SetTensorAndDevice(const tensorflow::Tensor& tensor,
                                       tensorflow::Device* device,
                                       tensorflow::Device* op_device) {
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
index d66c4d95e2a5513680f81e3f7c1875266b2dfb02..a3b7dd862e368a7ead9f6367007021ef4b4446d6 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.h
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -45,7 +45,7 @@ limitations under the License.
 namespace tensorflow {
 
 // Associates a Tensor and a Device, used in the eager runtime. Internal version
-// executor_of the TFE_TensorHandle struct and the python EagerTensor class
+// of the TFE_TensorHandle struct and the python EagerTensor class
 // (unrelated to python TensorHandle).
 class TensorHandle : public core::RefCounted {
  public:
@@ -55,6 +55,8 @@ class TensorHandle : public core::RefCounted {
         tensor_(t),
         device_(d),
         op_device_(op_device),
+        remote_op_id_(-1),
+        remote_output_num_(-1),
         ctx_(ctx),
         is_ready_(true) {}
 
@@ -64,12 +66,35 @@ class TensorHandle : public core::RefCounted {
         tensor_(dtype),
         device_(nullptr),
         op_device_(nullptr),
+        remote_op_id_(-1),
+        remote_output_num_(-1),
         ctx_(ctx),
         is_ready_(ctx == nullptr) {
     DCHECK_GT(node_id, 0);
   }
 
-  ~TensorHandle() override {}
+  // Remote tensor handle constructor.
+  TensorHandle(uint64 op_id, int32 output_num, DataType dtype,
+               std::function<void()> call_on_destroy, Device* d,
+               Device* op_device, EagerContext* ctx)
+      : dtype(dtype),
+        node_id(0),
+        device_(d),
+        op_device_(op_device),
+        remote_op_id_(op_id),
+        remote_output_num_(output_num),
+        call_on_destroy_(std::move(call_on_destroy)),
+        ctx_(ctx),
+        is_ready_(true) {
+    DCHECK(IsRemote()) << "Op ID and output num should be >= 0. Op ID: "
+                       << op_id << ", Output num: " << output_num;
+  }
+
+  ~TensorHandle() override {
+    if (call_on_destroy_) {
+      call_on_destroy_();
+    }
+  }
 
   Status Tensor(const tensorflow::Tensor** t);
 
@@ -81,6 +106,9 @@ class TensorHandle : public core::RefCounted {
                          tensorflow::Device** device,
                          tensorflow::Device** op_device);
 
+  // Return the op_id and output num if the handle refers to a remote tensor.
+  Status RemoteAddress(uint64* op_id, int32* output_num);
+
   // Note that this can be called at most once, and only on non-ready handles,
   // and makes them ready.
   void SetTensorAndDevice(const tensorflow::Tensor& tensor,
@@ -108,6 +136,8 @@ class TensorHandle : public core::RefCounted {
 
   bool IsReady();
 
+  bool IsRemote();
+
   // Id for the EagerNode that will compute the value pointed to by this handle.
   // If the value is 0, the handle is already ready, but not vice-versa.
   const uint64 node_id;
@@ -128,6 +158,15 @@ class TensorHandle : public core::RefCounted {
   // device_ for constant tensors.
   tensorflow::Device* op_device_;
 
+  // IDs required when this class is representing a remote tensor handle.
+  const uint64 remote_op_id_;
+  const int32 remote_output_num_;
+
+  // A callback that is executed when the class is destroyed.
+  //
+  // This is currently used for remote tensor handles.
+  const std::function<void()> call_on_destroy_;
+
   mutex ctx_mutex_;
 
   // `ctx` is only guaranteed to be set if the handle is not "ready". This is
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index e389eb9b2a8b5cf531c68984e1a872d85b911989..802bfee890556435cd03b16295ab16a24bf33392 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -272,9 +272,9 @@ struct NodeItem {
   // (uint8 is enough for DataType).
   //   EdgeInfo            out_edges[num_out_edges];
   //   AllocatorAttributes output_attr[num_outputs];
+  //   int                 forward_from[num_outputs];
   //   uint8               input_type[num_inputs];
   //   uint8               output_type[num_outputs];
-  //   int                 forward_from[num_outputs];
 
   // Return pointer to variable length section.
   char* var() const {
@@ -289,22 +289,20 @@ struct NodeItem {
     return reinterpret_cast<AllocatorAttributes*>(var() + sizeof(EdgeInfo) *
                                                               num_output_edges);
   }
+  int* forward_from_base() const {
+    return reinterpret_cast<int*>(var() + sizeof(EdgeInfo) * num_output_edges +
+                                  sizeof(AllocatorAttributes) * num_outputs);
+  }
   uint8* input_type_base() const {
-    return reinterpret_cast<uint8*>(var() +
-                                    sizeof(EdgeInfo) * num_output_edges +
-                                    sizeof(AllocatorAttributes) * num_outputs);
+    return reinterpret_cast<uint8*>(
+        var() + sizeof(EdgeInfo) * num_output_edges +
+        sizeof(AllocatorAttributes) * num_outputs + sizeof(int) * num_outputs);
   }
   uint8* output_type_base() const {
     return reinterpret_cast<uint8*>(
         var() + sizeof(EdgeInfo) * num_output_edges +
-        sizeof(AllocatorAttributes) * num_outputs + sizeof(uint8) * num_inputs);
-  }
-
-  int* forward_from_base() const {
-    return reinterpret_cast<int*>(var() + sizeof(EdgeInfo) * num_output_edges +
-                                  sizeof(AllocatorAttributes) * num_outputs +
-                                  sizeof(uint8) * num_inputs +
-                                  sizeof(uint8) * num_outputs);
+        sizeof(AllocatorAttributes) * num_outputs + sizeof(int) * num_outputs +
+        sizeof(uint8) * num_inputs);
   }
 
   TF_DISALLOW_COPY_AND_ASSIGN(NodeItem);
@@ -481,9 +479,9 @@ size_t GraphView::NodeItemBytes(const Node* n) {
       sizeof(NodeItem)                             // Fixed
       + num_output_edges * sizeof(EdgeInfo)        // output_edges[...]
       + num_outputs * sizeof(AllocatorAttributes)  // output_attr[...]
+      + num_outputs * sizeof(int)                  // forward_from[num_outputs]
       + num_inputs * sizeof(uint8)                 // input_type[num_inputs]
-      + num_outputs * sizeof(uint8)                // output_type[num_outputs]
-      + num_outputs * sizeof(int);                 // forward_from[num_outputs]
+      + num_outputs * sizeof(uint8);               // output_type[num_outputs]
   static constexpr size_t kItemAlignment = sizeof(NodeItem*);
   static_assert(kItemAlignment % alignof(NodeItem) == 0,
                 "NodeItem must be aligned with kItemAlignment");
@@ -1676,7 +1674,8 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
 
     if (vlog_) {
       VLOG(1) << "Process node: " << id << " step " << params.step_id << " "
-              << SummarizeNode(*node) << " is dead: " << tagged_node.is_dead;
+              << SummarizeNode(*node) << " is dead: " << tagged_node.is_dead
+              << " device: " << device->name();
     }
 
     Entry* input_tensors = GetInputTensors(input_frame, input_iter);
@@ -1737,7 +1736,8 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
             VLOG(2) << "Async kernel done: " << state->item->node->id()
                     << " step " << step_id_ << " "
                     << SummarizeNode(*state->item->node)
-                    << " is dead: " << state->tagged_node.is_dead;
+                    << " is dead: " << state->tagged_node.is_dead
+                    << " device: " << device->name();
           }
 
           // Clears inputs.
@@ -1790,7 +1790,8 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
       if (vlog_) {
         VLOG(2) << "Synchronous kernel done: " << id << " step "
                 << params.step_id << " " << SummarizeNode(*node)
-                << " is dead: " << tagged_node.is_dead;
+                << " is dead: " << tagged_node.is_dead
+                << " device: " << device->name();
       }
 
       // Clears inputs.
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index a6f637b48837a097e613fc8aa51485b26a94efdb..d05564e9c496095b4ff8882309068e0c87f2f482 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -208,19 +208,19 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
 
   // The instantiated and transformed function is encoded as a Graph
   // object, and an executor is created for the graph.
-  struct Item : public core::RefCounted {
-    bool invalidated = false;
+  struct Item {
+    uint64 instantiation_counter = 0;
     const Graph* graph = nullptr;                            // Owned by exec.
     const FunctionLibraryDefinition* overlay_lib = nullptr;  // Not owned.
     FunctionBody* func_graph = nullptr;
     Executor* exec = nullptr;
 
-    ~Item() override {
+    ~Item() {
       delete this->func_graph;
       delete this->exec;
     }
   };
-  std::unordered_map<Handle, Item*> items_ GUARDED_BY(mu_);
+  std::unordered_map<Handle, std::unique_ptr<Item>> items_ GUARDED_BY(mu_);
 
   ProcessFunctionLibraryRuntime* parent_ = nullptr;  // not owned.
 
@@ -284,9 +284,7 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
   }
 }
 
-FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {
-  for (auto p : items_) p.second->Unref();
-}
+FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {}
 
 // An asynchronous op kernel which executes an instantiated function
 // defined in a library.
@@ -490,30 +488,24 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
   options_copy.target = device_name_;
   const string key = Canonicalize(function_name, attrs, options_copy);
 
-  Handle found_handle = kInvalidHandle;
   {
     mutex_lock l(mu_);
-    found_handle = parent_->GetHandle(key);
-    if (found_handle != kInvalidHandle) {
+    *handle = parent_->GetHandle(key);
+    if (*handle != kInvalidHandle) {
       FunctionLibraryRuntime::LocalHandle handle_on_device =
-          parent_->GetHandleOnDevice(device_name_, found_handle);
+          parent_->GetHandleOnDevice(device_name_, *handle);
       if (handle_on_device == kInvalidLocalHandle) {
         return errors::Internal("LocalHandle not found for handle ", *handle,
                                 ".");
       }
-      auto iter = items_.find(handle_on_device);
-      if (iter == items_.end()) {
+      auto item_handle = items_.find(handle_on_device);
+      if (item_handle == items_.end()) {
         return errors::Internal("LocalHandle ", handle_on_device,
-                                " for handle ", found_handle,
+                                " for handle ", *handle,
                                 " not found in items.");
       }
-      Item* item = iter->second;
-      if (!item->invalidated) {
-        *handle = found_handle;
-        return Status::OK();
-      }
-      // *item is invalidated. Fall through and instantiate the given
-      // function_name/attrs/option again.
+      ++item_handle->second->instantiation_counter;
+      return Status::OK();
     }
   }
 
@@ -545,16 +537,18 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
 
   {
     mutex_lock l(mu_);
-    Handle found_handle_again = parent_->GetHandle(key);
-    if (found_handle_again != found_handle) {
+    *handle = parent_->GetHandle(key);
+    if (*handle != kInvalidHandle) {
       delete fbody;
-      *handle = found_handle_again;
+      ++items_[parent_->GetHandleOnDevice(device_name_, *handle)]
+            ->instantiation_counter;
     } else {
       *handle = parent_->AddHandle(key, device_name_, next_handle_);
       Item* item = new Item;
       item->func_graph = fbody;
       item->overlay_lib = options.overlay_lib;
-      items_.insert({next_handle_, item});
+      item->instantiation_counter = 1;
+      items_.emplace(next_handle_, std::unique_ptr<Item>(item));
       next_handle_++;
     }
   }
@@ -565,12 +559,17 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
   if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
     return parent_->ReleaseHandle(handle);
   }
+
   LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle);
   CHECK_NE(h, kInvalidLocalHandle);
   mutex_lock l(mu_);
   CHECK_EQ(1, items_.count(h));
-  Item* item = items_[h];
-  item->invalidated = true;  // Reinstantiate later.
+  std::unique_ptr<Item>& item = items_[h];
+  --item->instantiation_counter;
+  if (item->instantiation_counter == 0) {
+    items_.erase(h);
+    TF_RETURN_IF_ERROR(parent_->RemoveHandle(handle));
+  }
   return Status::OK();
 }
 
@@ -680,7 +679,7 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) {
       return errors::NotFound("Function handle ", handle,
                               " is not valid. Likely an internal error.");
     }
-    *item = items_[local_handle];
+    *item = items_[local_handle].get();
     if ((*item)->exec != nullptr) {
       return Status::OK();
     }
@@ -731,7 +730,6 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
   // computation is done and stored in *rets, we send the return values back
   // to the source_device (caller) so that the ProcFLR can receive them later.
   std::vector<Tensor>* remote_args = new std::vector<Tensor>;
-  item->Ref();
   ProcessFunctionLibraryRuntime::ReceiveTensorsAsync(
       source_device, target_device, "arg_", src_incarnation, args.size(),
       device_context, {}, rendezvous, remote_args,
@@ -743,7 +741,6 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
           s = frame->SetArgs(*remote_args);
         }
         if (!s.ok()) {
-          item->Unref();
           delete frame;
           delete remote_args;
           delete exec_args;
@@ -751,10 +748,9 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
           return;
         }
         item->exec->RunAsync(
-            *exec_args, [item, frame, rets, done, source_device, target_device,
+            *exec_args, [frame, rets, done, source_device, target_device,
                          target_incarnation, rendezvous, device_context,
                          remote_args, exec_args](const Status& status) {
-              core::ScopedUnref unref(item);
               Status s = status;
               if (s.ok()) {
                 s = frame->ConsumeRetvals(rets);
@@ -795,16 +791,16 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     };
   }
 
-  if (run_opts.runner == nullptr) {
-    run_opts.runner = &default_runner_;
-  }
-  DCHECK(run_opts.runner != nullptr);
-
   if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
     parent_->Run(run_opts, handle, args, rets, done);
     return;
   }
 
+  if (run_opts.runner == nullptr) {
+    run_opts.runner = &default_runner_;
+  }
+  DCHECK(run_opts.runner != nullptr);
+
   Executor::Args* exec_args = new Executor::Args;
   // Inherit the step_id from the caller.
   exec_args->step_id = run_opts.step_id;
@@ -840,13 +836,11 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     return;
   }
 
-  item->Ref();
   item->exec->RunAsync(
       // Executor args
       *exec_args,
       // Done callback.
-      [item, frame, rets, done, exec_args](const Status& status) {
-        core::ScopedUnref unref(item);
+      [frame, rets, done, exec_args](const Status& status) {
         Status s = status;
         if (s.ok()) {
           s = frame->ConsumeRetvals(rets);
@@ -906,7 +900,6 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   exec_args->runner = *run_opts.runner;
   exec_args->call_frame = frame;
 
-  item->Ref();
   item->exec->RunAsync(
       // Executor args
       *exec_args,
@@ -915,7 +908,6 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
           [item, frame, exec_args](DoneCallback done,
                                    // Start unbound arguments.
                                    const Status& status) {
-            core::ScopedUnref unref(item);
             delete exec_args;
             done(status);
           },
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 373fc64007e43e87d1a4af1263cabbd5757773c1..61b2f0e60f7ea6ca7f7b36f21845766399489795 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -231,8 +231,19 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       return status;
     }
     FunctionLibraryRuntime::Options opts;
-    TF_RETURN_IF_ERROR(Run(flr, handle, opts, args, rets, add_runner));
-    return flr->ReleaseHandle(handle);
+    status = Run(flr, handle, opts, args, rets, add_runner);
+    if (!status.ok()) return status;
+
+    // Release the handle and try running again. It should not succeed.
+    status = flr->ReleaseHandle(handle);
+    if (!status.ok()) return status;
+
+    Status status2 = Run(flr, handle, opts, args, std::move(rets));
+    EXPECT_TRUE(errors::IsInvalidArgument(status2));
+    EXPECT_TRUE(
+        str_util::StrContains(status2.error_message(), "remote execution."));
+
+    return status;
   }
 
   Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
@@ -293,8 +304,16 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       *rets[i] = retvals[i];
     }
 
-    // Release the handle.
-    return flr->ReleaseHandle(handle);
+    // Release the handle and try running again. It should not succeed.
+    status = flr->ReleaseHandle(handle);
+    if (!status.ok()) return status;
+
+    Status status2 = Run(flr, handle, opts, args, std::move(rets));
+    EXPECT_TRUE(errors::IsInvalidArgument(status2));
+    EXPECT_TRUE(
+        str_util::StrContains(status2.error_message(), "remote execution."));
+
+    return status;
   }
 
   std::unique_ptr<Graph> GetFuncBody(FunctionLibraryRuntime* flr,
diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc
index 98dac38a8cb903695506714ff07ab23f66594027..655a68cfc936c739fd9d90d0e39b46afb2bb1f45 100644
--- a/tensorflow/core/common_runtime/function_threadpool_test.cc
+++ b/tensorflow/core/common_runtime/function_threadpool_test.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -144,7 +143,19 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       return status;
     }
     FunctionLibraryRuntime::Options opts;
-    return Run(flr, handle, opts, args, std::move(rets), add_runner);
+    status = Run(flr, handle, opts, args, rets, add_runner);
+    if (!status.ok()) return status;
+
+    // Release the handle and try running again. It should not succeed.
+    status = flr->ReleaseHandle(handle);
+    if (!status.ok()) return status;
+
+    Status status2 = Run(flr, handle, opts, args, std::move(rets));
+    EXPECT_TRUE(errors::IsInvalidArgument(status2));
+    EXPECT_TRUE(
+        str_util::StrContains(status2.error_message(), "remote execution."));
+
+    return status;
   }
 
   Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
index 2f7fbbbec2a285976701b94c426bc3f870c65cf5..2d4c8d0201405f207a0d3b1867195bfb073f1d50 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
@@ -31,7 +31,9 @@ GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
                                  const string& name)
     : BFCAllocator(
           new GPUMemAllocator(
-              GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie()),
+              GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
+              gpu_options.per_process_gpu_memory_fraction() > 1.0 ||
+                  gpu_options.experimental().use_unified_memory()),
           total_memory, gpu_options.allow_growth(), name) {}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index ad142e9982a9ed255f6551526b6e7c0e6300d708..a3e0d0734ffa63b2da20ed0643599c3cb6fd056e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -50,8 +50,9 @@ class GPUBFCAllocator : public BFCAllocator {
 class GPUMemAllocator : public SubAllocator {
  public:
   // Note: stream_exec cannot be null.
-  explicit GPUMemAllocator(se::StreamExecutor* stream_exec)
-      : stream_exec_(stream_exec) {
+  explicit GPUMemAllocator(se::StreamExecutor* stream_exec,
+                           bool use_unified_memory)
+      : stream_exec_(stream_exec), use_unified_memory_(use_unified_memory) {
     CHECK(stream_exec_ != nullptr);
   }
   ~GPUMemAllocator() override {}
@@ -59,20 +60,29 @@ class GPUMemAllocator : public SubAllocator {
   void* Alloc(size_t alignment, size_t num_bytes) override {
     void* ptr = nullptr;
     if (num_bytes > 0) {
-      ptr = stream_exec_->AllocateArray<char>(num_bytes).opaque();
+      if (use_unified_memory_) {
+        ptr = stream_exec_->UnifiedMemoryAllocate(num_bytes);
+      } else {
+        ptr = stream_exec_->AllocateArray<char>(num_bytes).opaque();
+      }
     }
     return ptr;
   }
 
   void Free(void* ptr, size_t num_bytes) override {
     if (ptr != nullptr) {
-      se::DeviceMemoryBase gpu_ptr(ptr);
-      stream_exec_->Deallocate(&gpu_ptr);
+      if (use_unified_memory_) {
+        stream_exec_->UnifiedMemoryDeallocate(ptr);
+      } else {
+        se::DeviceMemoryBase gpu_ptr(ptr);
+        stream_exec_->Deallocate(&gpu_ptr);
+      }
     }
   }
 
  private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
+  const bool use_unified_memory_ = false;
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 9b434e5e2fdb94d070d655b93f3132074a6d0d72..cf5d11ec8b44ad3eef1a93d79757e78e74085565 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -104,8 +104,9 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
         reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
     stream_ = cuda_stream;
     allocator_ = alloc;
-    const int cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id).value();
-    device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id];
+    CudaGpuId cuda_gpu_id;
+    TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
+    device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id.value()];
   }
 
   const cudaStream_t& stream() const override { return *stream_; }
@@ -317,7 +318,9 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
   gpu_device_info_->stream = streams_[0]->compute;
   gpu_device_info_->default_context = device_contexts_[0];
   gpu_device_info_->event_mgr = em_.get();
-  gpu_device_info_->gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value();
+  CudaGpuId cuda_gpu_id;
+  TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id_, &cuda_gpu_id));
+  gpu_device_info_->gpu_id = cuda_gpu_id.value();
   set_tensorflow_gpu_device_info(gpu_device_info_);
 
   // Whether and how the GPU device uses its own threadpool.
@@ -428,6 +431,13 @@ void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
   }
 }
 
+string BaseGPUDevice::ComputeOpKernelDebugString(const OpKernel& op_kernel,
+                                                 const int& stream_id) {
+  return strings::StrCat(op_kernel.name(), " op ", op_kernel.type_string(),
+                         " on GPU ", tf_gpu_id_.value(), " stream[", stream_id,
+                         "]");
+}
+
 void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
                                   OpKernelContext* context) {
   GPUDeviceContext* gpu_device_context = device_contexts_[0];
@@ -442,9 +452,8 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
   const bool vlog_2 = vlog_1 && VLOG_IS_ON(2);
 
   if (vlog_1) {
-    VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op "
-            << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
-            << stream_id << "]";
+    VLOG(1) << "GpuDevice::ComputeHelper "
+            << ComputeOpKernelDebugString(*op_kernel, stream_id);
   }
 
   const auto num_streams = streams_.size();
@@ -488,6 +497,18 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
       // all streams.  Given that this flag is typically used for
       // debugging it makes more sense to sync all GPU activity.
       context->SetStatus(GPUUtil::SyncAll(this));
+      if (vlog_1) {
+        VLOG(1) << "GpuDevice::ComputeHelper finished "
+                << ComputeOpKernelDebugString(*op_kernel, stream_id);
+      }
+    } else if (vlog_1) {
+      VLOG(1) << "GpuDevice::ComputeHelper scheduled "
+              << ComputeOpKernelDebugString(*op_kernel, stream_id);
+    }
+  } else {
+    if (vlog_1) {
+      VLOG(1) << "GpuDevice::ComputeHelper failed to schedule "
+              << ComputeOpKernelDebugString(*op_kernel, stream_id);
     }
   }
 }
@@ -788,6 +809,20 @@ Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
   int64 allocated_memory = 0;
   const double per_process_gpu_memory_fraction =
       gpu_options.per_process_gpu_memory_fraction();
+  if (per_process_gpu_memory_fraction > 1.0 ||
+      gpu_options.experimental().use_unified_memory()) {
+    int cc_major = 0, cc_minor = 0;
+    if (!se->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                            &cc_minor)) {
+      return errors::Internal("Failed to get compute capability for device.");
+    }
+    if (cc_major < 6) {
+      return errors::Internal(
+          "Unified memory on GPUs with compute capability lower than 6.0 "
+          "(pre-Pascal class GPUs) does not support oversubscription.");
+    }
+  }
+
   if (per_process_gpu_memory_fraction == 0) {
     allocated_memory = available_memory;
     const int64 min_system_memory = MinSystemMemory(available_memory);
@@ -876,7 +911,8 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
   if (num_gpus_to_use > valid_cuda_gpu_ids.size()) {
     num_gpus_to_use = valid_cuda_gpu_ids.size();
   }
-  if (!valid_cuda_gpu_ids.empty()) {
+  // If we aren't going to use any GPUs, don't initialize them.
+  if (num_gpus_to_use > 0 && !valid_cuda_gpu_ids.empty()) {
     // Save the original device.
     int original_device = 0;
     cudaError_t err = cudaGetDevice(&original_device);
@@ -965,7 +1001,8 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
     while (next_tf_gpu_id < memory_limit_bytes.size()) {
       TfGpuId tf_gpu_id(next_tf_gpu_id);
       ++next_tf_gpu_id;
-      GpuIdManager::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id);
+      TF_RETURN_IF_ERROR(
+          GpuIdManager::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id));
     }
   }
   const int num_tf_gpus = next_tf_gpu_id;
@@ -1016,7 +1053,8 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
   const string device_name =
       strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
   GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
-  CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
+  CudaGpuId cuda_gpu_id;
+  TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
   int numa_node = dev_locality.numa_node();
 
   se::StreamExecutor* se =
@@ -1101,7 +1139,8 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     all_tf_gpu_ids.push_back(TfGpuId(i));
   }
   for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
-    CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
+    CudaGpuId cuda_gpu_id;
+    TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
     // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
     // virtualized in some environments, we can't just use the GPU id.
     // NUMA locales are indexed from 0, buses are indexed from 1.
@@ -1129,7 +1168,9 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     LocalLinks* links = dev_locality.mutable_links();
     for (const InterconnectMap& imap : interconnects) {
       for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
-        CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
+        CudaGpuId cuda_gpu_dst;
+        TF_RETURN_IF_ERROR(
+            GpuIdManager::TfToCudaGpuId(tf_gpu_dst, &cuda_gpu_dst));
         if (imap.directed_links.find({cuda_gpu_id, cuda_gpu_dst}) !=
             imap.directed_links.end()) {
           InterconnectLink* ilink = links->add_link();
@@ -1144,7 +1185,9 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     // add high strength links to the others.
     for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
       if (tf_gpu_id == tf_gpu_dst) continue;
-      CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
+      CudaGpuId cuda_gpu_dst;
+      TF_RETURN_IF_ERROR(
+          GpuIdManager::TfToCudaGpuId(tf_gpu_dst, &cuda_gpu_dst));
       if (cuda_gpu_id == cuda_gpu_dst) {
         InterconnectLink* ilink = links->add_link();
         ilink->set_device_id(tf_gpu_dst.value());
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index b754ffd2db00a2d6c57e7d38692f3629734648c1..737a3515b6b4991b6474de5dc99f945fc29350d1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -90,7 +90,11 @@ class BaseGPUDevice : public LocalDevice {
 
   // Returns the CUDA GPU id of this device within the native driver system;
   // e.g., for CUDA this is the ordinal of the GPU within the system.
-  int gpu_id() const { return GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value(); }
+  int gpu_id() const {
+    CudaGpuId cuda_gpu_id;
+    TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id_, &cuda_gpu_id));
+    return cuda_gpu_id.value();
+  }
 
   // The executor that provides control for the device; e.g., for CUDA this
   // corresponds to the cuda context.
@@ -135,6 +139,9 @@ class BaseGPUDevice : public LocalDevice {
 
   void ComputeHelper(OpKernel* op_kernel, OpKernelContext* context);
 
+  string ComputeOpKernelDebugString(const OpKernel& op_kernel,
+                                    const int& stream_id);
+
   // This method returns an initialization status, in addition to
   // calling the "done" StatusCallback, if there is a failure to
   // allocate memory or if the tensor "from" is not DMA-copyable.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index f3935f6ba26c49a9967d0848bfb6d965c73d2fab..5c6cb43eff17d309f45fb72f7d26eb1c5240663f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -17,19 +17,48 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_device.h"
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
+namespace {
 const char* kDeviceNamePrefix = "/job:localhost/replica:0/task:0";
 
+int64 GetTotalGPUMemory(CudaGpuId gpu_id) {
+  se::StreamExecutor* se =
+      GpuIdUtil::ExecutorForCudaGpuId(GPUMachineManager(), gpu_id).ValueOrDie();
+
+  int64 total_memory, available_memory;
+  CHECK(se->DeviceMemoryUsage(&available_memory, &total_memory));
+  return total_memory;
+}
+
+Status GetComputeCapability(CudaGpuId gpu_id, int* cc_major, int* cc_minor) {
+  se::StreamExecutor* se =
+      GpuIdUtil::ExecutorForCudaGpuId(GPUMachineManager(), gpu_id).ValueOrDie();
+  if (!se->GetDeviceDescription().cuda_compute_capability(cc_major, cc_minor)) {
+    *cc_major = 0;
+    *cc_minor = 0;
+    return errors::Internal("Failed to get compute capability for device.");
+  }
+  return Status::OK();
+}
+
+void ExpectErrorMessageSubstr(const Status& s, StringPiece substr) {
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
+      << s << ", expected substring " << substr;
+}
+}  // namespace
+
 class GPUDeviceTest : public ::testing::Test {
  public:
-  void TearDown() { ProcessState::singleton()->TestOnlyReset(); }
+  void TearDown() override { ProcessState::singleton()->TestOnlyReset(); }
 
  protected:
   static SessionOptions MakeSessionOptions(
@@ -52,11 +81,6 @@ class GPUDeviceTest : public ::testing::Test {
     }
     return options;
   }
-
-  static bool StartsWith(const string& lhs, const string& rhs) {
-    if (rhs.length() > lhs.length()) return false;
-    return lhs.substr(0, rhs.length()) == rhs;
-  }
 };
 
 TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
@@ -65,8 +89,7 @@ TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
-  EXPECT_TRUE(StartsWith(status.error_message(), "Could not parse entry"))
-      << status;
+  ExpectErrorMessageSubstr(status, "Could not parse entry");
 }
 
 TEST_F(GPUDeviceTest, InvalidGpuId) {
@@ -75,9 +98,8 @@ TEST_F(GPUDeviceTest, InvalidGpuId) {
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
-  EXPECT_TRUE(StartsWith(status.error_message(),
-                         "'visible_device_list' listed an invalid GPU id"))
-      << status;
+  ExpectErrorMessageSubstr(status,
+                           "'visible_device_list' listed an invalid GPU id");
 }
 
 TEST_F(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
@@ -86,9 +108,8 @@ TEST_F(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
-  EXPECT_TRUE(StartsWith(status.error_message(),
-                         "visible_device_list contained a duplicate entry"))
-      << status;
+  ExpectErrorMessageSubstr(status,
+                           "visible_device_list contained a duplicate entry");
 }
 
 TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithMemoryFractionSettings) {
@@ -97,9 +118,8 @@ TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithMemoryFractionSettings) {
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
-  EXPECT_TRUE(StartsWith(status.error_message(),
-                         "It's invalid to set per_process_gpu_memory_fraction"))
-      << status;
+  ExpectErrorMessageSubstr(
+      status, "It's invalid to set per_process_gpu_memory_fraction");
 }
 
 TEST_F(GPUDeviceTest, GpuDeviceCountTooSmall) {
@@ -110,9 +130,8 @@ TEST_F(GPUDeviceTest, GpuDeviceCountTooSmall) {
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::UNKNOWN);
-  EXPECT_TRUE(StartsWith(status.error_message(),
-                         "Not enough GPUs to create virtual devices."))
-      << status;
+  ExpectErrorMessageSubstr(status,
+                           "Not enough GPUs to create virtual devices.");
 }
 
 TEST_F(GPUDeviceTest, NotEnoughGpuInVisibleDeviceList) {
@@ -123,9 +142,8 @@ TEST_F(GPUDeviceTest, NotEnoughGpuInVisibleDeviceList) {
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::UNKNOWN);
-  EXPECT_TRUE(StartsWith(status.error_message(),
-                         "Not enough GPUs to create virtual devices."))
-      << status;
+  ExpectErrorMessageSubstr(status,
+                           "Not enough GPUs to create virtual devices.");
 }
 
 TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
@@ -138,11 +156,11 @@ TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
-  EXPECT_TRUE(StartsWith(status.error_message(),
-                         "The number of GPUs in visible_device_list doesn't "
-                         "match the number of elements in the virtual_devices "
-                         "list."))
-      << status;
+  ExpectErrorMessageSubstr(
+      status,
+      "The number of GPUs in visible_device_list doesn't "
+      "match the number of elements in the virtual_devices "
+      "list.");
 }
 
 TEST_F(GPUDeviceTest, EmptyVirtualDeviceConfig) {
@@ -153,7 +171,7 @@ TEST_F(GPUDeviceTest, EmptyVirtualDeviceConfig) {
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
-  for (auto d : devices) delete d;
+  gtl::STLDeleteElements(&devices);
 }
 
 TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
@@ -165,7 +183,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
-  for (auto d : devices) delete d;
+  gtl::STLDeleteElements(&devices);
 }
 
 TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) {
@@ -175,7 +193,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) {
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
-  for (auto d : devices) delete d;
+  gtl::STLDeleteElements(&devices);
 }
 
 TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
@@ -198,7 +216,67 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
             devices[1]->attributes().locality().links().link(0).type());
   EXPECT_EQ(BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength,
             devices[1]->attributes().locality().links().link(0).strength());
-  for (auto d : devices) delete d;
+  gtl::STLDeleteElements(&devices);
+}
+
+// Enabling unified memory on pre-Pascal GPUs results in an initialization
+// error.
+TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
+  int cc_major, cc_minor;
+  TF_ASSERT_OK(GetComputeCapability(CudaGpuId(0), &cc_major, &cc_minor));
+  // Exit early while running on Pascal or later GPUs.
+  if (cc_major >= 6) {
+    return;
+  }
+
+  SessionOptions opts = MakeSessionOptions("0", /*memory_fraction=*/1.2);
+  opts.config.mutable_gpu_options()
+      ->mutable_experimental()
+      ->set_use_unified_memory(true);
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::INTERNAL);
+  ExpectErrorMessageSubstr(status, "does not support oversubscription.");
+}
+
+// Enabling unified memory on Pascal or later GPUs makes it possible to allocate
+// more memory than what is available on the device.
+TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
+  static constexpr double kGpuMemoryFraction = 1.2;
+  static constexpr CudaGpuId kCudaGpuId(0);
+
+  int cc_major, cc_minor;
+  TF_ASSERT_OK(GetComputeCapability(kCudaGpuId, &cc_major, &cc_minor));
+  // Exit early if running on pre-Pascal GPUs.
+  if (cc_major < 6) {
+    LOG(INFO)
+        << "Unified memory allocation is not supported with pre-Pascal GPUs.";
+    return;
+  }
+
+  SessionOptions opts = MakeSessionOptions("0", kGpuMemoryFraction);
+  std::vector<tensorflow::Device*> devices;
+  TF_ASSERT_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  ASSERT_EQ(1, devices.size());
+
+  int64 memory_limit = devices[0]->attributes().memory_limit();
+  ASSERT_EQ(memory_limit, static_cast<int64>(GetTotalGPUMemory(kCudaGpuId) *
+                                             kGpuMemoryFraction));
+
+  AllocatorAttributes allocator_attributes = AllocatorAttributes();
+  allocator_attributes.set_gpu_compatible(true);
+  Allocator* allocator = devices[0]->GetAllocator(allocator_attributes);
+
+  // Try to allocate all the available memory after rounding down to the nearest
+  // multiple of MB.
+  void* ptr = allocator->AllocateRaw(Allocator::kAllocatorAlignment,
+                                     (memory_limit >> 20) << 20);
+  EXPECT_NE(ptr, nullptr);
+  allocator->DeallocateRaw(ptr);
+
+  gtl::STLDeleteElements(&devices);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index 1d4ad957b94bfcc0381fd6250c08ac8d34f90ff6..c5ff6c97a176a871e4fd47555f6ea8e3513ab8c2 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -119,7 +119,7 @@ TEST(EventMgr, DelayedPolling) {
   EXPECT_EQ(0, th.queue_size());
   TensorReferenceVector* v = nullptr;
   std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
-  CHECK(stream.get());
+  CHECK(stream);
   stream->Init();
   for (int i = 0; i < 5; ++i) {
     v = new TensorReferenceVector;
@@ -151,7 +151,7 @@ TEST(EventMgr, FlushLargeTensorImmediately) {
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
   std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
-  CHECK(stream.get());
+  CHECK(stream);
   stream->Init();
   for (int i = 0; i < 5; ++i) {
     TensorReferenceVector v;
@@ -168,7 +168,7 @@ TEST(EventMgr, ManySmallTensorsFlushedImmediately) {
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
   std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
-  CHECK(stream.get());
+  CHECK(stream);
   stream->Init();
   for (int i = 0; i < 5; ++i) {
     TensorReferenceVector v;
@@ -209,7 +209,7 @@ TEST(EventMgr, ManySmallTensorsSeparateCallsFlushed) {
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
   std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
-  CHECK(stream.get());
+  CHECK(stream);
   stream->Init();
   for (int i = 0; i < 5; ++i) {
     for (int i = 0; i < 1000; i++) {
@@ -232,7 +232,7 @@ TEST(EventMgr, NonEmptyShutdown) {
   EXPECT_EQ(0, th.queue_size());
   EXPECT_EQ(0, th.free_size());
   std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
-  CHECK(stream.get());
+  CHECK(stream);
   stream->Init();
   for (int i = 0; i < 5; ++i) {
     TensorReferenceVector* v = new TensorReferenceVector;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
index 7dfff3269cf91582adf783dcd15dd55d1c4e1451..b5099dc8ef07dcfc64bb41482cefbbcf90141e3e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
@@ -34,46 +34,40 @@ class TfToCudaGpuIdMap {
     return id_map;
   }
 
-  void InsertOrDie(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id)
-      LOCKS_EXCLUDED(mu_) {
+  Status Insert(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id) LOCKS_EXCLUDED(mu_) {
     std::pair<IdMapType::iterator, bool> result;
     {
       mutex_lock lock(mu_);
       result = id_map_.insert({tf_gpu_id.value(), cuda_gpu_id.value()});
     }
-    if (!result.second) {
-      CHECK_EQ(cuda_gpu_id.value(), result.first->second)
-          << "Mapping the same TfGpuId to a different CUDA GPU id."
-          << " TfGpuId: " << tf_gpu_id
-          << " Existing mapped CUDA GPU id: " << result.first->second
-          << " CUDA GPU id being tried to map to: " << cuda_gpu_id;
+    if (!result.second && cuda_gpu_id.value() != result.first->second) {
+      return errors::AlreadyExists(
+          "TensorFlow device (GPU:", tf_gpu_id.value(),
+          ") is being mapped to "
+          "multiple CUDA devices (",
+          cuda_gpu_id.value(), " now, and ", result.first->second,
+          " previously), which is not supported. "
+          "This may be the result of providing different GPU configurations "
+          "(ConfigProto.gpu_options, for example different visible_device_list)"
+          " when creating multiple Sessions in the same process. This is not "
+          " currently supported, see "
+          "https://github.com/tensorflow/tensorflow/issues/19083");
     }
-  }
-
-  CudaGpuId FindOrDie(TfGpuId tf_gpu_id) const LOCKS_EXCLUDED(mu_) {
-    mutex_lock lock(mu_);
-    return FindOrDieLocked(tf_gpu_id);
+    return Status::OK();
   }
 
   bool Find(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id) const
       LOCKS_EXCLUDED(mu_) {
     mutex_lock lock(mu_);
-    if (id_map_.count(tf_gpu_id.value()) == 0) return false;
-    *cuda_gpu_id = FindOrDieLocked(tf_gpu_id);
+    auto result = id_map_.find(tf_gpu_id.value());
+    if (result == id_map_.end()) return false;
+    *cuda_gpu_id = result->second;
     return true;
   }
 
  private:
   TfToCudaGpuIdMap() = default;
 
-  CudaGpuId FindOrDieLocked(TfGpuId tf_gpu_id) const
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    auto result = id_map_.find(tf_gpu_id.value());
-    CHECK(result != id_map_.end())
-        << "Could not find the mapping for TfGpuId: " << tf_gpu_id;
-    return CudaGpuId(result->second);
-  }
-
   void TestOnlyReset() LOCKS_EXCLUDED(mu_) {
     mutex_lock lock(mu_);
     id_map_.clear();
@@ -88,23 +82,19 @@ class TfToCudaGpuIdMap {
 };
 }  // namespace
 
-void GpuIdManager::InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id,
-                                         CudaGpuId cuda_gpu_id) {
-  TfToCudaGpuIdMap::singleton()->InsertOrDie(tf_gpu_id, cuda_gpu_id);
+Status GpuIdManager::InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id,
+                                           CudaGpuId cuda_gpu_id) {
+  return TfToCudaGpuIdMap::singleton()->Insert(tf_gpu_id, cuda_gpu_id);
 }
 
 Status GpuIdManager::TfToCudaGpuId(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id) {
   if (TfToCudaGpuIdMap::singleton()->Find(tf_gpu_id, cuda_gpu_id)) {
     return Status::OK();
   }
-  return errors::NotFound("TF GPU device with id ", tf_gpu_id.value(),
+  return errors::NotFound("TensorFlow device GPU:", tf_gpu_id.value(),
                           " was not registered");
 }
 
-CudaGpuId GpuIdManager::TfToCudaGpuId(TfGpuId tf_gpu_id) {
-  return TfToCudaGpuIdMap::singleton()->FindOrDie(tf_gpu_id);
-}
-
 void GpuIdManager::TestOnlyReset() {
   TfToCudaGpuIdMap::singleton()->TestOnlyReset();
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager.h b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h
index 2b54cc184ca508b94e2a715642cdb13fe8a4c3e1..491d92ccddd4667bc32ce17bdec36d04836120cd 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_manager.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h
@@ -26,13 +26,10 @@ namespace tensorflow {
 class GpuIdManager {
  public:
   // Adds a mapping from tf_gpu_id to cuda_gpu_id.
-  static void InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id);
+  static Status InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id);
 
   // Gets the cuda_gpu_id associated with tf_gpu_id. Returns OK if found.
   static Status TfToCudaGpuId(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id);
-  // Similar to the above version, but returns the result, and checks fail if
-  // no result is found.
-  static CudaGpuId TfToCudaGpuId(TfGpuId tf_gpu_id);
 
   // Clears the map. Used in unit tests only.
   static void TestOnlyReset();
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc b/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc
index bdbd8d065b398159305504202ed342c08cc3ee7d..a663ec705107541ee5e607a020d72d7f39622b76 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc
@@ -16,40 +16,45 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
-namespace test {
+namespace {
+
+CudaGpuId TfToCudaGpuId(TfGpuId tf) {
+  CudaGpuId cuda;
+  TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf, &cuda));
+  return cuda;
+}
 
 TEST(GpuIdManagerTest, Basics) {
   TfGpuId key_0(0);
   CudaGpuId value_0(0);
-  GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0);
-  EXPECT_EQ(value_0, GpuIdManager::TfToCudaGpuId(key_0));
+  TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0));
+  EXPECT_EQ(value_0, TfToCudaGpuId(key_0));
 
   // Multiple calls to map the same value is ok.
-  GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0);
-  EXPECT_EQ(value_0, GpuIdManager::TfToCudaGpuId(key_0));
+  TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0));
+  EXPECT_EQ(value_0, TfToCudaGpuId(key_0));
 
   // Map a different TfGpuId to a different value.
   TfGpuId key_1(3);
   CudaGpuId value_1(2);
-  GpuIdManager::InsertTfCudaGpuIdPair(key_1, value_1);
-  EXPECT_EQ(value_1, GpuIdManager::TfToCudaGpuId(key_1));
+  TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_1, value_1));
+  EXPECT_EQ(value_1, TfToCudaGpuId(key_1));
 
   // Mapping a different TfGpuId to the same value is ok.
   TfGpuId key_2(10);
-  GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_1);
-  EXPECT_EQ(value_1, GpuIdManager::TfToCudaGpuId(key_2));
+  TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_1));
+  EXPECT_EQ(value_1, TfToCudaGpuId(key_2));
 
-  // Mapping the same TfGpuId to a different value will crash the program.
-  ASSERT_DEATH(GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_0),
-               "Mapping the same TfGpuId to a different CUDA GPU id");
+  // Mapping the same TfGpuId to a different value.
+  ASSERT_FALSE(GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_0).ok());
 
-  // Getting an nonexistent mapping will crash the program.
-  ASSERT_DEATH(GpuIdManager::TfToCudaGpuId(TfGpuId(100)),
-               "Could not find the mapping for TfGpuId");
+  // Getting a nonexistent mapping.
+  ASSERT_FALSE(GpuIdManager::TfToCudaGpuId(TfGpuId(100), &value_0).ok());
 }
 
-}  // namespace test
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
index 42bf074e630363ba20b360c270cd13367c13f254..b9c66b33286a70bbe13a3917a5e65e49666b4fe9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
@@ -39,12 +39,15 @@ class GpuIdUtil {
   }
   static se::port::StatusOr<se::StreamExecutor*> ExecutorForTfGpuId(
       TfGpuId tf_gpu_id) {
-    return ExecutorForCudaGpuId(GpuIdManager::TfToCudaGpuId(tf_gpu_id));
+    CudaGpuId cuda_gpu_id;
+    TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
+    return ExecutorForCudaGpuId(cuda_gpu_id);
   }
 
   // Verify that the cuda_gpu_id associated with a TfGpuId is legitimate.
   static void CheckValidTfGpuId(TfGpuId tf_gpu_id) {
-    const CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
+    CudaGpuId cuda_gpu_id;
+    TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
     const int visible_device_count = GPUMachineManager()->VisibleDeviceCount();
     CHECK_LT(cuda_gpu_id.value(), visible_device_count)
         << "cuda_gpu_id is outside discovered device range."
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 5ed01278c137576661ed90073d1e968cade61696..2b442071e25f9d01a708a00c2ba20930f9b79c7e 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -126,7 +126,8 @@ Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options,
       return nullptr;
     }
 
-    const CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
+    CudaGpuId cuda_gpu_id;
+    TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
     gpu_allocator =
         new GPUBFCAllocator(cuda_gpu_id, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
diff --git a/tensorflow/core/common_runtime/lower_if_op.cc b/tensorflow/core/common_runtime/lower_if_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b5fee36ff43e0808ee4128d55dcbcf2935889f0d
--- /dev/null
+++ b/tensorflow/core/common_runtime/lower_if_op.cc
@@ -0,0 +1,283 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/lower_if_op.h"
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+
+namespace tensorflow {
+
+// TODO(jpienaar): Consider making it a public attribute.
+const char* const LowerIfOpPass::kLowerUsingSwitchMergeAttr =
+    "_lower_using_switch_merge";
+
+namespace {
+
+using NodeOut = NodeBuilder::NodeOut;
+
+// Convenience builder to make it easy to construct a conditional with a single
+// function call in the then and else branch. This first converts the if node
+// into switches (for inputs) and merges (for outputs) around a function call
+// per branch, then inlines the function calls.
+class CondBuilder {
+ public:
+  enum Branch { kElseBranch = 0, kThenBranch = 1 };
+
+  // Create a CondBuilder to create the lowering of If op.  that has then and
+  // else functions named `then_fn_name` and `else_fn_name` respectively in the
+  // given graph.
+  CondBuilder(Node* if_op, const string& then_fn_name,
+              const string& else_fn_name, Graph* graph);
+
+  // Constructs the basic conditional control flow using switch and merge nodes.
+  Status CreatePivotNodes();
+
+  // Adds the inputs from the if node to the merge nodes of the lowered if.
+  Status AddInputs();
+
+  // Adds the outputs from the if node to the merge nodes of the lowered if.
+  // Note: no inputs can be added once outputs are added as the then and else
+  // nodes are finalized while adding outputs.
+  Status AddOutputs();
+
+  // Builds an identity node with the same outputs as If.
+  Status BuildLoweredIfOutput();
+
+  // Inline call nodes for then and else.
+  Status InlineCallNodes();
+
+ private:
+  // Returns unique name containing the name of the If op being rewritten
+  // (name_), infix and a suffix to ensure it is unique within the graph.
+  string NewName(const string& infix);
+
+  // Adds input to both the then and else nodes from src:src_output.
+  Status AddInput(Node* src, int src_output);
+
+  // The merged outputs of the then and else nodes.
+  std::vector<NodeOut> outputs_;
+
+  // The node that dominates all execution of the then and else body nodes.
+  Node* control_predecessor_;
+  // The original If op.
+  Node* if_op_;
+  // The identity node with the same outputs as the original If op.
+  Node* lowered_if_output_;
+  // The predicate of the conditional.
+  Node* pred_;
+  // Node corresponding to pivot_f branch of predicate switch which is
+  // the pivot node that dominates all nodes in the false/else branch.
+  Node* pivot_f_;
+  // Node corresponding to pivot_t branch of predicate switch which is
+  // the pivot node that dominates all nodes in the true/then branch.
+  Node* pivot_t_;
+  Node* then_call_node_;
+  Node* else_call_node_;
+  Graph* graph_;
+  string name_;
+
+  NodeBuilder then_call_builder_;
+  NodeBuilder else_call_builder_;
+};
+
+CondBuilder::CondBuilder(Node* if_op, const string& then_fn_name,
+                         const string& else_fn_name, Graph* graph)
+    : if_op_(if_op),
+      graph_(graph),
+      name_(if_op->name()),
+      then_call_builder_(NewName("then"), then_fn_name, graph->op_registry()),
+      else_call_builder_(NewName("else"), else_fn_name, graph->op_registry()) {
+  TF_CHECK_OK(if_op_->input_node(0, &pred_));
+}
+
+Status CondBuilder::CreatePivotNodes() {
+  // Construct the basic cond body (consisting of feeding in the predicate to
+  // create pivot nodes).
+  Node* switch_pred;
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(NewName("switch_pred"), "Switch", graph_->op_registry())
+          .Input(NodeOut(pred_, 0))
+          .Input(NodeOut(pred_, 0))
+          .Finalize(graph_, &switch_pred));
+  control_predecessor_ = switch_pred;
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(NewName("pivot_f"), "Identity", graph_->op_registry())
+          .Input(switch_pred, kElseBranch)
+          .Finalize(graph_, &pivot_f_));
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(NewName("pivot_t"), "Identity", graph_->op_registry())
+          .Input(switch_pred, kThenBranch)
+          .Finalize(graph_, &pivot_t_));
+  return Status::OK();
+}
+
+string CondBuilder::NewName(const string& infix) {
+  return graph_->NewName(strings::StrCat(name_, "/", infix));
+}
+
+Status CondBuilder::AddInput(Node* src, int src_output) {
+  Node* input;
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(NewName(src->name()), "Switch", graph_->op_registry())
+          .Input(src, src_output)
+          .Input(pred_, 0)
+          .Finalize(graph_, &input));
+  then_call_builder_.Input(input, kThenBranch);
+  else_call_builder_.Input(input, kElseBranch);
+  return Status::OK();
+}
+
+Status CondBuilder::AddInputs() {
+  // Add input data edges.
+  std::vector<const Edge*> edges;
+  TF_RETURN_IF_ERROR(if_op_->input_edges(&edges));
+  // Start at index 1 as the first input is the predicate.
+  for (int i = 1; i < edges.size(); ++i) {
+    const Edge* e = edges[i];
+    TF_RETURN_IF_ERROR(AddInput(e->src(), e->src_output()));
+  }
+  // Add input control edges.
+  for (const Edge* e : if_op_->in_edges()) {
+    if (e->IsControlEdge()) {
+      graph_->AddControlEdge(e->src(), control_predecessor_);
+    }
+  }
+  return Status::OK();
+}
+
+Status CondBuilder::AddOutputs() {
+  // Construct the then and else nodes.
+  TF_RETURN_IF_ERROR(then_call_builder_.Finalize(graph_, &then_call_node_));
+  graph_->AddControlEdge(pivot_t_, then_call_node_);
+  TF_RETURN_IF_ERROR(else_call_builder_.Finalize(graph_, &else_call_node_));
+  graph_->AddControlEdge(pivot_f_, else_call_node_);
+
+  // Merge the outputs from the two branches.
+  std::vector<Node*> merges(then_call_node_->num_outputs());
+  outputs_.resize(merges.size());
+  for (int i = 0; i < then_call_node_->num_outputs(); ++i) {
+    TF_RETURN_IF_ERROR(
+        NodeBuilder(graph_->NewName("merge"), "Merge", graph_->op_registry())
+            .Input({NodeOut(then_call_node_, i), NodeOut(else_call_node_, i)})
+            .Finalize(graph_, &merges[i]));
+    outputs_[i] = NodeOut(merges[i], 0);
+  }
+
+  TF_RETURN_IF_ERROR(BuildLoweredIfOutput());
+
+  // Add outputs.
+  for (const Edge* e : if_op_->out_edges()) {
+    if (e->IsControlEdge()) {
+      graph_->AddControlEdge(lowered_if_output_, e->dst());
+    } else {
+      // Feed the outputs directly from the merge nodes so that downstream ops
+      // can start before all the outputs have been computed.
+      graph_->AddEdge(merges[e->src_output()], e->src_output(), e->dst(),
+                      e->dst_input());
+    }
+  }
+  return Status::OK();
+}
+
+Status InlineCallInGraph(Node* n, Graph* g) {
+  const auto& lib = g->flib_def();
+  const FunctionDef* fdef = lib.Find(n->type_string());
+  CHECK(fdef != nullptr);
+  FunctionBody* fbody;
+  TF_RETURN_IF_ERROR(
+      FunctionDefToBodyHelper(*fdef, n->attrs(), &lib,
+                              [&lib](const string& op, const OpDef** sig) {
+                                return lib.LookUpOpDef(op, sig);
+                              },
+                              &fbody));
+  // TODO(jpienaar): Improve this interface to make the need to delete it
+  // explicit.
+  InlineFunctionBody(g->flib_def(), g, n, fbody);
+  delete fbody;
+  return Status::OK();
+}
+
+Status CondBuilder::BuildLoweredIfOutput() {
+  // Build the identity node output.
+  NodeBuilder ib(name_, "IdentityN");
+  ib.Input(outputs_);
+  return ib.Finalize(graph_, &lowered_if_output_);
+}
+
+Status CondBuilder::InlineCallNodes() {
+  TF_RETURN_IF_ERROR(InlineCallInGraph(then_call_node_, graph_));
+  TF_RETURN_IF_ERROR(InlineCallInGraph(else_call_node_, graph_));
+  return Status::OK();
+}
+
+}  // namespace
+
+Status LowerIfOpPass::Run(const GraphOptimizationPassOptions& options) {
+  if (options.partition_graphs != nullptr) {
+    return errors::Internal(
+        "Lowering If op should happen before partitioning.");
+  }
+  if (options.graph == nullptr) {
+    return Status::OK();
+  }
+
+  Graph* g = options.graph->get();
+  if (g == nullptr) {
+    return errors::Internal("Lowering If op requires a graph to be available.");
+  }
+
+  // Match all the nodes that need to be rewritten.
+  gtl::InlinedVector<Node*, 2> matches;
+  for (Node* n : g->op_nodes()) {
+    if (n->type_string() == "If") {
+      // Only rewrite if the If op is marked as needing to be lowered.
+      bool match;
+      Status s = GetNodeAttr(n->attrs(), kLowerUsingSwitchMergeAttr, &match);
+      if (s.ok() && match) matches.push_back(n);
+    }
+  }
+  for (Node* n : matches) {
+    TF_RETURN_IF_ERROR(RewriteNode(n, g));
+  }
+  return Status::OK();
+}
+
+Status LowerIfOpPass::RewriteNode(Node* n, Graph* g) {
+  const AttrValue* then_attr = n->attrs().Find("then_branch");
+  if (then_attr == nullptr) {
+    return errors::InvalidArgument("Then branch function missing");
+  }
+  const AttrValue* else_attr = n->attrs().Find("else_branch");
+  if (else_attr == nullptr) {
+    return errors::InvalidArgument("Else branch function missing");
+  }
+
+  CondBuilder cb(n, then_attr->func().name(), else_attr->func().name(), g);
+  TF_RETURN_IF_ERROR(cb.CreatePivotNodes());
+  TF_RETURN_IF_ERROR(cb.AddInputs());
+  TF_RETURN_IF_ERROR(cb.AddOutputs());
+  TF_RETURN_IF_ERROR(cb.InlineCallNodes());
+  g->RemoveNode(n);
+
+  return Status::OK();
+}
+
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 0,
+                      LowerIfOpPass);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/lower_if_op.h b/tensorflow/core/common_runtime/lower_if_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a9ef39ae5c828d9f540153b93cd556cafc82b1c1
--- /dev/null
+++ b/tensorflow/core/common_runtime/lower_if_op.h
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_IF_OP_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_IF_OP_H_
+
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Rewrite If ops to use switch and merge nodes instead.
+class LowerIfOpPass : public GraphOptimizationPass {
+ public:
+  static const char* const kLowerUsingSwitchMergeAttr;
+
+  Status Run(const GraphOptimizationPassOptions& options) override;
+
+ private:
+  // Rewrite the given If node `n` in graph `g` to use the switch-merge form.
+  Status RewriteNode(Node* n, Graph* g);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_IF_OP_H_
diff --git a/tensorflow/core/common_runtime/lower_if_op_test.cc b/tensorflow/core/common_runtime/lower_if_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..319a617b32259143a83b11c39f5477814919d253
--- /dev/null
+++ b/tensorflow/core/common_runtime/lower_if_op_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/lower_if_op.h"
+
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+Status Rewrite(std::unique_ptr<Graph>* graph) {
+  FunctionDefLibrary flib;
+  FunctionLibraryDefinition flib_def((*graph)->op_registry(), flib);
+
+  GraphOptimizationPassOptions opt_options;
+  opt_options.graph = graph;
+  opt_options.flib_def = &flib_def;
+  LowerIfOpPass pass;
+  return pass.Run(opt_options);
+}
+
+TEST(LowerIfOpTest, Simple) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  // Add test functions for then and else branch.
+  FunctionDefLibrary f_lib_proto;
+  *(f_lib_proto.add_function()) = test::function::XTimesTwo();
+  *(f_lib_proto.add_function()) = test::function::XTimesFour();
+  FunctionLibraryDefinition f_lib(OpRegistry::Global(), f_lib_proto);
+
+  // Construct simple conditional that switches on `pred` and operates only on
+  // single input `A`.
+  Scope root = Scope::NewRootScope().ExitOnError();
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(f_lib_proto));
+  auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+  auto pred = ops::_Arg(root.WithOpName("pred"), DT_BOOL, 1);
+  Node* written_if;
+  std::vector<NodeBuilder::NodeOut> inputs({NodeBuilder::NodeOut(a.node())});
+  AttrValue tb;
+  tb.mutable_func()->set_name("XTimesTwo");
+  AttrValue eb;
+  eb.mutable_func()->set_name("XTimesFour");
+  TF_ASSERT_OK(NodeBuilder("if", "If", &f_lib)
+                   .Input(pred.node())
+                   .Input(inputs)
+                   .Attr("then_branch", tb)
+                   .Attr("else_branch", eb)
+                   .Attr(LowerIfOpPass::kLowerUsingSwitchMergeAttr, true)
+                   .Attr("Tout", {DT_INT32})
+                   .Finalize(root.graph(), &written_if));
+  TF_ASSERT_OK(root.DoShapeInference(written_if));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  // The input graph has no switch or merge nodes.
+  int node_called_if_count = 0;
+  for (const auto* op : graph->op_nodes()) {
+    ASSERT_FALSE(op->IsSwitch());
+    ASSERT_FALSE(op->IsMerge());
+    if (op->name() == "if") {
+      ++node_called_if_count;
+    }
+  }
+  ASSERT_EQ(node_called_if_count, 1);
+
+  TF_ASSERT_OK(Rewrite(&graph));
+
+  // Verify the resultant graph has switch and merge nodes, and a node called
+  // `if` (but not If nodes).
+  int switch_count = 0;
+  int merge_count = 0;
+  node_called_if_count = 0;
+  for (const auto* op : graph->op_nodes()) {
+    if (op->IsSwitch()) {
+      ++switch_count;
+    }
+    if (op->IsMerge()) {
+      ++merge_count;
+    }
+    ASSERT_NE(op->type_string(), "If");
+    if (op->name() == "if") {
+      ++node_called_if_count;
+    }
+  }
+  // One switch for predicate and one for input (A).
+  ASSERT_EQ(switch_count, 2);
+  // One merge for the single output values of then and else.
+  ASSERT_EQ(merge_count, 1);
+  ASSERT_EQ(node_called_if_count, 1);
+
+  // Verify execution.
+  ClientSession session(root);
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(pred.node()), Input::Initializer(false));
+    feeds.emplace(Output(a.node()), Input::Initializer(10));
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(written_if)}, &out_tensors));
+    EXPECT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 40);
+  }
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(pred.node()), Input::Initializer(true));
+    feeds.emplace(Output(a.node()), Input::Initializer(10));
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(written_if)}, &out_tensors));
+    EXPECT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 20);
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5d583a8360b8a91d015bb3f42b60e6a34002b2d8
--- /dev/null
+++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+
+#ifdef _OPENMP
+TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) {
+  SessionOptions options;
+  unsetenv("OMP_NUM_THREADS");
+
+  ThreadPoolDevice* tp = new ThreadPoolDevice(
+      options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator());
+
+  const int ht = port::NumHyperthreadsPerCore();
+  EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht);
+}
+
+TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) {
+  SessionOptions options;
+  setenv("OMP_NUM_THREADS", "314", 1);
+
+  ThreadPoolDevice* tp = new ThreadPoolDevice(
+      options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator());
+
+  EXPECT_EQ(omp_get_max_threads(), 314);
+}
+#endif  // _OPENMP
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index e61ed8c4794883ccc2fdd78b63faf7ca149c20de..729312a310cad4dcd204d39464e71cc09573db5a 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/rendezvous_util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 
@@ -144,7 +145,8 @@ Status ProcessFunctionLibraryRuntime::GetDeviceContext(
   }
   Device* device = flr->device();
   string device_type = device->parsed_name().type;
-  if (device_type == "CPU" || device_type == "TPU_SYSTEM") {
+  if (device_type == "CPU" || device_type == "TPU_SYSTEM" ||
+      device_type == "TPU") {
     // "TPU_SYSTEM" indicates that `device` is a CPU.
     return Status::OK();
   }
@@ -182,8 +184,8 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
     FunctionLibraryRuntime::LocalHandle local_handle) {
   mutex_lock l(mu_);
   auto h = next_handle_;
-  FunctionData* fd = new FunctionData(device_name, local_handle);
-  function_data_[h] = std::unique_ptr<FunctionData>(fd);
+  function_data_[h] = MakeUnique<FunctionData>(
+      device_name, local_handle, function_key);
   table_[function_key] = h;
   next_handle_++;
   return h;
@@ -246,8 +248,8 @@ Status ProcessFunctionLibraryRuntime::Instantiate(
         gtl::FindWithDefault(table_, function_key, kInvalidHandle);
     if (h == kInvalidHandle || function_data_.count(h) == 0) {
       h = next_handle_;
-      FunctionData* fd = new FunctionData(options.target, kInvalidHandle);
-      function_data_[h] = std::unique_ptr<FunctionData>(fd);
+      function_data_[h] = MakeUnique<FunctionData>(
+          options.target, kInvalidHandle, function_key);
       table_[function_key] = h;
       next_handle_++;
     }
@@ -262,6 +264,14 @@ Status ProcessFunctionLibraryRuntime::Instantiate(
   return Status::OK();
 }
 
+Status ProcessFunctionLibraryRuntime::RemoveHandle(
+    FunctionLibraryRuntime::Handle handle) {
+  mutex_lock l(mu_);
+  table_.erase(function_data_[handle]->function_key());
+  function_data_.erase(handle);
+  return Status::OK();
+}
+
 Status ProcessFunctionLibraryRuntime::ReleaseHandle(
     FunctionLibraryRuntime::Handle handle) {
   FunctionLibraryRuntime* flr = nullptr;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index 05e57708993a93512c359b2742863564dd76a51a..69381dd34d94ec1ca502eff5e2f166147f007df1 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -134,6 +134,9 @@ class ProcessFunctionLibraryRuntime {
   // of the device where the function is registered.
   string GetDeviceName(FunctionLibraryRuntime::Handle handle);
 
+  // Removes handle from the state owned by this object.
+  Status RemoveHandle(FunctionLibraryRuntime::Handle handle);
+
   Status Clone(Env* env, int graph_def_version,
                const OptimizerOptions& optimizer_options,
                CustomKernelCreator custom_kernel_creator,
@@ -147,10 +150,14 @@ class ProcessFunctionLibraryRuntime {
   class FunctionData {
    public:
     FunctionData(const string& target_device,
-                 FunctionLibraryRuntime::LocalHandle local_handle)
-        : target_device_(target_device), local_handle_(local_handle) {}
+                 FunctionLibraryRuntime::LocalHandle local_handle,
+                 const string& function_key)
+        : target_device_(target_device),
+          local_handle_(local_handle),
+          function_key_(function_key) {}
 
     string target_device() { return target_device_; }
+    const string& function_key() { return function_key_; }
 
     FunctionLibraryRuntime::LocalHandle local_handle() {
       mutex_lock l(mu_);
@@ -169,6 +176,7 @@ class ProcessFunctionLibraryRuntime {
 
     const string target_device_;
     FunctionLibraryRuntime::LocalHandle local_handle_ GUARDED_BY(mu_);
+    const string function_key_;
     bool init_started_ GUARDED_BY(mu_) = false;
     Status init_result_ GUARDED_BY(mu_);
     Notification init_done_;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index cc10e77ad2e9528a41db24cc86f7149eae5beb5c..cce230801183591371268bf2827d153c9c19b840 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -39,7 +39,7 @@ class TestClusterFLR : public DistributedFunctionLibraryRuntime {
   Status Instantiate(const string& function_name,
                      const FunctionLibraryDefinition& lib_def, AttrSlice attrs,
                      const FunctionLibraryRuntime::InstantiateOptions& options,
-                     FunctionLibraryRuntime::LocalHandle* handle) {
+                     FunctionLibraryRuntime::LocalHandle* handle) override {
     mutex_lock l(mu_);
     *handle = next_handle_;
     next_handle_++;
@@ -49,7 +49,7 @@ class TestClusterFLR : public DistributedFunctionLibraryRuntime {
   void Run(const FunctionLibraryRuntime::Options& opts,
            FunctionLibraryRuntime::LocalHandle handle,
            gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
-           FunctionLibraryRuntime::DoneCallback done) {}
+           FunctionLibraryRuntime::DoneCallback done) override {}
 
  private:
   mutex mu_;
@@ -119,13 +119,12 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
 
     EXPECT_GE(call_count, 1);  // Test runner is used.
 
-    // Release the handle and then try running the function.  It
-    // should still succeed.
+    // Release the handle and then try running the function. It shouldn't
+    // succeed.
     status = proc_flr_->ReleaseHandle(handle);
     if (!status.ok()) {
       return status;
     }
-
     Notification done2;
     proc_flr_->Run(opts, handle, args, &out,
                    [&status, &done2](const Status& s) {
@@ -133,7 +132,10 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
                      done2.Notify();
                    });
     done2.WaitForNotification();
-    return status;
+    EXPECT_TRUE(errors::IsNotFound(status));
+    EXPECT_TRUE(str_util::StrContains(status.error_message(), "not found."));
+
+    return Status::OK();
   }
 
   std::vector<Device*> devices_;
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index 21912236d079bdd76f1a3c1f5deb2d2cc1fc443e..a5d31b75c75b8f0608f01f07c263f6343e5735cc 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -16,8 +16,10 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/process_util.h"
 
 #ifdef INTEL_MKL
+#ifdef _OPENMP
 #include <omp.h>
-#endif
+#endif  // _OPENMP
+#endif  // INTEL_MKL
 #include <string.h>
 
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -57,7 +59,10 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
   // MKL library executes ops in parallel using OMP threads
   // Set inter_op conservatively to avoid thread oversubscription that could
   // lead to severe perf degradations and OMP resource exhaustion
-  const int mkl_intra_op = omp_get_max_threads();
+  int mkl_intra_op = 1;
+#ifdef _OPENMP
+  mkl_intra_op = omp_get_max_threads();
+#endif  // _OPENMP
   CHECK_GE(mkl_intra_op, 1);
   const int32 mkl_inter_op = std::max(
       (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
@@ -68,7 +73,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
 #else
   // Default to using the number of cores available in the process.
   return port::NumSchedulableCPUs();
-#endif
+#endif  // INTEL_MKL
 }
 
 thread::ThreadPool* NewThreadPoolFromSessionOptions(
diff --git a/tensorflow/core/common_runtime/profile_handler.h b/tensorflow/core/common_runtime/profile_handler.h
index 9d31b1aecbce210e8409db66aeb20a8e9245d9bc..391dc8c19878b7fddf22f1c0950460005033c129 100644
--- a/tensorflow/core/common_runtime/profile_handler.h
+++ b/tensorflow/core/common_runtime/profile_handler.h
@@ -29,22 +29,6 @@ class ProfileHandler {
   ProfileHandler() {}
   virtual ~ProfileHandler() {}
 
-  // Records that a miscellaneous activity occurred in the current step.
-  //
-  // Implementations of this method must be thread-safe.
-  //
-  // Args:
-  // - device: The device on which the activity occurred.
-  // - start: The time at which the activity started.
-  // - limit: The time at which the activity finished.
-  // - label: A label for the op, which may be used in visualization.
-  // - op_type: A type string for the op, which may be used in visualization.
-  // - details: A details string, which may be used in visualization.
-  // from time "start" to "limit" with "op_type" and "details".
-  virtual void RecordActivity(const string& device, Microseconds start,
-                              Microseconds limit, StringPiece label,
-                              StringPiece op_type, StringPiece details) = 0;
-
   // Records that a single Op was executed in the current step.
   //
   // Implementations of this method must be thread-safe.
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index a74c502a9265a546882ea7d185fb4bd704e6dc3a..f8428f2fde3464aa3269c0fc190990ec2ef40e3b 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -157,21 +157,27 @@ void RingReducer::Run(StatusCallback done) {
   // we're not computing in-place on the input tensor.
   if ((input_ != output_) &&
       (DMAHelper::base(input_) != DMAHelper::base(output_))) {
+    // We are running in a blockable thread and the callback can't block so
+    // just wait here on the copy.
+    Notification note;
     CollectiveRemoteAccessLocal::MemCpyAsync(
         ctx_->input_device_context(0), ctx_->op_device_context(), device_,
         device_, ctx_->input_alloc_attr(0), ctx_->output_alloc_attr(0), input_,
-        output_, [this](const Status& s) {
-          if (!s.ok()) {
-            done_(s);
-          } else {
-            ContinueAfterInputCopy();
-          }
+        output_, [this, &note, &status](const Status& s) {
+          status.Update(s);
+          note.Notify();
         });
-  } else {
-    ContinueAfterInputCopy();
+    note.WaitForNotification();
+    if (!status.ok()) {
+      done_(status);
+      return;
+    }
   }
+  ContinueAfterInputCopy();
 }
 
+// Note that this function is blocking and must not run in any thread
+// which cannot be blocked.
 void RingReducer::ContinueAfterInputCopy() {
   AllocatorAttributes attr = ctx_->output_alloc_attr(0);
   ca_.reset(MakeCollectiveAdapter(output_, group_size_ * num_subdivs_,
@@ -235,6 +241,7 @@ void RingReducer::Finish(bool ok) {
     mutex_lock l(status_mu_);
     s = status_;
   }
+  rfv_.clear();  // Give up Refs on output tensor.
   done_(s);
 }
 
@@ -252,6 +259,7 @@ RingReducer::SubContext::SubContext(OpKernelContext* ctx,
   sub_params_.input_device_contexts = &sub_input_dc_;
   sub_params_.eigen_gpu_device = nullptr;
   sub_params_.ensure_eigen_gpu_device();
+  sub_params_.forward_from_array = &forward_from_;
   sub_ctx_ = new OpKernelContext(&sub_params_, 1);
 }
 
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 06dbe049868b2f85e8ebcabe4df5cec2170486b4..fa4d1eda625fa74c59ca3597d9d4f8f6654c7cfb 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -232,13 +232,12 @@ Status ShapeRefiner::AddNode(const Node* node) {
     input_nodes[e->dst_input()] = input;
     input_shapes[e->dst_input()] = c->output(e->src_output());
 
-    // Only propagate handle data of edges which are carrying resource handles.
-    if (e->src()->output_type(e->src_output()) == DT_RESOURCE) {
-      const auto* in_v = c->output_handle_shapes_and_types(e->src_output());
-      if (in_v != nullptr) {
-        input_handle_shapes_and_types[e->dst_input()].reset(
-            new std::vector<ShapeAndType>(*in_v));
-      }
+    const auto* in_v = c->output_handle_shapes_and_types(e->src_output());
+    if (in_v != nullptr) {
+      DataType input_type = e->src()->output_type(e->src_output());
+      DCHECK(input_type == DT_RESOURCE || input_type == DT_VARIANT);
+      input_handle_shapes_and_types[e->dst_input()].reset(
+          new std::vector<ShapeAndType>(*in_v));
     }
   }
 
@@ -422,6 +421,28 @@ Status ShapeRefiner::EvaluateConstantTensorForEdge(const Node* node,
                                 kMaxTensorSize, disable_constant_propagation_);
 }
 
+Status ShapeRefiner::EvaluateConstantIntScalarEdge(const Node* node,
+                                                   int dst_idx, bool* evaluated,
+                                                   int64* result) {
+  Tensor scalar;
+  TF_RETURN_IF_ERROR(
+      EvaluateConstantTensorForEdge(node, dst_idx, evaluated, &scalar));
+  if (*evaluated) {
+    DCHECK_EQ(scalar.NumElements(), 1)
+        << "EvaluateConstantIntScalarEdge called on non-scalar edge: "
+        << scalar.NumElements();
+    if (scalar.dtype() == DT_INT32) {
+      *result = scalar.scalar<int32>()();
+    } else {
+      DCHECK_EQ(scalar.dtype(), DT_INT64)
+          << "EvaluateConstantIntScalarEdge called on non-integer edge: "
+          << scalar.dtype();
+      *result = scalar.scalar<int64>()();
+    }
+  }
+  return Status::OK();
+}
+
 Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
                                           const Node* node, int dst_idx,
                                           ShapeHandle* result) {
@@ -472,19 +493,11 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
     std::vector<DimensionHandle> dims;
     // Pack is concatenating its input scalars to form the shape tensor vector.
     for (int i = 0; i < src_context->num_inputs(); ++i) {
-      Tensor scalar;
-      bool evaluated = false;
-      TF_RETURN_IF_ERROR(EvaluateConstantTensorForEdge(input_edge->src(), i,
-                                                       &evaluated, &scalar));
+      int64 size;
+      bool evaluated;
+      TF_RETURN_IF_ERROR(EvaluateConstantIntScalarEdge(input_edge->src(), i,
+                                                       &evaluated, &size));
       if (evaluated) {
-        int64 size;
-        if (scalar.dtype() == DT_INT32) {
-          size = scalar.scalar<int32>()();
-        } else if (scalar.dtype() == DT_INT64) {
-          size = scalar.scalar<int64>()();
-        } else {
-          return errors::InvalidArgument("Pack input must be int32 or int64");
-        }
         dims.push_back(size < 0 ? target_context->UnknownDim()
                                 : target_context->MakeDim(size));
       } else {
@@ -514,6 +527,9 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
       TF_RETURN_IF_ERROR(
           target_context->Concatenate(*result, sub_result, result));
     }
+  } else if (src_op == "StridedSlice") {
+    TF_RETURN_IF_ERROR(
+        PartialStridedSliceShape(input_edge->src(), src_context, result));
   } else {
     Tensor t;
     bool evaluated = false;
@@ -525,6 +541,78 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
   return Status::OK();
 }
 
+Status ShapeRefiner::PartialStridedSliceShape(Node* slice_node,
+                                              InferenceContext* ctx,
+                                              ShapeHandle* result) {
+  // Only attempt to evaluate if begin/end/strides all are scalars.
+  for (int i = 1; i <= 3; ++i) {
+    ShapeHandle input_shape = ctx->input(i);
+    if (ctx->Value(ctx->Dim(input_shape, 0)) != 1) {
+      *result = ctx->UnknownShape();
+      return Status::OK();
+    }
+  }
+
+  int begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask;
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(slice_node->attrs(), "begin_mask", &begin_mask));
+  TF_RETURN_IF_ERROR(GetNodeAttr(slice_node->attrs(), "end_mask", &end_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(slice_node->attrs(), "ellipsis_mask", &ellipsis_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(slice_node->attrs(), "new_axis_mask", &new_axis_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(slice_node->attrs(), "shrink_axis_mask", &shrink_axis_mask));
+
+  // Only attempt to evaluate if there are no special masks set (note that we
+  // can handle begin/end_mask == 1).
+  if (!(begin_mask == 0 || begin_mask == 1) ||
+      !(end_mask == 0 || end_mask == 1) || ellipsis_mask != 0 ||
+      new_axis_mask != 0 || shrink_axis_mask != 0) {
+    *result = ctx->UnknownShape();
+    return Status::OK();
+  }
+
+  bool evaluated;
+  int64 begin;
+  if (begin_mask == 1) {
+    begin = 0;
+  } else {
+    TF_RETURN_IF_ERROR(
+        EvaluateConstantIntScalarEdge(slice_node, 1, &evaluated, &begin));
+    if (!evaluated) {
+      *result = ctx->UnknownShape();
+      return Status::OK();
+    }
+  }
+
+  int64 end;
+  if (end_mask == 1) {
+    end = std::numeric_limits<int64>::max();
+  } else {
+    TF_RETURN_IF_ERROR(
+        EvaluateConstantIntScalarEdge(slice_node, 2, &evaluated, &end));
+    if (!evaluated) {
+      *result = ctx->UnknownShape();
+      return Status::OK();
+    }
+  }
+
+  int64 stride;
+  TF_RETURN_IF_ERROR(
+      EvaluateConstantIntScalarEdge(slice_node, 3, &evaluated, &stride));
+  if (!evaluated) {
+    *result = ctx->UnknownShape();
+    return Status::OK();
+  }
+
+  // Apply stride to input interpreted as a partial shape.
+  ShapeHandle input;
+  TF_RETURN_IF_ERROR(ConstantPartialShape(ctx, slice_node, 0, &input));
+  TF_RETURN_IF_ERROR(ctx->Subshape(input, begin, end, stride, result));
+  return Status::OK();
+}
+
 Status ShapeRefiner::RunShapeFn(const Node* node,
                                 const OpRegistrationData* op_reg_data,
                                 ExtendedInferenceContext* ec) {
diff --git a/tensorflow/core/common_runtime/shape_refiner.h b/tensorflow/core/common_runtime/shape_refiner.h
index d49c4373f0b8c8721bb5f151dfe4d8774fbceb50..9c96dcbc206ae8103e0b39f9cc88dce6e6b7fec3 100644
--- a/tensorflow/core/common_runtime/shape_refiner.h
+++ b/tensorflow/core/common_runtime/shape_refiner.h
@@ -215,9 +215,18 @@ class ShapeRefiner {
                                 bool keep_nested_shapes,
                                 ExtendedInferenceContext* outer_context);
 
+  // Attempts to evaluate the 'dst_idx'-th input to 'node'. If the input edge
+  // value can be evaluated, 'evaluated' is set to true and the value returned
+  // in 'result'. Otherwise 'evaluated' is set to false.
   Status EvaluateConstantTensorForEdge(const Node* node, int dst_idx,
                                        bool* evaluated, Tensor* result);
 
+  // Wrapper around EvaluateConstantTensorForEdge for scalar int32/int64 input
+  // tensors. The caller is responsible for checking that the specified edge is
+  // scalar and int32 or int64.
+  Status EvaluateConstantIntScalarEdge(const Node* node, int dst_idx,
+                                       bool* evaluated, int64* result);
+
   // This function tries to materialize as much information about the 'node''s
   // dst_idx input as a statically computable shape, and the result may be
   // partially known, depending on what is statically inferable.
@@ -243,6 +252,11 @@ class ShapeRefiner {
                               const Node* node, int dst_idx,
                               shape_inference::ShapeHandle* result);
 
+  // Implementation of ConstantPartialShape for StridedSlice nodes.
+  Status PartialStridedSliceShape(Node* slice_node,
+                                  shape_inference::InferenceContext* ctx,
+                                  shape_inference::ShapeHandle* result);
+
   Status RunShapeFn(const Node* node, const OpRegistrationData* op_reg_data,
                     ExtendedInferenceContext* ec);
 
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index f48638afc0f602e4b0c1376f7e5732f3c637d025..8b9657eec88db68c205733d68a78b7b9eb921e4a 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -60,6 +60,39 @@ class ShapeRefinerTest : public ::testing::Test {
   }
 
   static constexpr int64 kMaxTensorSize = ShapeRefiner::kMaxTensorSize;
+
+  void TestStridedSlice(const PartialTensorShape& input_shape, int begin,
+                        int end, int stride, const char* expected,
+                        int begin_mask = 0, int end_mask = 0,
+                        int ellipsis_mask = 0) {
+    Scope root = Scope::DisabledShapeInferenceScope();
+    auto placeholder =
+        ops::Placeholder(root, DT_INT32, ops::Placeholder::Shape(input_shape));
+    auto input = ops::Shape(root, placeholder);
+    auto begin_op = ops::Const(root, {begin});
+    auto end_op = ops::Const(root, {end});
+    auto stride_op = ops::Const(root, {stride});
+    auto slice = ops::StridedSlice(root, input, begin_op, end_op, stride_op,
+                                   ops::StridedSlice::BeginMask(begin_mask)
+                                       .EndMask(end_mask)
+                                       .EllipsisMask(ellipsis_mask));
+    Node* result;
+    TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt32")
+                     .Input(slice.node())
+                     .Finalize(root.graph(), &result));
+
+    ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
+    TF_ASSERT_OK(m.AddNode(placeholder.node()));
+    TF_ASSERT_OK(m.AddNode(input.node()));
+    TF_ASSERT_OK(m.AddNode(begin_op.node()));
+    TF_ASSERT_OK(m.AddNode(end_op.node()));
+    TF_ASSERT_OK(m.AddNode(stride_op.node()));
+    TF_ASSERT_OK(m.AddNode(slice.node()));
+    TF_ASSERT_OK(m.AddNode(result));
+
+    shape_inference::InferenceContext* ctx = m.GetContext(result);
+    EXPECT_EQ(ctx->DebugString(ctx->output(0)), expected);
+  }
 };
 
 namespace {
@@ -1156,6 +1189,73 @@ TEST_F(ShapeRefinerTest, ConstantValueAsShape_ConcatInvalidDimValue) {
             m.AddNode(result).error_message());
 }
 
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_StridedSlice) {
+  TestStridedSlice(
+      /*input_shape=*/{1, -1, 3, -1, 5},
+      /*begin=*/2,
+      /*end=*/5,
+      /*stride=*/1,
+      /*expected=*/"[3,?,5]");
+}
+
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_StridedSliceNegativeStride) {
+  // clang-format off
+  TestStridedSlice(
+      /*input_shape=*/{1, -1, 3, -1, 5},
+      /*begin=*/10,
+      /*end=*/0,
+      /*stride=*/-1,
+      /*expected=*/"[5,?,3,?]");
+  // clang-format on
+}
+
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_StridedSliceMasks) {
+  TestStridedSlice(
+      /*input_shape=*/{1, -1, 3, -1, 5},
+      /*begin=*/3,
+      /*end=*/4,
+      /*stride=*/1,
+      /*expected=*/"[1,?,3,?,5]",
+      /*begin_mask=*/1,
+      /*end_mask=*/1);
+}
+
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_StridedSliceInvalidMask) {
+  TestStridedSlice(
+      /*input_shape=*/{1, -1, 3},
+      /*begin=*/2,
+      /*end=*/3,
+      /*stride=*/1,
+      /*expected=*/"[?,?,?]",
+      /*begin_mask=*/0,
+      /*end_mask=*/0,
+      /*ellipsis_mask=*/1);
+}
+
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_StridedSliceMulti) {
+  Scope root = Scope::DisabledShapeInferenceScope();
+  auto input = ops::Placeholder(root, DT_INT32);
+  auto begin = ops::Const(root, {0, 0});
+  auto end = ops::Const(root, {2, 2});
+  auto stride = ops::Const(root, {1, 1});
+  auto slice = ops::StridedSlice(root, input, begin, end, stride);
+  Node* result;
+  TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt32")
+                   .Input(slice.node())
+                   .Finalize(root.graph(), &result));
+
+  ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
+  TF_ASSERT_OK(m.AddNode(input.node()));
+  TF_ASSERT_OK(m.AddNode(begin.node()));
+  TF_ASSERT_OK(m.AddNode(end.node()));
+  TF_ASSERT_OK(m.AddNode(stride.node()));
+  TF_ASSERT_OK(m.AddNode(slice.node()));
+  TF_ASSERT_OK(m.AddNode(result));
+
+  shape_inference::InferenceContext* ctx = m.GetContext(result);
+  EXPECT_EQ(ctx->DebugString(ctx->output(0)), "?");
+}
+
 namespace {
 
 // Dummy op to test ShapeRefiner util functions
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index f7a07fe503f26b89a70843f536400684579422e0..74a87215e1917db4c149da320a4d53c6ea6a25be 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -31,7 +31,11 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 
 #ifdef INTEL_MKL
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 #include "tensorflow/core/common_runtime/mkl_cpu_allocator.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #endif
 
 namespace tensorflow {
@@ -43,7 +47,26 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
     : LocalDevice(options, Device::BuildDeviceAttributes(
                                name, DEVICE_CPU, memory_limit, locality)),
       allocator_(allocator),
-      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {}
+      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {
+#ifdef INTEL_MKL
+#ifdef _OPENMP
+  const char* user_omp_threads = getenv("OMP_NUM_THREADS");
+  if (user_omp_threads == nullptr) {
+    // OMP_NUM_THREADS controls MKL's intra-op parallelization
+    // Default to available physical cores
+    const int mkl_intra_op = port::NumSchedulableCPUs();
+    const int ht = port::NumHyperthreadsPerCore();
+    omp_set_num_threads((mkl_intra_op + ht - 1) / ht);
+  } else {
+    uint64 user_val = 0;
+    if (strings::safe_strtou64(user_omp_threads, &user_val)) {
+      // Superflous but triggers OpenMP loading
+      omp_set_num_threads(user_val);
+    }
+  }
+#endif  // _OPENMP
+#endif  // INTEL_MKL
+}
 
 ThreadPoolDevice::~ThreadPoolDevice() {}
 
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 5fab740e920519abe8ec109615b75555593ec4c8..1528c7f130657cd14c43578a548106dc855ab3fd 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -90,7 +90,6 @@ tf_cuda_library(
     deps = [
         ":debug",
         "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:device_tracer",
         "//tensorflow/core:direct_session_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 256ce527a423f39cbab77053c7fe4c59d944d609..18b7069dbe5e86660f6e709042d5fa4db656c19c 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -452,6 +452,40 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "collective_rma_distributed",
+    srcs = ["collective_rma_distributed.cc"],
+    hdrs = ["collective_rma_distributed.h"],
+    deps = [
+        ":worker_cache",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",  # protobuf::Any
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:worker_proto_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "collective_rma_distributed_test",
+    size = "small",
+    srcs = ["collective_rma_distributed_test.cc"],
+    deps = [
+        ":collective_rma_distributed",
+        ":device_resolver_distributed",
+        ":test_utils",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core:worker_proto_cc",
+    ],
+)
+
 cc_library(
     name = "collective_param_resolver_distributed",
     srcs = ["collective_param_resolver_distributed.cc"],
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
index ecf5db811073f278e15d081eb6472f7b7840b8c5..7a93b54eae386fa8c8e253b1649636acbcf6f067 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
@@ -284,7 +284,6 @@ void CollectiveParamResolverDistributed::CompleteGroupDistributed(
     const GroupRecCallback& done) {
   VLOG(1) << "CompleteGroupDistributed group_key=" << cp->group.group_key
           << " dev: " << device << " is_leader=" << (group_leader_.empty());
-  VLOG(0) << "cp: " << cp->ToString();
   if (group_leader_.empty()) {
     // This is the group leader, so resolution is local.
     return CompleteGroupLocal(device, cp, done);
diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed.cc b/tensorflow/core/distributed_runtime/collective_rma_distributed.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c15878bfd3a2ba579e18969953c4f08ad350045e
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/collective_rma_distributed.cc
@@ -0,0 +1,209 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/distributed_runtime/collective_rma_distributed.h"
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
+#include "tensorflow/core/platform/protobuf_internal.h"
+#include "tensorflow/core/protobuf/transport_options.pb.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Supports client side cancellation of WorkerInterface calls via
+// registration with a CancellationManager.
+//
+// TODO(tucker): Maybe unify this with CancellableCall in
+// collective_param_resolver_distributed.cc.
+class CancellableCall {
+ public:
+  CancellableCall(CancellationManager* cancel_mgr, const string& remote_worker,
+                  WorkerCacheInterface* wc)
+      : cancel_mgr_(cancel_mgr), remote_worker_(remote_worker), wc_(wc) {
+    wi_ = wc_->CreateWorker(remote_worker_);
+  }
+  virtual ~CancellableCall() { wc_->ReleaseWorker(remote_worker_, wi_); }
+
+  virtual void IssueCall(const StatusCallback& done) = 0;
+
+  void Start(const StatusCallback& done) {
+    CancellationToken token = cancel_mgr_->get_cancellation_token();
+    const bool not_yet_cancelled = cancel_mgr_->RegisterCallback(
+        token, [this, token]() { opts_.StartCancel(); });
+    if (not_yet_cancelled) {
+      IssueCall([this, token, done](const Status& s) {
+        cancel_mgr_->DeregisterCallback(token);
+        done(s);
+      });
+    } else {
+      done(errors::Cancelled("RPC Request was cancelled"));
+    }
+  }
+
+ protected:
+  mutable mutex mu_;
+  CancellationManager* cancel_mgr_;  // Not owned
+  const string remote_worker_;
+  WorkerCacheInterface* wc_;  // Not owned
+  WorkerInterface* wi_;       // Owned by wc_, must be released.
+  CallOptions opts_;
+};
+
+class RecvBufCall : public CancellableCall {
+ public:
+  RecvBufCall(int64 step_id, const string& peer_device, const string& peer_task,
+              const string& key, Device* to_device,
+              DeviceContext* to_device_ctx,
+              const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+              const DeviceLocality& client_locality,
+              const DeviceLocality& server_locality,
+              CancellationManager* cancel_mgr, WorkerCacheInterface* wc)
+      : CancellableCall(cancel_mgr, peer_task, wc) {
+    req_.set_step_id(step_id);
+    req_.set_buf_rendezvous_key(key);
+    *req_.mutable_client_locality() = client_locality;
+    *req_.mutable_server_locality() = server_locality;
+    req_.set_num_bytes(to_tensor->TotalBytes());
+    req_.set_buf_ptr(reinterpret_cast<int64>(DMAHelper::base(to_tensor)));
+    req_.set_src_device(peer_device);
+    req_.set_dst_device(to_device->name());
+  }
+
+  ~RecvBufCall() override {}
+
+  void IssueCall(const StatusCallback& done) override {
+    wi_->RecvBufAsync(&opts_, &req_, &resp_, done);
+  }
+
+  RecvBufRequest req_;
+  RecvBufResponse resp_;
+};
+
+}  // namespace
+
+void CollectiveRemoteAccessDistributed::RecvFromPeer(
+    const string& peer_device, const string& peer_task, bool peer_is_local,
+    const string& key, Device* to_device, DeviceContext* to_device_ctx,
+    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+    const DeviceLocality& client_locality, const StatusCallback& done) {
+  if (peer_is_local) {
+    CollectiveRemoteAccessLocal::RecvFromPeer(
+        peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+        to_alloc_attr, to_tensor, client_locality, done);
+    return;
+  }
+
+  // State that needs to be threaded through a couple of async calls
+  // in order to make this function completely non-blocking.
+  struct State {
+    DeviceLocality server_locality;
+    std::unique_ptr<RecvBufCall> call;
+  };
+  State* state = new State;
+
+  // Logic to be executed on the RecvBufferAsync callback.
+  auto recv_buf_callback = [this, state, peer_task, to_device, to_alloc_attr,
+                            to_device_ctx, to_tensor, done](const Status& s) {
+    if (s.ok()) {
+      // In this generic implementation the bytes come back in the
+      // RPC response protobuf rather than via RDMA so we need to copy
+      // them into the destination tensor here.
+      RecvBufRespExtra extra;
+      state->call->resp_.transport_options().UnpackTo(&extra);
+      int64 num_bytes = extra.tensor_content().size();
+      if (num_bytes != to_tensor->TotalBytes()) {
+        done(errors::Internal("RecvBufResponse returned ", num_bytes,
+                              " bytes where to_tensor expected ",
+                              to_tensor->TotalBytes()));
+        delete state;
+        return;
+      }
+      if (to_device->tensorflow_gpu_device_info()) {
+        // Move the bytes into a CPU tensor then use tensor-to-tensor copy.
+        // Use GPU-registered memory for the CPU tensor so the transfer
+        // goes faster.
+        Device* cpu_dev = nullptr;
+        Status status = dev_mgr_->LookupDevice("CPU:0", &cpu_dev);
+        if (!status.ok()) {
+          done(status);
+          delete state;
+          return;
+        }
+        AllocatorAttributes cpu_attr;
+        cpu_attr.set_gpu_compatible(true);
+        Tensor* cpu_tensor = new Tensor(cpu_dev->GetAllocator(cpu_attr),
+                                        to_tensor->dtype(), to_tensor->shape());
+        memcpy(DMAHelper::base(cpu_tensor), extra.tensor_content().data(),
+               num_bytes);
+        // Then copy it to the GPU.
+        CopyTensor::ViaDMA("",  // edge name (non-existent)
+                           nullptr /*send_dev_ctx*/, to_device_ctx, cpu_dev,
+                           to_device, cpu_attr, to_alloc_attr, cpu_tensor,
+                           to_tensor,
+                           [this, cpu_tensor, done](const Status& s) {
+                             delete cpu_tensor;
+                             // This callback must not block, so execute
+                             // done in another thread.
+                             SchedClosure([s, done] { done(s); });
+                           });
+        delete state;
+        return;
+      } else {
+        // CPU device
+        memcpy(DMAHelper::base(to_tensor), extra.tensor_content().data(),
+               num_bytes);
+      }
+    }
+    if (!s.ok() && errors::IsFailedPrecondition(s)) {
+      dev_resolver_->ClearTask(peer_task);
+    }
+
+    delete state;
+    done(s);
+  };
+
+  // Logic to execute once we have the device locality for the server-side
+  // device.
+  auto dev_locality_callback = [this, state, peer_device, peer_task, key,
+                                to_device, to_device_ctx, to_alloc_attr,
+                                to_tensor, client_locality,
+                                recv_buf_callback](const Status& s) {
+    if (!s.ok()) {
+      recv_buf_callback(s);
+    } else {
+      state->call.reset(new RecvBufCall(
+          step_id_, peer_device, peer_task, key, to_device, to_device_ctx,
+          to_alloc_attr, to_tensor, client_locality, state->server_locality,
+          &cancel_mgr_, worker_cache_));
+      state->call->Start(recv_buf_callback);
+    }
+  };
+
+  dev_resolver_->GetLocalityAsync(
+      peer_device, peer_task, &state->server_locality, dev_locality_callback);
+}
+
+void CollectiveRemoteAccessDistributed::StartAbort(const Status& s) {
+  CollectiveRemoteAccessLocal::StartAbort(s);
+  cancel_mgr_.StartCancel();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed.h b/tensorflow/core/distributed_runtime/collective_rma_distributed.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfa9110f473edc984f6c159140b4b61b41557b5a
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/collective_rma_distributed.h
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COLLECTIVE_RMA_DISTRIBUTED_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COLLECTIVE_RMA_DISTRIBUTED_H_
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+class WorkerCacheInterface;
+
+// Extend CollectiveRemoteAccessLocal with access to remote peers.
+class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal {
+ public:
+  CollectiveRemoteAccessDistributed(const DeviceMgr* dev_mgr,
+                                    DeviceResolverInterface* dev_resolver,
+                                    WorkerCacheInterface* worker_cache,
+                                    int64 step_id)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        worker_cache_(worker_cache) {}
+
+  ~CollectiveRemoteAccessDistributed() override {}
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override;
+
+  void StartAbort(const Status& s) override;
+
+ protected:
+  WorkerCacheInterface* worker_cache_;  // Not owned
+  CancellationManager cancel_mgr_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_COLLECTIVE_RMA_DISTRIBUTED_H_
diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a552f81f584cbc5d54ff8a45689e7cfb602d1b7d
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
@@ -0,0 +1,356 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/collective_rma_distributed.h"
+
+#include "google/protobuf/any.pb.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/test_utils.h"
+#include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/transport_options.pb.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+// The only interesting method on CollectiveRemoteAccessDistributed
+// that's not on CollectiveRemoteAccessLocal is RecvFromPeer which
+// issues a RecvBufAsync call against a WorkerInterface.  That's all
+// that's tested here.  Note that RecvFromPeer can do a
+// DeviceResolverInterface::GetDeviceLocalityAsync call in preparation
+// for the RecvBufAsync.
+
+namespace tensorflow {
+namespace {
+
+static Device* NewDevice(const string& type, const string& name) {
+  class FakeDevice : public Device {
+   public:
+    explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
+    Status Sync() override { return Status::OK(); }
+    Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; }
+  };
+  DeviceAttributes attr;
+  attr.set_name(name);
+  attr.set_device_type(type);
+  attr.mutable_locality()->set_numa_node(3);  // a non-default value
+  return new FakeDevice(attr);
+}
+
+static int64 kStepId = 123;
+
+class FakeWorker : public TestWorkerInterface {
+ public:
+  FakeWorker(const string& name, DeviceMgr* dev_mgr,
+             DeviceResolverDistributed* dres)
+      : name_(name),
+        device_mgr_(dev_mgr),
+        device_resolver_(dres),
+        buf_rendezvous_(kStepId) {}
+
+  // Direct access to a BufRendezvous that holds whatever the remote
+  // worker is supposed to have.
+  BufRendezvous* buf_rendezvous() { return &buf_rendezvous_; }
+
+  void GetStatusAsync(const GetStatusRequest* request,
+                      GetStatusResponse* response,
+                      StatusCallback done) override {
+    std::vector<DeviceAttributes> dev_attr;
+    device_mgr_->ListDeviceAttributes(&dev_attr);
+    for (const auto& da : dev_attr) {
+      *response->add_device_attributes() = da;
+    }
+    done(Status::OK());
+  }
+
+  void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                    RecvBufResponse* response, StatusCallback done) override {
+    opts->SetCancelCallback([this]() {
+      // Within this test the call is satisfied by a process-local
+      // BufRendezvous table. In real application the BufRendezvous
+      // would be on the other side of a network hop, so call
+      // BufRendezvous::StartAbort() from a separate thread to be
+      // more consistent with that situation and avoid mutex deadlock.
+      SchedClosure([this]() {
+        Env::Default()->SleepForMicroseconds(100);
+        buf_rendezvous_.StartAbort(errors::Internal("Cancelled"));
+      });
+    });
+    buf_rendezvous_.ConsumeBuf(
+        request->buf_rendezvous_key(),
+        [this, opts, request, response, done](const Status& s,
+                                              BufRendezvous::Hook* h) {
+          if (s.ok()) {
+            opts->ClearCancelCallback();
+            // Since this is not really RDMA into pre-allocated memory send the
+            // bytes in the response.
+            RecvBufRespExtra extra;
+            int64 num_bytes = h->prod_value->TotalBytes();
+            extra.set_tensor_content(string(
+                reinterpret_cast<const char*>(DMAHelper::base(h->prod_value)),
+                num_bytes));
+            response->mutable_transport_options()->PackFrom(extra);
+          }
+          done(s);
+          if (h) BufRendezvous::DoneWithHook(h);
+        });
+  }
+
+ private:
+  string name_;
+  DeviceMgr* device_mgr_;
+  DeviceResolverDistributed* device_resolver_;
+  BufRendezvous buf_rendezvous_;
+};
+
+class FakeCache : public TestWorkerCache {
+ public:
+  // Override the Locality methods to actually pass through to the
+  // worker.
+  bool GetDeviceLocalityNonBlocking(const string& device,
+                                    DeviceLocality* locality) override {
+    return false;
+  }
+
+  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
+                              StatusCallback done) override {
+    string task_name;
+    string dev_part;
+    if (!DeviceNameUtils::SplitDeviceName(device, &task_name, &dev_part)) {
+      done(errors::Internal("failed to parse device name"));
+      return;
+    }
+    auto it = workers_.find(task_name);
+    if (it == workers_.end()) {
+      done(errors::Internal("failed to find worker ", task_name));
+      return;
+    }
+    WorkerInterface* wi = it->second;
+    GetStatusRequest req;
+    GetStatusResponse resp;
+    Notification note;
+    Status status = wi->GetStatus(&req, &resp);
+    if (!status.ok()) {
+      done(status);
+      return;
+    }
+    for (const auto& it : resp.device_attributes()) {
+      if (it.name() == device) {
+        *locality = it.locality();
+        done(Status::OK());
+        return;
+      }
+    }
+    done(errors::Internal("device not found: ", device));
+  }
+};
+
+class CollRMADistTest : public ::testing::Test {
+ protected:
+  CollRMADistTest() {}
+
+  ~CollRMADistTest() override {
+    for (DeviceMgr* dm : device_mgrs_) {
+      delete dm;
+    }
+    for (auto it : dev_resolvers_) {
+      delete it.second;
+    }
+    for (FakeWorker* w : workers_) {
+      delete w;
+    }
+  }
+
+  void SetUp() override {
+    const int num_workers = 2;
+    const int num_devices = 1;
+    string device_type = "CPU";
+    ConfigProto config;
+    string dev0_worker_name;
+    for (int w = 0; w < num_workers; ++w) {
+      string name = strings::StrCat("/job:worker/replica:0/task:", w);
+      if (w == 0) {
+        dev0_worker_name = name;
+        // TODO(tucker): Change to use config when available.
+        // config.set_collective_group_leader(name);
+      }
+      DefineWorker(config, name, device_type, num_devices);
+    }
+    // All tests simulate requests from worker 0 to worker 1.
+    rma_.reset(new CollectiveRemoteAccessDistributed(
+        device_mgrs_[0], dev_resolvers_[dev0_worker_name], &wc_, kStepId));
+
+    const int kNumElts = 8;
+    expected_value_ = Tensor(DT_FLOAT, {kNumElts});
+    to_tensor_ = Tensor(DT_FLOAT, {kNumElts});
+    auto exp_alias = expected_value_.flat<float>();
+    auto to_alias = to_tensor_.flat<float>();
+    for (int i = 0; i < kNumElts; ++i) {
+      exp_alias(i) = i;
+      to_alias(i) = -1;
+    }
+  }
+
+  void DefineWorker(const ConfigProto& config, const string& worker_name,
+                    const string& device_type, int num_devices) {
+    std::vector<Device*> devices;
+    for (int i = 0; i < num_devices; ++i) {
+      devices.push_back(NewDevice(
+          device_type,
+          strings::StrCat(worker_name, "/device:", device_type, ":", i)));
+    }
+    DeviceMgr* dev_mgr = new DeviceMgr(devices);
+    device_mgrs_.push_back(dev_mgr);
+    std::vector<string>* dv = &dev_by_task_[worker_name];
+    for (auto d : devices) {
+      dv->push_back(d->name());
+    }
+    DeviceResolverDistributed* dev_res =
+        new DeviceResolverDistributed(dev_mgr, &wc_, worker_name);
+    dev_resolvers_[worker_name] = dev_res;
+    FakeWorker* fw = new FakeWorker(worker_name, dev_mgr, dev_res);
+    workers_.push_back(fw);
+    wc_.AddWorker(worker_name, fw);
+  }
+
+  void ValidateResultTensor() {
+    ASSERT_EQ(expected_value_.NumElements(), to_tensor_.NumElements());
+    for (int i = 0; i < to_tensor_.NumElements(); ++i) {
+      EXPECT_FLOAT_EQ(expected_value_.flat<float>()(i),
+                      to_tensor_.flat<float>()(i));
+    }
+  }
+
+  FakeCache wc_;
+  CancellationManager cm_;
+  std::vector<DeviceMgr*> device_mgrs_;
+  std::unordered_map<string, DeviceResolverDistributed*> dev_resolvers_;
+  std::unordered_map<string, std::vector<string>> dev_by_task_;
+  std::vector<FakeWorker*> workers_;
+  std::unique_ptr<CollectiveRemoteAccessDistributed> rma_;
+  mutex mu_;
+  int num_done_ GUARDED_BY(mu_);
+  condition_variable done_;
+  Tensor expected_value_;
+  Tensor to_tensor_;
+  CallOptions opts_;
+  DeviceLocality device_locality_;
+  AllocatorAttributes alloc_attr_;
+};
+
+TEST_F(CollRMADistTest, ProdFirstOK) {
+  Notification consumer_note;
+  Notification producer_note;
+  Status consumer_status;
+  Status producer_status;
+  FakeWorker* wi = workers_[1];
+  const string kBufKey = "fake_buf_key";
+  wi->buf_rendezvous()->ProvideBuf(
+      kBufKey, nullptr /*device*/, nullptr /*dev_ctx*/, &expected_value_,
+      AllocatorAttributes(),
+      [this, &producer_note, &producer_status](const Status& s) {
+        producer_status.Update(s);
+        producer_note.Notify();
+      });
+  Status status;
+  Device* dst_device = nullptr;
+  string dev_name = "CPU:0";
+  TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device));
+  DeviceContext* to_device_ctx = nullptr;
+  rma_->RecvFromPeer(
+      "/job:worker/replica:0/task:1/device:" + dev_name,  // peer_dev
+      "/job:worker/replica:0/task:1",                     // peer_task
+      false,                                              // peer_is_local
+      kBufKey, dst_device, to_device_ctx, alloc_attr_, &to_tensor_,
+      device_locality_,
+      [this, &consumer_status, &consumer_note](const Status& s) {
+        consumer_status = s;
+        consumer_note.Notify();
+      });
+  consumer_note.WaitForNotification();
+  TF_EXPECT_OK(consumer_status);
+  producer_note.WaitForNotification();
+  TF_EXPECT_OK(producer_status);
+  ValidateResultTensor();
+}
+
+TEST_F(CollRMADistTest, ConsFirstOK) {
+  Notification consumer_note;
+  Notification producer_note;
+  Status consumer_status;
+  Status producer_status;
+  FakeWorker* wi = workers_[1];
+  const string kBufKey = "fake_buf_key";
+  Status status;
+  Device* dst_device = nullptr;
+  string dev_name = "CPU:0";
+  TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device));
+  DeviceContext* to_device_ctx = nullptr;
+  rma_->RecvFromPeer(
+      "/job:worker/replica:0/task:1/device:" + dev_name,  // peer_dev
+      "/job:worker/replica:0/task:1",                     // peer_task
+      false,                                              // peer_is_local
+      kBufKey, dst_device, to_device_ctx, alloc_attr_, &to_tensor_,
+      device_locality_,
+      [this, &consumer_status, &consumer_note](const Status& s) {
+        consumer_status = s;
+        consumer_note.Notify();
+      });
+  wi->buf_rendezvous()->ProvideBuf(
+      kBufKey, nullptr /*device*/, nullptr /*dev_ctx*/, &expected_value_,
+      AllocatorAttributes(),
+      [this, &producer_note, &producer_status](const Status& s) {
+        producer_status.Update(s);
+        producer_note.Notify();
+      });
+  consumer_note.WaitForNotification();
+  TF_EXPECT_OK(consumer_status);
+  producer_note.WaitForNotification();
+  TF_EXPECT_OK(producer_status);
+  ValidateResultTensor();
+}
+
+TEST_F(CollRMADistTest, ConsFirstAbort) {
+  Notification consumer_note;
+  Status consumer_status;
+  const string kBufKey = "fake_buf_key";
+  Status status;
+  Device* dst_device = nullptr;
+  string dev_name = "CPU:0";
+  TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device));
+  DeviceContext* to_device_ctx = nullptr;
+  rma_->RecvFromPeer(
+      "/job:worker/replica:0/task:1/device:" + dev_name,  // peer_dev
+      "/job:worker/replica:0/task:1",                     // peer_task
+      false,                                              // peer_is_local
+      kBufKey, dst_device, to_device_ctx, alloc_attr_, &to_tensor_,
+      device_locality_,
+      [this, &consumer_status, &consumer_note](const Status& s) {
+        consumer_status = s;
+        consumer_note.Notify();
+      });
+  rma_->StartAbort(errors::Internal("Deliberate Failure"));
+  consumer_note.WaitForNotification();
+  EXPECT_EQ(consumer_status.error_message(), "Cancelled");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f3922dde74ae271465d800262ee20defa508151d
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/eager/BUILD
@@ -0,0 +1,89 @@
+package(default_visibility = [
+    "//tensorflow:internal",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+cc_library(
+    name = "remote_tensor_handle",
+    hdrs = ["remote_tensor_handle.h"],
+    deps = [
+        "//tensorflow/core:eager_service_proto_cc",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "eager_client",
+    hdrs = ["eager_client.h"],
+    deps = [
+        "//tensorflow/core:eager_service_proto_cc",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "remote_execute_node",
+    hdrs = ["remote_execute_node.h"],
+    deps = [
+        ":eager_client",
+        "//tensorflow/core:eager_service_proto_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/common_runtime/eager:eager_executor",
+    ],
+)
+
+cc_library(
+    name = "eager_service_impl",
+    srcs = ["eager_service_impl.cc"],
+    hdrs = [
+        "eager_service_impl.h",
+    ],
+    deps = [
+        "//tensorflow/c:c_api_internal",
+        "//tensorflow/c:tf_status_helper",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:eager_service_proto_cc",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/common_runtime/eager:eager_operation",
+        "//tensorflow/core/common_runtime/eager:execute",
+        "//tensorflow/core/common_runtime/eager:tensor_handle",
+        "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/distributed_runtime:worker_cache",
+        "//tensorflow/core/distributed_runtime:worker_cache_wrapper",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime/eager:remote_tensor_handle",
+        "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
+        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc_unsecure",
+    ],
+)
+
+tf_cc_test(
+    name = "eager_service_impl_test",
+    srcs = ["eager_service_impl_test.cc"],
+    deps = [
+        ":eager_service_impl",
+        "//tensorflow/c:c_api",
+        "//tensorflow/c:c_api_internal",
+        "//tensorflow/core:eager_service_proto_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/common_runtime/eager:tensor_handle",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
+    ],
+)
diff --git a/tensorflow/core/distributed_runtime/eager/eager_client.h b/tensorflow/core/distributed_runtime/eager/eager_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ba8c8d80cb0db9f2d49b4162f82160bb345b009
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/eager/eager_client.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/protobuf/eager_service.pb.h"
+
+namespace tensorflow {
+namespace eager {
+
+// This is a base class that can be implemented by a variety of
+// transports (e.g. gRPC which for each of the client methods makes an RPC).
+class EagerClient {
+ public:
+  virtual ~EagerClient() {}
+#define CLIENT_METHOD(method)                                \
+  virtual void method##Async(const method##Request* request, \
+                             method##Response* response,     \
+                             StatusCallback done) = 0;
+
+  CLIENT_METHOD(CreateContext);
+  CLIENT_METHOD(Enqueue);
+  CLIENT_METHOD(WaitQueueDone);
+  CLIENT_METHOD(KeepAlive);
+  CLIENT_METHOD(CloseContext);
+  CLIENT_METHOD(RegisterFunction);
+
+#undef CLIENT_METHOD
+};
+
+// Simple wrapper class that can be used to retrieve EagerClients.
+class EagerClientCache {
+ public:
+  virtual ~EagerClientCache() {}
+  virtual EagerClient* GetClient(const string& target) = 0;
+};
+
+}  // namespace eager
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4bd74b81a7c43296099cbc8f6d1d2690f30e801b
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
@@ -0,0 +1,253 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/eager/eager_service_impl.h"
+
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/c/tf_status_helper.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
+#include "tensorflow/core/common_runtime/eager/execute.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
+#include "tensorflow/core/distributed_runtime/worker_cache_wrapper.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+namespace eager {
+
+namespace {
+Status GetNumRetvals(tensorflow::EagerContext* context, const string& op_name,
+                     const google::protobuf::Map<string, tensorflow::AttrValue>& attrs,
+                     int* num_retvals) {
+  const tensorflow::OpRegistrationData* op_reg_data = nullptr;
+  auto status = tensorflow::OpRegistry::Global()->LookUp(op_name, &op_reg_data);
+  if (errors::IsNotFound(status)) {
+    status = context->FindFunctionOpData(op_name, &op_reg_data);
+  }
+  TF_RETURN_IF_ERROR(status);
+
+  const tensorflow::OpDef& op_def = op_reg_data->op_def;
+
+  for (const auto& output_arg : op_def.output_arg()) {
+    if (!output_arg.number_attr().empty()) {
+      auto iter = attrs.find(output_arg.number_attr());
+      if (iter == attrs.end()) {
+        return errors::InvalidArgument("Unable to find number_attr ",
+                                       output_arg.number_attr(),
+                                       " for Op: ", op_name);
+      }
+      *num_retvals += iter->second.i();
+    } else if (!output_arg.type_list_attr().empty()) {
+      auto iter = attrs.find(output_arg.number_attr());
+      if (iter == attrs.end()) {
+        return errors::InvalidArgument("Unable to find number_attr ",
+                                       output_arg.number_attr(),
+                                       " for Op: ", op_name);
+      }
+      *num_retvals += iter->second.list().type_size();
+    } else {
+      *num_retvals += 1;
+    }
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+Status EagerServiceImpl::CreateContext(const CreateContextRequest* request,
+                                       CreateContextResponse* response) {
+  tensorflow::RemoteRendezvous* r = env_->rendezvous_mgr->Find(0);
+  std::vector<tensorflow::Device*> devices;
+  TF_RETURN_IF_ERROR(tensorflow::DeviceFactory::AddDevices(
+      // TODO(nareshmodi): Correctly set the SessionOptions.
+      SessionOptions(),
+      strings::Printf("/job:%s/replica:0/task:%d",
+                      request->server_def().job_name().data(),
+                      request->server_def().task_index()),
+      &devices));
+
+  response->mutable_device_attributes()->Reserve(devices.size());
+  for (auto& d : devices) {
+    *response->add_device_attributes() = d->attributes();
+  }
+
+  std::unique_ptr<tensorflow::DeviceMgr> device_mgr(
+      new tensorflow::DeviceMgr(devices));
+  std::unique_ptr<tensorflow::EagerContext> ctx(new tensorflow::EagerContext(
+      SessionOptions(),
+      tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT,
+      request->async(), std::move(device_mgr), r));
+
+  uint64 context_id;
+  {
+    mutex_lock l(contexts_mu_);
+    do {
+      context_id = random::New64();
+    } while (contexts_.find(context_id) != contexts_.end());
+    contexts_.emplace(context_id, new ServerContext(std::move(ctx)));
+  }
+  response->set_context_id(context_id);
+
+  return Status::OK();
+}
+
+Status EagerServiceImpl::ExecuteOp(const Operation& operation,
+                                   ServerContext* server_context) {
+  std::unique_ptr<tensorflow::EagerOperation> op;
+  const char* name = operation.name().c_str();  // Shorthand
+  const tensorflow::AttrTypeMap* types;
+  auto status = tensorflow::AttrTypeMapForOp(name, &types);
+  if (status.ok()) {
+    op.reset(
+        new tensorflow::EagerOperation(server_context->Context(), name, types));
+  } else if (errors::IsNotFound(status)) {
+    if (server_context->Context()->FindFunctionByName(name)) {
+      op.reset(new tensorflow::EagerOperation(server_context->Context(), name,
+                                              nullptr));
+    } else {
+      return status;
+    }
+  } else {
+    return status;
+  }
+
+  TF_RETURN_IF_ERROR(op->SetDevice(operation.device().c_str()));
+
+  for (const auto& remote_handle : operation.inputs()) {
+    tensorflow::TensorHandle* handle;
+    TF_RETURN_IF_ERROR(server_context->GetTensorHandle(
+        RemoteTensorHandleInternal(remote_handle), &handle));
+
+    op->AddInput(handle);
+  }
+
+  for (const auto& attr : operation.attrs()) {
+    op->MutableAttrs()->Set(attr.first, attr.second);
+  }
+
+  int num_retvals = 0;
+  // TODO(nareshmodi): Consider caching this.
+  TF_RETURN_IF_ERROR(GetNumRetvals(server_context->Context(), operation.name(),
+                                   operation.attrs(), &num_retvals));
+
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> retvals;
+  TF_RETURN_IF_ERROR(EagerExecute(op.get(), &retvals, &num_retvals));
+
+  server_context->AddOperationOutputs(retvals, operation.id());
+
+  return Status::OK();
+}
+
+Status EagerServiceImpl::Enqueue(const EnqueueRequest* request,
+                                 EnqueueResponse* response) {
+  ServerContext* context = nullptr;
+  TF_RETURN_IF_ERROR(GetServerContext(request->context_id(), &context));
+  core::ScopedUnref context_unref(context);
+
+  for (const auto& item : request->queue()) {
+    if (item.has_operation()) {
+      TF_RETURN_IF_ERROR(ExecuteOp(item.operation(), context));
+    } else {
+      TF_RETURN_IF_ERROR(context->DeleteTensorHandle(
+          RemoteTensorHandleInternal(item.handle_to_decref())));
+    }
+  }
+
+  return Status::OK();
+}
+
+Status EagerServiceImpl::WaitQueueDone(const WaitQueueDoneRequest* request,
+                                       WaitQueueDoneResponse* response) {
+  ServerContext* context = nullptr;
+  TF_RETURN_IF_ERROR(GetServerContext(request->context_id(), &context));
+  core::ScopedUnref context_unref(context);
+
+  if (request->op_id_size() > 0) {
+    return errors::Unimplemented(
+        "EagerServiceImpl::WaitQueueDone is not "
+        "implemented for particular op IDs.");
+  }
+  return context->Context()->AsyncWait();
+}
+
+Status EagerServiceImpl::KeepAlive(const KeepAliveRequest* request,
+                                   KeepAliveResponse* response) {
+  // TODO(nareshmodi): Automated context_id cleaning is not implemented
+  return errors::Unimplemented(
+      "EagerServiceImpl::KeepAlive is not implemented.");
+}
+
+Status EagerServiceImpl::CloseContext(const CloseContextRequest* request,
+                                      CloseContextResponse* response) {
+  ServerContext* context = nullptr;
+  if (!GetServerContext(request->context_id(), &context).ok()) {
+    // Swallow the error here.
+    return Status::OK();
+  }
+
+  core::ScopedUnref context_unref(context);
+
+  mutex_lock l(contexts_mu_);
+  contexts_.erase(request->context_id());
+
+  // GetServerContext returns a newly Reffed copy of ServerContext, which is
+  // unreffed by context_unref. Additionally, we need to unref it one time since
+  // we are releasing it from the map.
+  context->Unref();
+
+  return Status::OK();
+}
+
+Status EagerServiceImpl::RegisterFunction(
+    const RegisterFunctionRequest* request,
+    RegisterFunctionResponse* response) {
+  ServerContext* context = nullptr;
+  TF_RETURN_IF_ERROR(GetServerContext(request->context_id(), &context));
+  core::ScopedUnref context_unref(context);
+
+  return context->Context()->AddFunctionDef(request->function_def());
+}
+
+tensorflow::Status EagerServiceImpl::GetServerContext(
+    uint64 context_id, ServerContext** server_context) {
+  mutex_lock l(contexts_mu_);
+  auto iter = contexts_.find(context_id);
+  if (iter == contexts_.end()) {
+    *server_context = nullptr;
+    return errors::InvalidArgument(strings::Printf(
+        "Unable to find a context_id matching the specified one "
+        "(%lld). Perhaps the worker was restarted?",
+        context_id));
+  }
+
+  *server_context = iter->second;
+  (*server_context)->Ref();
+  return Status::OK();
+}
+
+}  // namespace eager
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebd5269a57aa727f62ccc75ac107ddc92a5d9b7a
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
@@ -0,0 +1,150 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_SERVICE_IMPL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_SERVICE_IMPL_H_
+
+#include <unordered_map>
+
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/protobuf/eager_service.pb.h"
+
+namespace tensorflow {
+namespace eager {
+
+// A TensorFlow Eager Worker runs ops and supports worker to worker
+// Tensor transfer.
+//
+// See eager_service.proto for more details about each method.
+// This class can be wrapped by specific classes that implement rpc transports
+// over this (e.g. gRPC).
+class EagerServiceImpl {
+ public:
+  explicit EagerServiceImpl(const WorkerEnv* env) : env_(env) {}
+  virtual ~EagerServiceImpl() {
+    for (auto& entry : contexts_) {
+      entry.second->Unref();
+    }
+  }
+
+  Status CreateContext(const CreateContextRequest* request,
+                       CreateContextResponse* response);
+
+  Status Enqueue(const EnqueueRequest* request, EnqueueResponse* response);
+
+  Status WaitQueueDone(const WaitQueueDoneRequest* request,
+                       WaitQueueDoneResponse* response);
+
+  Status KeepAlive(const KeepAliveRequest* request,
+                   KeepAliveResponse* response);
+
+  Status CloseContext(const CloseContextRequest* request,
+                      CloseContextResponse* response);
+
+  Status RegisterFunction(const RegisterFunctionRequest* request,
+                          RegisterFunctionResponse* response);
+
+ protected:
+  // This is the server-side execution context. All state regarding execution of
+  // a client's ops is held in this server-side context (all generated tensors,
+  // and the EagerContext).
+  class ServerContext : public core::RefCounted {
+   public:
+    explicit ServerContext(std::unique_ptr<tensorflow::EagerContext> ctx)
+        : ctx_(std::move(ctx)) {}
+    ~ServerContext() {
+      for (const auto& entry : tensors_) {
+        entry.second->Unref();
+      }
+    }
+
+    tensorflow::EagerContext* Context() const { return ctx_.get(); }
+
+    void AddOperationOutputs(
+        const gtl::ArraySlice<tensorflow::TensorHandle*>& handles,
+        int64 operation_id) {
+      mutex_lock l(tensors_mu_);
+      for (int i = 0; i < handles.size(); i++) {
+        // TODO(nareshmodi): Correctly handle operation_id not being unique.
+        tensors_.emplace(RemoteTensorHandleInternal(operation_id, i),
+                         handles[i]);
+      }
+    }
+
+    Status GetTensorHandle(const RemoteTensorHandleInternal& remote_handle,
+                           tensorflow::TensorHandle** handle) {
+      mutex_lock l(tensors_mu_);
+      auto iter = tensors_.find(remote_handle);
+      if (iter == tensors_.end()) {
+        return errors::InvalidArgument(
+            "Unable to find the relevant tensor remote_handle: Op ID: ",
+            remote_handle.op_id, ", Output num: ", remote_handle.output_num);
+      }
+
+      *handle = iter->second;
+
+      return Status::OK();
+    }
+
+    Status DeleteTensorHandle(const RemoteTensorHandleInternal& remote_handle) {
+      mutex_lock l(tensors_mu_);
+      auto iter = tensors_.find(remote_handle);
+      if (iter == tensors_.end()) {
+        return errors::InvalidArgument(
+            "Unable to find the relevant tensor remote_handle: Op ID: ",
+            remote_handle.op_id, ", Output num: ", remote_handle.output_num);
+      }
+
+      iter->second->Unref();
+      tensors_.erase(iter);
+
+      return Status::OK();
+    }
+
+   private:
+    using RemoteTensorHandleMap =
+        gtl::FlatMap<RemoteTensorHandleInternal, tensorflow::TensorHandle*,
+                     RemoteTensorHandleInternalHash,
+                     RemoteTensorHandleInternalEquals>;
+
+    // The context for this execution.
+    std::unique_ptr<tensorflow::EagerContext> ctx_;
+
+    mutex tensors_mu_;
+    RemoteTensorHandleMap tensors_ GUARDED_BY(tensors_mu_);
+  };
+  // The returned ServerContext will need to be Unrefed.
+  tensorflow::Status GetServerContext(uint64, ServerContext**);
+
+ private:
+  Status ExecuteOp(const Operation& operation, ServerContext* server_context);
+  const WorkerEnv* const env_;  // Not owned.
+
+  mutex contexts_mu_;
+  std::unordered_map<uint64, ServerContext*> contexts_ GUARDED_BY(contexts_mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(EagerServiceImpl);
+};
+
+}  // namespace eager
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_SERVICE_IMPL_H_
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f865ebe1be9c076d717720857f65ab9928836051
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
@@ -0,0 +1,268 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/eager/eager_service_impl.h"
+
+#include <string.h>
+
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/protobuf/eager_service.pb.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+
+namespace tensorflow {
+namespace eager {
+namespace {
+
+class TestEagerServiceImpl : public EagerServiceImpl {
+ public:
+  explicit TestEagerServiceImpl(const WorkerEnv* env) : EagerServiceImpl(env) {}
+  Status GetTensorHandle(const uint64 context_id,
+                         const RemoteTensorHandleInternal& remote_handle,
+                         tensorflow::TensorHandle** handle) {
+    ServerContext* context = nullptr;
+    TF_RETURN_IF_ERROR(GetServerContext(context_id, &context));
+    core::ScopedUnref context_unref(context);
+
+    return context->GetTensorHandle(remote_handle, handle);
+  }
+};
+
+void SetTensorProto(AttrValue* val) {
+  int64_t dims[] = {2, 2};
+  float data[] = {1.0f, 2.0f, 3.0f, 4.0f};
+  TF_Tensor* t = TF_AllocateTensor(
+      TF_FLOAT, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data));
+  memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t));
+  tensorflow::Tensor tensor;
+  TF_ASSERT_OK(tensorflow::TF_TensorToTensor(t, &tensor));
+  tensor.AsProtoTensorContent(val->mutable_tensor());
+  TF_DeleteTensor(t);
+}
+
+void AddOperationToEnqueueRequest(
+    int64 id, const string& name,
+    const std::vector<std::pair<int64, int32>>& inputs,
+    const std::unordered_map<string, AttrValue>& attrs, const string& device,
+    EnqueueRequest* request) {
+  auto* operation = request->add_queue()->mutable_operation();
+
+  operation->set_id(id);
+  operation->set_name(name);
+  operation->set_device(device);
+
+  for (const auto& tensor_handle_pair : inputs) {
+    auto* input = operation->add_inputs();
+    input->set_op_id(tensor_handle_pair.first);
+    input->set_output_num(tensor_handle_pair.second);
+  }
+
+  for (const auto& attr_entry : attrs) {
+    (*operation->mutable_attrs())[attr_entry.first] = attr_entry.second;
+  }
+}
+
+tensorflow::FunctionDef MatMulFunction() {
+  tensorflow::FunctionDef def;
+  CHECK(tensorflow::protobuf::TextFormat::ParseFromString(
+      "    signature {"
+      "      name: 'MatMulFunction'"
+      "      input_arg {"
+      "        name: 'a'"
+      "        type: DT_FLOAT"
+      "      }"
+      "      output_arg {"
+      "        name: 'm'"
+      "        type: DT_FLOAT"
+      "      }"
+      "    }"
+      "    node_def {"
+      "      name: 'matmul'"
+      "      op: 'MatMul'"
+      "      input: 'a'"
+      "      input: 'a'"
+      "      attr {"
+      "        key: 'T'"
+      "        value {"
+      "          type: DT_FLOAT"
+      "        }"
+      "      }"
+      "    }"
+      "    ret {"
+      "      key: 'm'"
+      "      value: 'matmul:product'"
+      "    }",
+      &def));
+  return def;
+}
+
+// Test creates a context and attempts to execute some ops.
+TEST(EagerServiceImplTest, BasicTest) {
+  WorkerEnv worker_env;
+  worker_env.env = Env::Default();
+  tensorflow::RpcRendezvousMgr rm(&worker_env);
+  worker_env.rendezvous_mgr = &rm;
+
+  TestEagerServiceImpl eager_service_impl(&worker_env);
+
+  CreateContextRequest request;
+  request.mutable_server_def()->set_job_name("localhost");
+  request.mutable_server_def()->set_task_index(0);
+  CreateContextResponse response;
+
+  TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response));
+
+  uint64 context_id = response.context_id();
+
+  EnqueueRequest remote_enqueue_request;
+  remote_enqueue_request.set_context_id(context_id);
+  EnqueueResponse remote_enqueue_response;
+
+  std::unordered_map<string, AttrValue> const_attrs;
+  AttrValue val;
+  val.set_type(tensorflow::DataType::DT_FLOAT);
+  const_attrs.insert({"dtype", val});
+  val.Clear();
+  SetTensorProto(&val);
+  const_attrs.insert({"value", val});
+
+  AddOperationToEnqueueRequest(1, "Const", {}, const_attrs,
+                               "/job:localhost/replica:0/task:0/device:CPU:0",
+                               &remote_enqueue_request);
+
+  std::unordered_map<string, AttrValue> attrs;
+  val.Clear();
+  val.set_type(tensorflow::DataType::DT_FLOAT);
+  attrs.insert({"T", val});
+  val.Clear();
+  val.set_b(false);
+  attrs.insert({"transpose_a", val});
+  attrs.insert({"transpose_b", val});
+
+  AddOperationToEnqueueRequest(2, "MatMul", {{1, 0}, {1, 0}}, attrs,
+                               "/job:localhost/replica:0/task:0/device:CPU:0",
+                               &remote_enqueue_request);
+
+  TF_ASSERT_OK(eager_service_impl.Enqueue(&remote_enqueue_request,
+                                          &remote_enqueue_response));
+
+  tensorflow::TensorHandle* tensor_handle;
+  TF_ASSERT_OK(eager_service_impl.GetTensorHandle(
+      response.context_id(), RemoteTensorHandleInternal(2, 0), &tensor_handle));
+
+  // This should be OK to do since we've placed all computation on the CPU
+  // device.
+  const tensorflow::Tensor* t = nullptr;
+  TF_ASSERT_OK(tensor_handle->Tensor(&t));
+
+  auto actual = t->flat<float>();
+
+  EXPECT_EQ(4, actual.size());
+
+  EXPECT_EQ(7, actual(0));
+  EXPECT_EQ(10, actual(1));
+  EXPECT_EQ(15, actual(2));
+  EXPECT_EQ(22, actual(3));
+
+  CloseContextRequest close_context_request;
+  close_context_request.set_context_id(context_id);
+  CloseContextResponse close_context_response;
+  TF_ASSERT_OK(eager_service_impl.CloseContext(&close_context_request,
+                                               &close_context_response));
+}
+
+// Test creates a context and attempts to execute a function.
+TEST(EagerServiceImplTest, BasicFunctionTest) {
+  WorkerEnv worker_env;
+  worker_env.env = Env::Default();
+  tensorflow::RpcRendezvousMgr rm(&worker_env);
+  worker_env.rendezvous_mgr = &rm;
+
+  TestEagerServiceImpl eager_service_impl(&worker_env);
+
+  CreateContextRequest request;
+  request.mutable_server_def()->set_job_name("localhost");
+  request.mutable_server_def()->set_task_index(0);
+  CreateContextResponse response;
+
+  TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response));
+
+  uint64 context_id = response.context_id();
+
+  RegisterFunctionRequest register_function_request;
+  register_function_request.set_context_id(context_id);
+  *register_function_request.mutable_function_def() = MatMulFunction();
+  RegisterFunctionResponse register_function_response;
+
+  TF_ASSERT_OK(eager_service_impl.RegisterFunction(
+      &register_function_request, &register_function_response));
+
+  EnqueueRequest remote_enqueue_request;
+  remote_enqueue_request.set_context_id(context_id);
+  EnqueueResponse remote_enqueue_response;
+
+  std::unordered_map<string, AttrValue> const_attrs;
+  AttrValue val;
+  val.set_type(tensorflow::DataType::DT_FLOAT);
+  const_attrs.insert({"dtype", val});
+  val.Clear();
+
+  SetTensorProto(&val);
+  const_attrs.insert({"value", val});
+
+  AddOperationToEnqueueRequest(1, "Const", {}, const_attrs,
+                               "/job:localhost/replica:0/task:0/device:CPU:0",
+                               &remote_enqueue_request);
+  AddOperationToEnqueueRequest(
+      2, "MatMulFunction", {{1, 0}}, std::unordered_map<string, AttrValue>(),
+      "/job:localhost/replica:0/task:0/device:CPU:0", &remote_enqueue_request);
+
+  TF_ASSERT_OK(eager_service_impl.Enqueue(&remote_enqueue_request,
+                                          &remote_enqueue_response));
+
+  const tensorflow::Tensor* t = nullptr;
+  tensorflow::TensorHandle* tensor_handle;
+  TF_ASSERT_OK(eager_service_impl.GetTensorHandle(
+      response.context_id(), RemoteTensorHandleInternal(2, 0), &tensor_handle));
+  TF_ASSERT_OK(tensor_handle->Tensor(&t));
+
+  auto actual = t->flat<float>();
+  EXPECT_EQ(4, actual.size());
+
+  EXPECT_EQ(7, actual(0));
+  EXPECT_EQ(10, actual(1));
+  EXPECT_EQ(15, actual(2));
+  EXPECT_EQ(22, actual(3));
+
+  CloseContextRequest close_context_request;
+  close_context_request.set_context_id(context_id);
+  CloseContextResponse close_context_response;
+  TF_ASSERT_OK(eager_service_impl.CloseContext(&close_context_request,
+                                               &close_context_response));
+}
+
+}  // namespace
+}  // namespace eager
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4bd67aaedbec0af4f6c675a4e5427bb2d0e751f
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_EXECUTE_NODE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_EXECUTE_NODE_H_
+
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/distributed_runtime/eager/eager_client.h"
+#include "tensorflow/core/protobuf/eager_service.pb.h"
+
+namespace tensorflow {
+namespace eager {
+
+// EnqueueNode is an implementation of EagerNode which enqueues an operation
+// via RPC in a remote EagerService.
+class RemoteExecuteNode : public tensorflow::EagerNode {
+ public:
+  RemoteExecuteNode(tensorflow::uint64 id,
+                    const tensorflow::eager::EnqueueRequest& request,
+                    tensorflow::eager::EagerClient* eager_client)
+      : tensorflow::EagerNode(id),
+        request_(std::move(request)),
+        eager_client_(eager_client) {}
+
+  tensorflow::Status Run() override {
+    tensorflow::eager::EnqueueResponse response;
+    tensorflow::Status status;
+    Notification n;
+    eager_client_->EnqueueAsync(&request_, &response,
+                                [&n, &status](const tensorflow::Status& s) {
+                                  status.Update(s);
+                                  n.Notify();
+                                });
+    n.WaitForNotification();
+
+    return status;
+  }
+
+ private:
+  EnqueueRequest request_;
+  tensorflow::eager::EagerClient*
+      eager_client_;  // Not owned, and must outlive the RemoteExecuteNode.
+};
+
+}  // namespace eager
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_EXECUTE_NODE_H_
diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..25ec062c03553501e9368e1651d9e45c586d419e
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_TENSOR_HANDLE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_TENSOR_HANDLE_H_
+
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/protobuf/eager_service.pb.h"
+
+namespace tensorflow {
+namespace eager {
+
+struct RemoteTensorHandleInternal {
+  explicit RemoteTensorHandleInternal(const RemoteTensorHandle& tensor_handle)
+      : op_id(tensor_handle.op_id()), output_num(tensor_handle.output_num()) {}
+  RemoteTensorHandleInternal(int64 op_id, int32 output_num)
+      : op_id(op_id), output_num(output_num) {}
+  int64 op_id;
+  int32 output_num;
+};
+
+struct RemoteTensorHandleInternalHash {
+  std::size_t operator()(const RemoteTensorHandleInternal& handle) const {
+    return FingerprintCat64(handle.op_id, handle.output_num);
+  }
+};
+
+struct RemoteTensorHandleInternalEquals {
+  bool operator()(const RemoteTensorHandleInternal& first,
+                  const RemoteTensorHandleInternal& second) const {
+    return first.op_id == second.op_id && first.output_num == second.output_num;
+  }
+};
+
+}  // namespace eager
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_REMOTE_TENSOR_HANDLE_H_
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index e60386fd34acc1d94565014b5e3ca920fe336ea3..4f9d84d158f81d179f29dec2784409cec0f55beb 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -269,7 +269,8 @@ class DeviceFinder {
     mutex_lock l(mu_);
     seen_targets_[target_index] = true;
     if (!s.ok()) {
-      LOG(ERROR) << "Master init: " << s;
+      LOG(ERROR) << "CreateSession failed because worker "
+                 << targets_[target_index] << " returned error: " << s;
       status_.Update(s);
     } else {
       found_.insert(found_.end(), devices->begin(), devices->end());
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 08fbe8b144f76d2bf421cbe32d1ef11336bdd730..bd70eca3f6528e24e25988a7799ad2962073c8ba 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -119,7 +119,11 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
     // it on/off and don't make use of the responses.
     for (auto& p : partitions_) {
       LoggingRequest* req = new LoggingRequest;
-      req->set_rpc_logging(active);
+      if (active) {
+        req->set_enable_rpc_logging(true);
+      } else {
+        req->set_disable_rpc_logging(true);
+      }
       LoggingResponse* resp = new LoggingResponse;
       Ref();
       p.worker->LoggingAsync(req, resp, [this, req, resp](const Status& s) {
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index c2719f54622b255ce12c13e9831b3fa0faac7a39..40028ee241b6e78634c63de04508ba4dca947b7a 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -171,6 +171,7 @@ tf_cuda_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:graph_mgr",
         "//tensorflow/core/distributed_runtime:recent_request_ids",
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/BUILD b/tensorflow/core/distributed_runtime/rpc/eager/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1a3bd9d6bf0e4240b0a9326e27b45c37ac25c723
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/eager/BUILD
@@ -0,0 +1,67 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+package(default_visibility = [
+    "//tensorflow:internal",
+])
+
+cc_library(
+    name = "grpc_eager_service",
+    srcs = ["grpc_eager_service.cc"],
+    hdrs = ["grpc_eager_service.h"],
+    deps = [
+        "//tensorflow/core:eager_service_proto_cc",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+cc_library(
+    name = "grpc_eager_client",
+    srcs = ["grpc_eager_client.cc"],
+    hdrs = ["grpc_eager_client.h"],
+    deps = [
+        "//tensorflow/core:eager_service_proto_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime/eager:eager_client",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_channel",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_client_cq_tag",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_state",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_service",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+cc_library(
+    name = "grpc_eager_service_impl",
+    srcs = ["grpc_eager_service_impl.cc"],
+    hdrs = ["grpc_eager_service_impl.h"],
+    deps = [
+        ":grpc_eager_service",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ptr_util",
+        "//tensorflow/core/distributed_runtime/eager:eager_service_impl",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_call",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_channel",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+cc_library(
+    name = "eager_grpc_server_lib",
+    hdrs = ["eager_grpc_server_lib.h"],
+    deps = [
+        ":grpc_eager_service_impl",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core/distributed_runtime:rendezvous_mgr_interface",
+        "//tensorflow/core/distributed_runtime:worker_cache_wrapper",
+        "//tensorflow/core/distributed_runtime/eager:eager_service_impl",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
+    ],
+)
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5dc4c831d04a00115eb011b6b0eb458ff16416c
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h
@@ -0,0 +1,97 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_EAGER_GRPC_SERVER_LIB_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_EAGER_GRPC_SERVER_LIB_H_
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/distributed_runtime/eager/eager_service_impl.h"
+#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
+#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
+#include "tensorflow/core/distributed_runtime/worker_cache_wrapper.h"
+
+namespace tensorflow {
+namespace eager {
+
+class EagerGrpcServer : public GrpcServer {
+ public:
+  static Status Create(const ServerDef& server_def,
+                       std::unique_ptr<EagerGrpcServer>* server) {
+    std::unique_ptr<EagerGrpcServer> ret(new EagerGrpcServer(server_def));
+
+    TF_RETURN_IF_ERROR(ret->InitEager());
+
+    *server = std::move(ret);
+
+    return Status::OK();
+  }
+
+  Status Start() override {
+    TF_RETURN_IF_ERROR(GrpcServer::Start());
+
+    eager_service_->Start();
+
+    return Status::OK();
+  }
+
+  Status Stop() override {
+    TF_RETURN_IF_ERROR(GrpcServer::Stop());
+
+    eager_service_->Stop();
+
+    return Status::OK();
+  }
+
+  using GrpcServer::channel_cache;
+  using GrpcServer::master_env;
+  using GrpcServer::worker_env;
+
+ private:
+  EagerGrpcServer(const ServerDef& server_def)
+      : GrpcServer(server_def, Env::Default()),
+        worker_name_(
+            strings::StrCat("/job:", server_def.job_name(),
+                            "/replica:0/task:", server_def.task_index())) {}
+
+  Status InitEager() {
+    TF_RETURN_IF_ERROR(this->Init(
+        [this](const WorkerEnv* worker_env,
+               ::grpc::ServerBuilder* server_builder) {
+          this->eager_service_.reset(
+              new eager::GrpcEagerServiceImpl(worker_env, server_builder));
+        },
+        nullptr));
+
+    worker_session_ = WorkerSession::CreateWithBorrowedDeviceMgr(
+        "", worker_name_,
+        std::unique_ptr<WorkerCacheInterface>(
+            new WorkerCacheWrapper(master_env()->worker_cache)),
+        worker_env()->device_mgr, {});
+
+    auto* r = worker_env()->rendezvous_mgr->Find(0);
+    return r->Initialize(worker_session_.get());
+  }
+
+  std::unique_ptr<GrpcEagerServiceImpl> eager_service_;
+  std::shared_ptr<WorkerSession> worker_session_;
+  const string worker_name_;
+};  // namespace eager
+
+}  // namespace eager
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_EAGER_GRPC_SERVER_LIB_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4786c43ee2c5118858bbedbb65d0f0a7e9b2bdc9
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
@@ -0,0 +1,142 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h"
+
+#include "grpc++/generic/generic_stub.h"
+#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_state.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/protobuf/eager_service.pb.h"
+
+namespace tensorflow {
+namespace eager {
+namespace {
+class GrpcEagerClient : public EagerClient {
+ public:
+  GrpcEagerClient(const tensorflow::SharedGrpcChannelPtr& channel,
+                  ::grpc::CompletionQueue* cq)
+      : stub_(channel), cq_(cq) {}
+  ~GrpcEagerClient() override {}
+
+#define CLIENT_METHOD(method)                                             \
+  void method##Async(const method##Request* request,                      \
+                     method##Response* response, StatusCallback done)     \
+      override {                                                          \
+    new RPCState<protobuf::Message>(                                      \
+        &stub_, cq_, "/tensorflow.eager.EagerService/" #method, *request, \
+        response, std::move(done), nullptr);                              \
+  }
+
+  CLIENT_METHOD(CreateContext);
+  CLIENT_METHOD(Enqueue);
+  CLIENT_METHOD(WaitQueueDone);
+  CLIENT_METHOD(KeepAlive);
+  CLIENT_METHOD(CloseContext);
+  CLIENT_METHOD(RegisterFunction);
+
+#undef CLIENT_METHOD
+
+ private:
+  ::grpc::GenericStub stub_;
+  ::grpc::CompletionQueue* cq_;
+};
+
+class GrpcEagerClientCache : public EagerClientCache {
+ public:
+  explicit GrpcEagerClientCache(
+      std::shared_ptr<tensorflow::GrpcChannelCache> cache)
+      : next_round_robin_assignment_(0), cache_(cache), threads_(4) {}
+
+  ~GrpcEagerClientCache() override { threads_.clear(); }
+
+  EagerClient* GetClient(const string& target) override {
+    auto it = clients_.find(target);
+    if (it == clients_.end()) {
+      tensorflow::SharedGrpcChannelPtr shared =
+          cache_->FindWorkerChannel(target);
+      auto worker = std::unique_ptr<EagerClient>(new GrpcEagerClient(
+          shared, threads_[AssignClientToThread(target)].completion_queue()));
+
+      it = clients_.emplace(target, std::move(worker)).first;
+    }
+
+    return it->second.get();
+  }
+
+ private:
+  mutex assignment_mu_;
+  std::unordered_map<std::string, size_t> target_assignments_
+      GUARDED_BY(assignment_mu_);
+  size_t next_round_robin_assignment_ GUARDED_BY(assignment_mu_);
+
+  size_t AssignClientToThread(const string& target) {
+    // Round-robin target assignment, but keeps the same target on the same
+    // polling thread always, as this is important for gRPC performace
+    mutex_lock lock(assignment_mu_);
+    auto it = target_assignments_.find(target);
+    if (it == target_assignments_.end()) {
+      it = target_assignments_
+               .insert(std::make_pair(
+                   target, (next_round_robin_assignment_++) % threads_.size()))
+               .first;
+    }
+    return it->second;
+  }
+
+  class GrpcEagerClientThread {
+   public:
+    GrpcEagerClientThread() {
+      thread_.reset(Env::Default()->StartThread(
+          ThreadOptions(), "eager_client_thread", [this]() {
+            void* tag;
+            bool ok;
+            while (completion_queue_.Next(&tag, &ok)) {
+              GrpcClientCQTag* callback_tag =
+                  static_cast<GrpcClientCQTag*>(tag);
+              callback_tag->OnCompleted(ok);
+            }
+          }));
+    }
+
+    ~GrpcEagerClientThread() {
+      completion_queue_.Shutdown();
+      thread_.reset();
+    }
+
+    ::grpc::CompletionQueue* completion_queue() { return &completion_queue_; }
+
+   private:
+    ::grpc::CompletionQueue completion_queue_;
+    std::unique_ptr<Thread> thread_;
+  };  // GrpcEagerClientThread
+
+  std::shared_ptr<tensorflow::GrpcChannelCache> cache_;
+  std::unordered_map<string, std::unique_ptr<EagerClient>> clients_;
+  std::vector<GrpcEagerClientThread> threads_;
+};
+
+}  // namespace
+
+EagerClientCache* NewGrpcEagerClientCache(
+    std::shared_ptr<tensorflow::GrpcChannelCache> channel) {
+  return new GrpcEagerClientCache(channel);
+}
+
+}  // namespace eager
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a926da488477b3c88b11b822ef6d332457e3cc0
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_CLIENT_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_CLIENT_H_
+
+#include "tensorflow/core/distributed_runtime/eager/eager_client.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
+
+namespace tensorflow {
+namespace eager {
+// The GrpcChannelCache is not owned.
+EagerClientCache* NewGrpcEagerClientCache(
+    std::shared_ptr<tensorflow::GrpcChannelCache> channel);
+}  // namespace eager
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_CLIENT_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3fd7deaa868a9d0541bef31bd64eee4dc34acc96
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc
@@ -0,0 +1,123 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h"
+
+#include "grpc++/impl/codegen/async_stream.h"
+#include "grpc++/impl/codegen/async_unary_call.h"
+#include "grpc++/impl/codegen/channel_interface.h"
+#include "grpc++/impl/codegen/client_unary_call.h"
+#include "grpc++/impl/codegen/method_handler_impl.h"
+#include "grpc++/impl/codegen/rpc_service_method.h"
+#include "grpc++/impl/codegen/service_type.h"
+#include "grpc++/impl/codegen/sync_stream.h"
+
+namespace tensorflow {
+namespace eager {
+
+namespace grpc {
+
+static const char* grpcEagerService_method_names[] = {
+    "/tensorflow.eager.EagerService/CreateContext",
+    "/tensorflow.eager.EagerService/Enqueue",
+    "/tensorflow.eager.EagerService/WaitQueueDone",
+    "/tensorflow.eager.EagerService/KeepAlive",
+    "/tensorflow.eager.EagerService/CloseContext",
+    "/tensorflow.eager.EagerService/RegisterFunction",
+};
+
+std::unique_ptr<EagerService::Stub> EagerService::NewStub(
+    const std::shared_ptr< ::grpc::ChannelInterface>& channel,
+    const ::grpc::StubOptions& options) {
+  std::unique_ptr<EagerService::Stub> stub(new EagerService::Stub(channel));
+  return stub;
+}
+
+EagerService::Stub::Stub(
+    const std::shared_ptr< ::grpc::ChannelInterface>& channel)
+    : channel_(channel),
+      rpcmethod_CreateContext_(grpcEagerService_method_names[0],
+                               ::grpc::internal::RpcMethod::NORMAL_RPC,
+                               channel),
+      rpcmethod_Enqueue_(grpcEagerService_method_names[1],
+                         ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_WaitQueueDone_(grpcEagerService_method_names[2],
+                               ::grpc::internal::RpcMethod::NORMAL_RPC,
+                               channel),
+      rpcmethod_KeepAlive_(grpcEagerService_method_names[3],
+                           ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_CloseContext_(grpcEagerService_method_names[4],
+                              ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_RegisterFunction_(grpcEagerService_method_names[5],
+                                  ::grpc::internal::RpcMethod::NORMAL_RPC,
+                                  channel) {}
+
+::grpc::Status EagerService::Stub::CreateContext(
+    ::grpc::ClientContext* context, const CreateContextRequest& request,
+    CreateContextResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_CreateContext_, context, request, response);
+}
+
+::grpc::Status EagerService::Stub::Enqueue(::grpc::ClientContext* context,
+                                           const EnqueueRequest& request,
+                                           EnqueueResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(channel_.get(), rpcmethod_Enqueue_,
+                                             context, request, response);
+}
+
+::grpc::Status EagerService::Stub::WaitQueueDone(
+    ::grpc::ClientContext* context, const WaitQueueDoneRequest& request,
+    WaitQueueDoneResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_WaitQueueDone_, context, request, response);
+}
+
+::grpc::Status EagerService::Stub::KeepAlive(::grpc::ClientContext* context,
+                                             const KeepAliveRequest& request,
+                                             KeepAliveResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_KeepAlive_, context, request, response);
+}
+
+::grpc::Status EagerService::Stub::CloseContext(
+    ::grpc::ClientContext* context, const CloseContextRequest& request,
+    CloseContextResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_CloseContext_, context, request, response);
+}
+
+::grpc::Status EagerService::Stub::RegisterFunction(
+    ::grpc::ClientContext* context, const RegisterFunctionRequest& request,
+    RegisterFunctionResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_RegisterFunction_, context, request, response);
+}
+
+EagerService::AsyncService::AsyncService() {
+  for (int i = 0; i < 6; ++i) {
+    AddMethod(new ::grpc::internal::RpcServiceMethod(
+        grpcEagerService_method_names[i],
+        ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
+    ::grpc::Service::MarkMethodAsync(i);
+  }
+}
+
+EagerService::AsyncService::~AsyncService() {}
+
+}  // namespace grpc
+
+}  // namespace eager
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7b192ac857a423af81c544bb57c672ae82129d9
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
@@ -0,0 +1,168 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_H_
+
+#include "grpc++/impl/codegen/async_stream.h"
+#include "grpc++/impl/codegen/async_unary_call.h"
+#include "grpc++/impl/codegen/proto_utils.h"
+#include "grpc++/impl/codegen/rpc_method.h"
+#include "grpc++/impl/codegen/service_type.h"
+#include "grpc++/impl/codegen/status.h"
+#include "grpc++/impl/codegen/stub_options.h"
+#include "grpc++/impl/codegen/sync_stream.h"
+
+#include "tensorflow/core/protobuf/eager_service.pb.h"
+
+namespace grpc {
+class CompletionQueue;
+class Channel;
+class RpcService;
+class ServerCompletionQueue;
+class ServerContext;
+}  // namespace grpc
+
+namespace tensorflow {
+namespace eager {
+
+namespace grpc {
+
+// GRPC stubs of `tensorflow.eager.EagerService`, based on the
+// definition in "//tensorflow/core/protobuf/eager_service.proto",
+// and the gRPC generated stub and service classes.
+// See that file for the definition of methods and messages.
+// Similar to the Master/Worker tensorflow GRPC services, this is not gen'ned
+// via a rule, but included as an implementation directly.
+class EagerService final {
+ public:
+  class StubInterface {
+   public:
+    virtual ~StubInterface() {}
+    virtual ::grpc::Status CreateContext(::grpc::ClientContext* context,
+                                         const CreateContextRequest& request,
+                                         CreateContextResponse* response) = 0;
+    virtual ::grpc::Status Enqueue(::grpc::ClientContext* context,
+                                   const EnqueueRequest& request,
+                                   EnqueueResponse* response) = 0;
+    virtual ::grpc::Status WaitQueueDone(::grpc::ClientContext* context,
+                                         const WaitQueueDoneRequest& request,
+                                         WaitQueueDoneResponse* response) = 0;
+    virtual ::grpc::Status KeepAlive(::grpc::ClientContext* context,
+                                     const KeepAliveRequest& request,
+                                     KeepAliveResponse* response) = 0;
+    virtual ::grpc::Status CloseContext(::grpc::ClientContext* context,
+                                        const CloseContextRequest& request,
+                                        CloseContextResponse* response) = 0;
+    virtual ::grpc::Status RegisterFunction(
+        ::grpc::ClientContext* context, const RegisterFunctionRequest& request,
+        RegisterFunctionResponse* response) = 0;
+  };
+  class Stub final : public StubInterface {
+   public:
+    Stub(const std::shared_ptr< ::grpc::ChannelInterface>& channel);
+    ::grpc::Status CreateContext(::grpc::ClientContext* context,
+                                 const CreateContextRequest& request,
+                                 CreateContextResponse* response) override;
+    ::grpc::Status Enqueue(::grpc::ClientContext* context,
+                           const EnqueueRequest& request,
+                           EnqueueResponse* response) override;
+    ::grpc::Status WaitQueueDone(::grpc::ClientContext* context,
+                                 const WaitQueueDoneRequest& request,
+                                 WaitQueueDoneResponse* response) override;
+    ::grpc::Status KeepAlive(::grpc::ClientContext* context,
+                             const KeepAliveRequest& request,
+                             KeepAliveResponse* response) override;
+    ::grpc::Status CloseContext(::grpc::ClientContext* context,
+                                const CloseContextRequest& request,
+                                CloseContextResponse* response) override;
+    ::grpc::Status RegisterFunction(
+        ::grpc::ClientContext* context, const RegisterFunctionRequest& request,
+        RegisterFunctionResponse* response) override;
+
+   private:
+    std::shared_ptr< ::grpc::ChannelInterface> channel_;
+    const ::grpc::internal::RpcMethod rpcmethod_CreateContext_;
+    const ::grpc::internal::RpcMethod rpcmethod_Enqueue_;
+    const ::grpc::internal::RpcMethod rpcmethod_WaitQueueDone_;
+    const ::grpc::internal::RpcMethod rpcmethod_KeepAlive_;
+    const ::grpc::internal::RpcMethod rpcmethod_CloseContext_;
+    const ::grpc::internal::RpcMethod rpcmethod_RegisterFunction_;
+  };
+  static std::unique_ptr<Stub> NewStub(
+      const std::shared_ptr< ::grpc::ChannelInterface>& channel,
+      const ::grpc::StubOptions& options = ::grpc::StubOptions());
+
+  class AsyncService : public ::grpc::Service {
+   public:
+    AsyncService();
+    virtual ~AsyncService();
+    void RequestCreateContext(
+        ::grpc::ServerContext* context, CreateContextRequest* request,
+        ::grpc::ServerAsyncResponseWriter<CreateContextResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(0, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestEnqueue(
+        ::grpc::ServerContext* context, EnqueueRequest* request,
+        ::grpc::ServerAsyncResponseWriter<EnqueueResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(1, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestWaitQueueDone(
+        ::grpc::ServerContext* context, WaitQueueDoneRequest* request,
+        ::grpc::ServerAsyncResponseWriter<WaitQueueDoneResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(2, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestKeepAlive(
+        ::grpc::ServerContext* context, KeepAliveRequest* request,
+        ::grpc::ServerAsyncResponseWriter<KeepAliveResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(3, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestCloseContext(
+        ::grpc::ServerContext* context, CloseContextRequest* request,
+        ::grpc::ServerAsyncResponseWriter<CloseContextResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(4, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestRegisterFunction(
+        ::grpc::ServerContext* context, RegisterFunctionRequest* request,
+        ::grpc::ServerAsyncResponseWriter<RegisterFunctionResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(5, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+  };
+};
+
+}  // namespace grpc
+
+}  // namespace eager
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b36c6dce868e40bab04baf6d501b87d31f2d0630
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
@@ -0,0 +1,91 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h"
+
+#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace eager {
+
+GrpcEagerServiceImpl::GrpcEagerServiceImpl(
+    const WorkerEnv* env, ::grpc::ServerBuilder* server_builder)
+    : local_impl_(env) {
+  request_handler_threadpool_ =
+      MakeUnique<thread::ThreadPool>(env->env, "EagerServiceRequestHandler", 4);
+  server_builder->RegisterService(&service_);
+  cq_ = server_builder->AddCompletionQueue();
+}
+
+void GrpcEagerServiceImpl::DriveCQ() {
+#define ENQUEUE_REQUEST(method)                                                \
+  do {                                                                         \
+    Call<GrpcEagerServiceImpl,                                                 \
+         tensorflow::eager::grpc::EagerService::AsyncService, method##Request, \
+         method##Response>::                                                   \
+        EnqueueRequest(&service_, cq_.get(),                                   \
+                       &grpc::EagerService::AsyncService::Request##method,     \
+                       &GrpcEagerServiceImpl::method##Handler, false);         \
+  } while (0)
+  ENQUEUE_REQUEST(CreateContext);
+  ENQUEUE_REQUEST(Enqueue);
+  ENQUEUE_REQUEST(WaitQueueDone);
+  ENQUEUE_REQUEST(KeepAlive);
+  ENQUEUE_REQUEST(CloseContext);
+  ENQUEUE_REQUEST(RegisterFunction);
+#undef ENQUEUE_REQUEST
+
+  void* tag;  // Matches the operation started against this cq_.
+  bool ok;
+
+  while (true) {
+    if (!cq_->Next(&tag, &ok)) {
+      // The queue is shutting down.
+      break;
+    }
+    UntypedCall<GrpcEagerServiceImpl>::Tag* callback_tag =
+        static_cast<UntypedCall<GrpcEagerServiceImpl>::Tag*>(tag);
+
+    if (callback_tag) {
+      callback_tag->OnCompleted(this, ok);
+    } else {
+      cq_->Shutdown();
+      break;
+    }
+  }
+}
+
+void GrpcEagerServiceImpl::Start() {
+  // TODO(nareshmodi) separate thread for driving CQ
+  request_handler_threadpool_->Schedule([this]() { DriveCQ(); });
+}
+
+void GrpcEagerServiceImpl::Stop() {
+  // This enqueues a special event (with a null tag)
+  // that causes the completion queue to be shut down on the
+  // polling thread.
+  shutdown_alarm_ = MakeUnique<::grpc::Alarm>(
+      cq_.get(), gpr_now(GPR_CLOCK_MONOTONIC), nullptr);
+}
+
+}  // namespace eager
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..65550caf646283b2a116373f7c54c28efdea6150
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_IMPL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_IMPL_H_
+
+#include "grpc++/alarm.h"
+#include "grpc++/completion_queue.h"
+#include "grpc++/server_builder.h"
+#include "tensorflow/core/distributed_runtime/eager/eager_service_impl.h"
+#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+
+namespace tensorflow {
+namespace eager {
+
+// This class is a wrapper that handles communication for gRPC.
+class GrpcEagerServiceImpl {
+ public:
+  template <class RequestMessage, class ResponseMessage>
+  using EagerCall = Call<GrpcEagerServiceImpl, grpc::EagerService::AsyncService,
+                         RequestMessage, ResponseMessage>;
+
+  GrpcEagerServiceImpl(const WorkerEnv* env,
+                       ::grpc::ServerBuilder* server_builder);
+  virtual ~GrpcEagerServiceImpl() {}
+
+  void Start();
+  void Stop();
+
+ private:
+#define HANDLER(method)                                                        \
+  void method##Handler(EagerCall<method##Request, method##Response>* call) {   \
+    request_handler_threadpool_->Schedule([this, call]() {                     \
+      call->SendResponse(                                                      \
+          ToGrpcStatus(local_impl_.method(&call->request, &call->response)));  \
+    });                                                                        \
+    Call<GrpcEagerServiceImpl,                                                 \
+         tensorflow::eager::grpc::EagerService::AsyncService, method##Request, \
+         method##Response>::                                                   \
+        EnqueueRequest(&service_, cq_.get(),                                   \
+                       &grpc::EagerService::AsyncService::Request##method,     \
+                       &GrpcEagerServiceImpl::method##Handler, false);         \
+  }
+  HANDLER(CreateContext);
+  HANDLER(Enqueue);
+  HANDLER(WaitQueueDone);
+  HANDLER(KeepAlive);
+  HANDLER(CloseContext);
+  HANDLER(RegisterFunction);
+#undef HANDLER
+
+  EagerServiceImpl local_impl_;
+
+  void DriveCQ();
+
+  std::unique_ptr<::grpc::Alarm> shutdown_alarm_;
+
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
+  tensorflow::eager::grpc::EagerService::AsyncService service_;
+
+  std::unique_ptr<thread::ThreadPool> request_handler_threadpool_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GrpcEagerServiceImpl);
+};
+
+}  // namespace eager
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_IMPL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
index c832adbbbf8eba1ec512d62470025fb56a39b8a4..85adfd2c762db0ba451d0dd0e7160d16e80054fd 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
@@ -147,7 +147,9 @@ MasterService::Stub::Stub(
 }
 
 MasterService::AsyncService::AsyncService() {
-  for (int i = 0; i < 10; ++i) {
+  int method_len = sizeof(grpcMasterService_method_names) / 
+                    sizeof(grpcMasterService_method_names[0]);
+  for (int i = 0; i < method_len; ++i) {
     AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcMasterService_method_names[i],
         ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 5b7b74ce636dcba0d70f60a74d2f78d554f2422f..1acf1fb4fc1ea9b4f516f5bba0e8a2b1d4f72e25 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -54,6 +54,7 @@ class GrpcRemoteWorker : public WorkerInterface {
         cleanupgraph_(Method(GrpcWorkerMethod::kCleanupGraph)),
         cleanupall_(Method(GrpcWorkerMethod::kCleanupAll)),
         recvtensor_(Method(GrpcWorkerMethod::kRecvTensor)),
+        recvbuf_(Method(GrpcWorkerMethod::kRecvBuf)),
         logging_(Method(GrpcWorkerMethod::kLogging)),
         tracing_(Method(GrpcWorkerMethod::kTracing)),
         completegroup_(Method(GrpcWorkerMethod::kCompleteGroup)),
@@ -118,6 +119,11 @@ class GrpcRemoteWorker : public WorkerInterface {
     IssueRequest(request, response, cleanupall_, std::move(done));
   }
 
+  void RecvBufAsync(CallOptions* call_opts, const RecvBufRequest* request,
+                    RecvBufResponse* response, StatusCallback done) override {
+    IssueRequest(request, response, recvbuf_, std::move(done), call_opts);
+  }
+
   void CompleteGroupAsync(CallOptions* call_opts,
                           const CompleteGroupRequest* request,
                           CompleteGroupResponse* response,
@@ -239,6 +245,7 @@ class GrpcRemoteWorker : public WorkerInterface {
   const ::grpc::string cleanupgraph_;
   const ::grpc::string cleanupall_;
   const ::grpc::string recvtensor_;
+  const ::grpc::string recvbuf_;
   const ::grpc::string logging_;
   const ::grpc::string tracing_;
   const ::grpc::string completegroup_;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index 99b6bda6b145fa89cd00bd1e1e9cca1b34b5b3a6..e5ffb4ed2fdfdd84a2566643983b2da783df385b 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -296,19 +296,19 @@ Status GrpcServer::WorkerCacheFactory(const WorkerCacheFactoryOptions& options,
   GrpcChannelSpec channel_spec;
   TF_RETURN_IF_ERROR(ParseChannelSpec(options, &channel_spec));
 
-  std::shared_ptr<GrpcChannelCache> channel_cache(
+  channel_cache_.reset(
       NewGrpcChannelCache(channel_spec, GetChannelCreationFunction()));
 
   string name_prefix = strings::StrCat("/job:", *options.job_name, "/replica:0",
                                        "/task:", options.task_index);
 
-  const string host_port = channel_cache->TranslateTask(name_prefix);
+  const string host_port = channel_cache_->TranslateTask(name_prefix);
   int requested_port;
 
   if (!strings::safe_strto32(str_util::Split(host_port, ':')[1],
                              &requested_port)) {
     return errors::Internal("Could not parse port for local server from \"",
-                            channel_cache->TranslateTask(name_prefix), "\".");
+                            channel_cache_->TranslateTask(name_prefix), "\".");
   }
   if (requested_port != bound_port_) {
     return errors::InvalidArgument("Requested port ", requested_port,
@@ -316,7 +316,7 @@ Status GrpcServer::WorkerCacheFactory(const WorkerCacheFactoryOptions& options,
   }
 
   *worker_cache = NewGrpcWorkerCacheWithLocalWorker(
-      channel_cache, worker_impl_.get(), name_prefix);
+      channel_cache_, worker_impl_.get(), name_prefix);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index 7c2f06f618a85c901ce7a7902cb8b1bc4e57be40..0122df178ad65023b9a0b2e2c4c0fd17f9922e1e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -104,6 +104,9 @@ class GrpcServer : public ServerInterface {
   int bound_port() const { return bound_port_; }
 
   WorkerEnv* worker_env() { return &worker_env_; }
+  MasterEnv* master_env() { return &master_env_; }
+
+  std::shared_ptr<GrpcChannelCache> channel_cache() { return channel_cache_; }
 
   const ServerDef& server_def() const { return server_def_; }
 
@@ -135,6 +138,7 @@ class GrpcServer : public ServerInterface {
   std::unique_ptr<Master> master_impl_;
   AsyncServiceInterface* master_service_ = nullptr;
   std::unique_ptr<Thread> master_thread_ GUARDED_BY(mu_);
+  std::shared_ptr<GrpcChannelCache> channel_cache_;
 
   // Implementation of a TensorFlow worker, and RPC polling thread.
   WorkerEnv worker_env_;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index 26fad1fc3c92d50ed936b44a57e6876192e78c1e..2e7b11196383e80005a774320a90cb6ae05bab51 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "grpc++/alarm.h"
 #include "grpc++/server_builder.h"
 
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
@@ -37,10 +38,12 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_session.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/protobuf/transport_options.pb.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
 namespace tensorflow {
@@ -159,6 +162,9 @@ class GrpcWorkerService : public AsyncServiceInterface {
       for (int i = 0; i < 1000; ++i) {
         EnqueueRecvTensorRequestRaw();
       }
+      for (int i = 0; i < 500; ++i) {
+        ENQUEUE_REQUEST(RecvBuf, true);
+      }
       for (int i = 0; i < 100; ++i) {
         ENQUEUE_REQUEST(RunGraph, true);
       }
@@ -170,9 +176,9 @@ class GrpcWorkerService : public AsyncServiceInterface {
       ENQUEUE_REQUEST(Tracing, false);
 
       for (int i = 0; i < 10; ++i) {
-        ENQUEUE_REQUEST(CompleteGroup, false);
-        ENQUEUE_REQUEST(CompleteInstance, false);
-        ENQUEUE_REQUEST(GetStepSequence, false);
+        ENQUEUE_REQUEST(CompleteGroup, true);
+        ENQUEUE_REQUEST(CompleteInstance, true);
+        ENQUEUE_REQUEST(GetStepSequence, true);
       }
 
       void* tag;
@@ -322,6 +328,20 @@ class GrpcWorkerService : public AsyncServiceInterface {
       ENQUEUE_REQUEST(Tracing, false);
     }
 
+    void RecvBufHandler(WorkerCall<RecvBufRequest, RecvBufResponse>* call) {
+      Schedule([this, call]() {
+        CallOptions* call_opts = new CallOptions;
+        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+        worker_->RecvBufAsync(call_opts, &call->request, &call->response,
+                              [call, call_opts](const Status& s) {
+                                call->ClearCancelCallback();
+                                delete call_opts;
+                                call->SendResponse(ToGrpcStatus(s));
+                              });
+      });
+      ENQUEUE_REQUEST(RecvBuf, true);
+    }
+
     void CompleteGroupHandler(
         WorkerCall<CompleteGroupRequest, CompleteGroupResponse>* call) {
       Schedule([this, call]() {
@@ -334,7 +354,7 @@ class GrpcWorkerService : public AsyncServiceInterface {
                                       call->SendResponse(ToGrpcStatus(s));
                                     });
       });
-      ENQUEUE_REQUEST(CompleteGroup, false);
+      ENQUEUE_REQUEST(CompleteGroup, true);
     }
 
     void CompleteInstanceHandler(
@@ -360,7 +380,7 @@ class GrpcWorkerService : public AsyncServiceInterface {
             &call->request, &call->response,
             [call](const Status& s) { call->SendResponse(ToGrpcStatus(s)); });
       });
-      ENQUEUE_REQUEST(GetStepSequence, false);
+      ENQUEUE_REQUEST(GetStepSequence, true);
     }
 #undef ENQUEUE_REQUEST
 
@@ -485,13 +505,90 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
       });
 }
 
+void GrpcWorker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                              RecvBufResponse* response, StatusCallback done) {
+  // This is a generic, low performance implementation appropriate for grpc.
+  CollectiveExecutor::Handle ce_handle(
+      env_->collective_executor_mgr->FindOrCreate(request->step_id()), true);
+  CollectiveRemoteAccess* rma = ce_handle.get()->remote_access();
+  rma->buf_rendezvous()->ConsumeBuf(
+      request->buf_rendezvous_key(),
+      [this, opts, request, response, done](const Status& status,
+                                            BufRendezvous::Hook* hook) {
+        Status s = status;
+        if (s.ok()) {
+          if (!DMAHelper::CanUseDMA(hook->prod_value)) {
+            s = errors::Internal("Tensor value for key ",
+                                 request->buf_rendezvous_key(),
+                                 " is not of a type supported by RecvBuf");
+          }
+        }
+        if (s.ok()) {
+          // The RPC source tensor needs to be in CPU RAM.  If not already
+          // there make a copy using memory appropriate to the purpose.
+          const size_t num_bytes = hook->prod_value->TotalBytes();
+          const bool on_host =
+              hook->prod_dev->attributes().device_type() == "CPU" ||
+              hook->prod_attr.on_host();
+          if ((!on_host) && (num_bytes > 0)) {
+            Device* cpu_dev = nullptr;
+            s = env_->device_mgr->LookupDevice("CPU:0", &cpu_dev);
+            if (s.ok()) {
+              AllocatorAttributes cpu_attr;
+              cpu_attr.set_gpu_compatible(true);
+              cpu_attr.set_nic_compatible(true);
+              Tensor* cpu_tensor = new Tensor(cpu_dev->GetAllocator(cpu_attr),
+                                              hook->prod_value->dtype(),
+                                              hook->prod_value->shape());
+              hook->prod_ctx->CopyDeviceTensorToCPU(
+                  hook->prod_value, "empty_name", hook->prod_dev, cpu_tensor,
+                  [this, num_bytes, response, done, hook,
+                   cpu_tensor](const Status& s) {
+                    if (s.ok()) {
+                      RecvBufRespExtra extra;
+                      extra.set_tensor_content(reinterpret_cast<const char*>(
+                                                   DMAHelper::base(cpu_tensor)),
+                                               num_bytes);
+                      response->mutable_transport_options()->PackFrom(extra);
+                    }
+                    response->set_send_start_micros(env_->env->NowMicros());
+                    done(s);
+                    BufRendezvous::DoneWithHook(hook);
+                    delete cpu_tensor;
+                  });
+              return;
+            }
+          } else {
+            // Tensor is on CPU.
+            RecvBufRespExtra extra;
+            extra.set_tensor_content(reinterpret_cast<const char*>(
+                                         DMAHelper::base(hook->prod_value)),
+                                     num_bytes);
+            response->mutable_transport_options()->PackFrom(extra);
+          }
+        }
+        response->set_send_start_micros(env_->env->NowMicros());
+        done(s);
+        BufRendezvous::DoneWithHook(hook);
+      });
+}
+
 void GrpcWorker::LoggingAsync(const LoggingRequest* request,
                               LoggingResponse* response, StatusCallback done) {
   auto env = this->env();
   if (env) {
-    auto session_mgr = (SessionMgr*)env->session_mgr;
+    auto session_mgr = env->session_mgr;
     if (session_mgr) {
-      session_mgr->SetLogging(request->rpc_logging());
+      if (request->enable_rpc_logging()) {
+        session_mgr->SetLogging(true);
+      }
+      // NOTE(mrry): Handle old masters that disable RPC logging by setting
+      // `request->enable_rpc_logging` to `false`.
+      if (request->disable_rpc_logging() ||
+          (!request->enable_rpc_logging() &&
+           request->fetch_step_id_size() == 0)) {
+        session_mgr->SetLogging(false);
+      }
       for (const auto& step_id : request->fetch_step_id()) {
         session_mgr->RetrieveLogs(step_id, response);
       }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index fbddbda9e6f9e5561d4db0e035a48ed8db0d8559..c0ed0884bc5cfdc968d1d2f1fb87c589f8455a24 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -43,6 +43,9 @@ class GrpcWorker : public Worker {
   virtual void LoggingAsync(const LoggingRequest* request,
                             LoggingResponse* response, StatusCallback done);
 
+  virtual void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                            RecvBufResponse* response, StatusCallback done);
+
   WorkerEnv* env();
 
  private:
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
index a91cc0692af71b52270e62deba869f8eb112bb0e..38cc2b81d30e5dd70b16692995eb35c045c8229e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
@@ -46,6 +46,8 @@ const char* GrpcWorkerMethodName(GrpcWorkerMethod id) {
       return "/tensorflow.WorkerService/CleanupAll";
     case GrpcWorkerMethod::kRecvTensor:
       return "/tensorflow.WorkerService/RecvTensor";
+    case GrpcWorkerMethod::kRecvBuf:
+      return "/tensorflow.WorkerService/RecvBuf";
     case GrpcWorkerMethod::kLogging:
       return "/tensorflow.WorkerService/Logging";
     case GrpcWorkerMethod::kTracing:
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index c5104c6a50182a7c9ca370f42fc9948eb5101f7e..da270835bd1ab82fd79787378168bc36d4dc9da2 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -81,6 +81,7 @@ enum class GrpcWorkerMethod {
   kCleanupGraph,
   kCleanupAll,
   kRecvTensor,
+  kRecvBuf,
   kLogging,
   kTracing,
   kCompleteGroup,
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 7ef4206c780d138943851de7c66d55adbe392384..95b31c6991f6344c1b15b1fd28225aef37359818 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -67,7 +67,7 @@ Status SessionMgr::CreateSession(const string& session,
     worker_name = WorkerNameFromServerDef(server_def);
   }
 
-  if (worker_cache != nullptr & default_worker_cache_.get() != nullptr) {
+  if (worker_cache != nullptr && default_worker_cache_ != nullptr) {
     worker_cache->SetLogging(this->is_logging_active_);
   }
 
diff --git a/tensorflow/core/distributed_runtime/test_utils.h b/tensorflow/core/distributed_runtime/test_utils.h
index 0ed078241f3a58c33b63e84ba017107e8507bd30..48d83845dd3b0e332a39464258f0f782d666423f 100644
--- a/tensorflow/core/distributed_runtime/test_utils.h
+++ b/tensorflow/core/distributed_runtime/test_utils.h
@@ -93,6 +93,11 @@ class TestWorkerInterface : public WorkerInterface {
     done(errors::Unimplemented("RunGraphAsync"));
   }
 
+  void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                    RecvBufResponse* response, StatusCallback done) override {
+    done(errors::Unimplemented("RecvBufAsync"));
+  }
+
   void CompleteGroupAsync(CallOptions* opts,
                           const CompleteGroupRequest* request,
                           CompleteGroupResponse* response,
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index d682ac8f34cd3e72c46d1a1cbac2f17c892879cf..4e6500fbc6baff76228b5b2e8f4445ad970acf54 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -337,6 +337,15 @@ void Worker::TracingAsync(const TracingRequest* request,
   done(errors::Unimplemented("Tracing"));
 }
 
+void Worker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                          RecvBufResponse* response, StatusCallback done) {
+  // The base Worker class does not implement RecvBufAsync because
+  // it is not currently used for worker-to-worker communication. Use a
+  // transport-specific implementation (such as `GrpcWorker::RecvBufAsync()`)
+  // instead.
+  done(errors::Unimplemented("Worker::RecvBufAsync()"));
+}
+
 void Worker::CompleteGroupAsync(CallOptions* opts,
                                 const CompleteGroupRequest* request,
                                 CompleteGroupResponse* response,
diff --git a/tensorflow/core/distributed_runtime/worker.h b/tensorflow/core/distributed_runtime/worker.h
index b5a9ada502b201ce35dba9682699fe42fdae138c..91eb27c10ea140b607b634c7d525bac96217ddbb 100644
--- a/tensorflow/core/distributed_runtime/worker.h
+++ b/tensorflow/core/distributed_runtime/worker.h
@@ -90,6 +90,9 @@ class Worker : public WorkerInterface {
   void TracingAsync(const TracingRequest* request, TracingResponse* response,
                     StatusCallback done) override;
 
+  void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                    RecvBufResponse* response, StatusCallback done) override;
+
   void CompleteGroupAsync(CallOptions* opts,
                           const CompleteGroupRequest* request,
                           CompleteGroupResponse* response,
diff --git a/tensorflow/core/distributed_runtime/worker_cache_partial.cc b/tensorflow/core/distributed_runtime/worker_cache_partial.cc
index 61e54162343fb46f1953939b0100dbeb05afa97a..55b6957b96210d3697c732b65cd9c973b778d84b 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_partial.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_partial.cc
@@ -67,7 +67,7 @@ Status WorkerCachePartial::RefreshDeviceStatus(const string& device_name) {
   };
   std::unique_ptr<WorkerInterface, decltype(deleter)> rwi(CreateWorker(task),
                                                           deleter);
-  if (s.ok() && !rwi.get()) {
+  if (s.ok() && !rwi) {
     s = errors::Internal("RefreshDeviceStatus, unknown worker task: ", task);
   }
 
diff --git a/tensorflow/core/distributed_runtime/worker_interface.h b/tensorflow/core/distributed_runtime/worker_interface.h
index bad31d27b231db9954eb3394a50fbbc2b76a6867..a50ac3b8ae51e90c119793bc9392301335ffe3b5 100644
--- a/tensorflow/core/distributed_runtime/worker_interface.h
+++ b/tensorflow/core/distributed_runtime/worker_interface.h
@@ -112,6 +112,9 @@ class WorkerInterface {
   virtual void TracingAsync(const TracingRequest* request,
                             TracingResponse* response, StatusCallback done) = 0;
 
+  virtual void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                            RecvBufResponse* response, StatusCallback done) = 0;
+
   virtual void CompleteGroupAsync(CallOptions* opts,
                                   const CompleteGroupRequest* request,
                                   CompleteGroupResponse* response,
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 2c87156dca61188a3a8deabf9ad483c9180ccd55..2bb4d32d5776640b505e0132f7f1fd263480e722 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -67,13 +67,8 @@ struct AllocatorStats {
 // device memory.
 class Allocator {
  public:
-#ifdef EIGEN_VECTORIZE_AVX512
   // Align to 64 byte boundary.
   static constexpr size_t kAllocatorAlignment = 64;
-#else
-  // Align to 32 byte boundary.
-  static constexpr size_t kAllocatorAlignment = 32;
-#endif
 
   virtual ~Allocator();
 
diff --git a/tensorflow/core/framework/api_def.proto b/tensorflow/core/framework/api_def.proto
index cce02d84b215d0b75ee096af076d177398910f3d..3f8dd272e7798ab289a0fcb411aef1e4a53ddf64 100644
--- a/tensorflow/core/framework/api_def.proto
+++ b/tensorflow/core/framework/api_def.proto
@@ -56,8 +56,10 @@ message ApiDef {
     // use a snake_case convention instead of CamelCase.
     string name = 1;
 
-    // First GraphDef version at which the op is disallowed.
-    int32 deprecation_version = 2;
+    // If this endpoint is deprecated, set deprecation_message to a
+    // message that should be logged when the endpoint is used.
+    // The message should indicate alternative endpoint to use, if any.
+    string deprecation_message = 2;
   }
   repeated Endpoint endpoint = 3;
 
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index 87c1ddd15df4f89e29b1d073f4380e65dae531f9..79966f06922a62c7d04648f4a2829d05861cd76b 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -33,6 +33,154 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+// Do not construct large tensors to compute their hash or compare for equality.
+constexpr int kMaxAttrValueTensorByteSize = 32 * 1024 * 1024;  // 32mb
+
+// Return the size of the tensor represented by this TensorProto. If shape is
+// not fully defined return -1.
+int64 TensorByteSize(const TensorProto& t) {
+  // num_elements returns -1 if shape is not fully defined.
+  int64 num_elems = TensorShape(t.tensor_shape()).num_elements();
+  return num_elems < 0 ? -1 : num_elems * DataTypeSize(t.dtype());
+}
+
+// Compute TensorProto hash by creating a Tensor, serializing it as tensor
+// content, and computing a hash of it's string representation. This is unsafe
+// operation, because large tensors can be represented as TensorProto, but can't
+// be serialized to tensor content.
+uint64 TensorProtoHash(const TensorProto& tp) {
+  Tensor tensor(tp.dtype());
+  bool success = tensor.FromProto(tp);
+  DCHECK(success);
+  TensorProto p;
+  tensor.AsProtoTensorContent(&p);
+  string s;
+  SerializeToStringDeterministic(p, &s);
+  return Hash64(s);
+}
+
+// Do not create large tensors in memory, compute hash based on TensorProto
+// string representation. Tensors with identical content potentially can have a
+// different hash code if they are defined with different TensorProto
+// representations.
+uint64 FastTensorProtoHash(const TensorProto& tp) {
+  string s;
+  if (TensorByteSize(tp) > kMaxAttrValueTensorByteSize) {
+    string s;
+    bool success = SerializeToStringDeterministic(tp, &s);
+    DCHECK(success);
+    return Hash64(s);
+  } else {
+    return TensorProtoHash(tp);
+  }
+}
+
+// There are multiple equivalent representations of attr values containing
+// TensorProtos. Compare them by constructing Tensors and serializing them
+// back. Comparing Tensor objects is pretty tricky. This is unsafe operation,
+// because large tensors can be represented as TensorProto, but can't be
+// serialized to tensor content.
+bool AreTensorProtosEqual(const TensorProto& lhs, const TensorProto& rhs) {
+  Tensor lhs_t(lhs.dtype());
+  bool success = lhs_t.FromProto(lhs);
+  DCHECK(success);
+
+  Tensor rhs_t(rhs.dtype());
+  success = rhs_t.FromProto(rhs);
+  DCHECK(success);
+
+  TensorProto lhs_tp;
+  lhs_t.AsProtoTensorContent(&lhs_tp);
+
+  TensorProto rhs_tp;
+  rhs_t.AsProtoTensorContent(&rhs_tp);
+
+  string lhs_str, rhs_str;
+  SerializeToStringDeterministic(lhs_tp, &lhs_str);
+  SerializeToStringDeterministic(rhs_tp, &rhs_str);
+
+  return lhs_str == rhs_str;
+}
+
+// Do not construct large tensors in memory, compare equality using TensorProto
+// string representation. Tensors with identical content potentially can have
+// different tensor proto representation.
+bool FastAreTensorProtosEqual(const TensorProto& lhs, const TensorProto& rhs) {
+  if (TensorByteSize(lhs) > kMaxAttrValueTensorByteSize ||
+      TensorByteSize(rhs) > kMaxAttrValueTensorByteSize) {
+    string lhs_str, rhs_str;
+    bool success = lhs.AppendToString(&lhs_str);
+    DCHECK(success);
+    success = rhs.AppendToString(&rhs_str);
+    DCHECK(success);
+
+    return lhs_str == rhs_str;
+  } else {
+    return AreTensorProtosEqual(lhs, rhs);
+  }
+}
+
+using TensorProtoHasher = std::function<uint64(const TensorProto&)>;
+using TensorProtosEquality =
+    std::function<bool(const TensorProto&, const TensorProto&)>;
+
+uint64 AttrValueHash(const AttrValue& a, const TensorProtoHasher& tensor_hash) {
+  if (a.has_tensor()) return tensor_hash(a.tensor());
+
+  if (a.has_func()) {
+    const NameAttrList& func = a.func();
+    uint64 h = Hash64(func.name());
+    std::map<string, AttrValue> map(func.attr().begin(), func.attr().end());
+    for (const auto& pair : map) {
+      h = Hash64(pair.first.data(), pair.first.size(), h);
+      h = Hash64Combine(AttrValueHash(pair.second, tensor_hash), h);
+    }
+    return h;
+  }
+
+  // If `a` is not a tensor or func, get a hash of serialized string.
+  string s;
+  SerializeToStringDeterministic(a, &s);
+  return Hash64(s);
+}
+
+bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b,
+                        const TensorProtosEquality& tensor_equality) {
+  if (a.has_tensor() != b.has_tensor()) {
+    return false;
+  } else if (a.has_tensor() && b.has_tensor()) {
+    return tensor_equality(a.tensor(), b.tensor());
+  }
+
+  // `func` field contains a nested AttrValue. Compare such AttrValues
+  // recursively.
+  if (a.has_func() != b.has_func()) {
+    return false;
+  } else if (a.has_func() && b.has_func()) {
+    const NameAttrList& af = a.func();
+    const NameAttrList& bf = b.func();
+    if (af.name() != bf.name()) return false;
+    std::unordered_map<string, AttrValue> am(af.attr().begin(),
+                                             af.attr().end());
+    for (const auto& bm_pair : bf.attr()) {
+      const auto& iter = am.find(bm_pair.first);
+      if (iter == am.end()) return false;
+      if (!AreAttrValuesEqual(iter->second, bm_pair.second, tensor_equality))
+        return false;
+      am.erase(iter);
+    }
+    if (!am.empty()) return false;
+    return true;
+  }
+
+  // All other fields in AttrValue have deterministic representations.
+  // It is safe to compare their serialized strings.
+  string a_str, b_str;
+  SerializeToStringDeterministic(a, &a_str);
+  SerializeToStringDeterministic(b, &b_str);
+  return a_str == b_str;
+}
+
 string SummarizeString(const string& str) {
   string escaped = str_util::CEscape(str);
 
@@ -412,89 +560,19 @@ void SetAttrValue(gtl::ArraySlice<NameAttrList> value, AttrValue* out) {
 }
 
 bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b) {
-  // There are multiple equivalent representations of attr values containing
-  // TensorProtos. Compare them by constructing Tensors and serializing them
-  // back. Comparing Tensor objects is pretty tricky.
-  if (a.has_tensor() != b.has_tensor()) {
-    return false;
-  } else if (a.has_tensor() && b.has_tensor()) {
-    Tensor at(a.tensor().dtype());
-    bool success = at.FromProto(a.tensor());
-    DCHECK(success);
-
-    Tensor bt(b.tensor().dtype());
-    success = bt.FromProto(b.tensor());
-    DCHECK(success);
-
-    TensorProto ap;
-    at.AsProtoTensorContent(&ap);
-
-    TensorProto bp;
-    bt.AsProtoTensorContent(&bp);
-
-    string a_str, b_str;
-    SerializeToStringDeterministic(ap, &a_str);
-    SerializeToStringDeterministic(bp, &b_str);
-    return a_str == b_str;
-  }
-
-  // `func` field contains a nested AttrValue. Compare such AttrValues
-  // recursively.
-  if (a.has_func() != b.has_func()) {
-    return false;
-  } else if (a.has_func() && b.has_func()) {
-    const NameAttrList& af = a.func();
-    const NameAttrList& bf = b.func();
-    if (af.name() != bf.name()) return false;
-    std::unordered_map<string, AttrValue> am(af.attr().begin(),
-                                             af.attr().end());
-    for (const auto& bm_pair : bf.attr()) {
-      const auto& iter = am.find(bm_pair.first);
-      if (iter == am.end()) return false;
-      if (!AreAttrValuesEqual(iter->second, bm_pair.second)) return false;
-      am.erase(iter);
-    }
-    if (!am.empty()) return false;
-    return true;
-  }
-
-  // All other fields in AttrValue have deterministic representations.
-  // It is safe to compare their serialized strings.
-  string a_str, b_str;
-  SerializeToStringDeterministic(a, &a_str);
-  SerializeToStringDeterministic(b, &b_str);
-  return a_str == b_str;
+  return AreAttrValuesEqual(a, b, AreTensorProtosEqual);
 }
 
 uint64 AttrValueHash(const AttrValue& a) {
-  if (a.has_tensor()) {
-    // Deal with multiple representations by parsing TensorProto to
-    // Tensor and serializing it back. This is slow, but current use case
-    // don't need high efficiency.
-    Tensor tensor(a.tensor().dtype());
-    bool success = tensor.FromProto(a.tensor());
-    DCHECK(success);
-    TensorProto p;
-    tensor.AsProtoTensorContent(&p);
-    string s;
-    SerializeToStringDeterministic(p, &s);
-    return Hash64(s);
-  }
-  if (a.has_func()) {
-    const NameAttrList& func = a.func();
-    uint64 h = Hash64(func.name());
-    std::map<string, AttrValue> map(func.attr().begin(), func.attr().end());
-    for (const auto& pair : map) {
-      h = Hash64(pair.first.data(), pair.first.size(), h);
-      h = Hash64Combine(AttrValueHash(pair.second), h);
-    }
-    return h;
-  }
+  return AttrValueHash(a, TensorProtoHash);
+}
 
-  // If `a` is not a tensor or func, get a hash of serialized string.
-  string s;
-  SerializeToStringDeterministic(a, &s);
-  return Hash64(s);
+bool FastAreAttrValuesEqual(const AttrValue& a, const AttrValue& b) {
+  return AreAttrValuesEqual(a, b, FastAreTensorProtosEqual);
+}
+
+uint64 FastAttrValueHash(const AttrValue& a) {
+  return AttrValueHash(a, FastTensorProtoHash);
 }
 
 bool HasPlaceHolder(const AttrValue& val) {
diff --git a/tensorflow/core/framework/attr_value_util.h b/tensorflow/core/framework/attr_value_util.h
index 29e34c5090ea9116de054ff7d3a9faf4ed2c30fb..0da9b1081bdf0b5314a3b18c4e34198505424eec 100644
--- a/tensorflow/core/framework/attr_value_util.h
+++ b/tensorflow/core/framework/attr_value_util.h
@@ -98,6 +98,19 @@ bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b);
 // probably not persist the returned value.
 uint64 AttrValueHash(const AttrValue& a);
 
+// WARNING: Equality check might return false-negative for large (> 32mb)
+// tensors defined with different TensorProto representations.
+//
+// A pair of consistent hash and equals functions that are guaranteed to be fast
+// with AttrValues that potentially can have very large Tensors (larger than
+// 32mb) defined by TensorProto. If large identical Tensors are defined using
+// different representations (e.g. one with tensor content, and second with
+// bool_val), they will have different hash code and equals will return false.
+// Small (less than 32mb) tensors with different TensorProto representations
+// hashed/compared by their tensor content.
+uint64 FastAttrValueHash(const AttrValue& a);
+bool FastAreAttrValuesEqual(const AttrValue& a, const AttrValue& b);
+
 // Returns true if "val" has a placeholder.
 bool HasPlaceHolder(const AttrValue& val);
 
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 0916c9b7a85122da7b4067b82b7b7cc1e7c3fb0a..71a31b0e75471c0965c46ac68f1e0728bfc35261 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -1417,6 +1417,21 @@ Status ExplicitShape(InferenceContext* c) {
   return Status::OK();
 }
 
+Status ExplicitShapes(InferenceContext* c) {
+  std::vector<PartialTensorShape> shapes;
+  TF_RETURN_IF_ERROR(c->GetAttr("shapes", &shapes));
+  if (shapes.empty()) {
+    return errors::Internal("shapes attribute is empty");
+  }
+  for (int i = 0; i < shapes.size(); ++i) {
+    ShapeHandle output_shape;
+    TF_RETURN_IF_ERROR(
+        c->MakeShapeFromPartialTensorShape(shapes[i], &output_shape));
+    c->set_output(i, output_shape);
+  }
+  return Status::OK();
+}
+
 }  // namespace shape_inference
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 789746b4037fbf9f11d34c425272bfc44d8623be..87bb133d929e4c0ad6b70fc54e2edc7918200d29 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -289,6 +289,9 @@ Status ScatterNdUpdateShape(InferenceContext* c);
 // Shape function for ops with an explicit "shape" attribute.
 Status ExplicitShape(InferenceContext* c);
 
+// Shape function for multiple-output ops with an explicit "shapes" attribute.
+Status ExplicitShapes(InferenceContext* c);
+
 }  // namespace shape_inference
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
index 4145ef7bc9d22632db3d0a71f8901a671dd95ee5..62a9d5751d6d4e8a82089fdecfd450a81c2d6bae 100644
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/dataset.h"
 
+#include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/node_builder.h"
 
@@ -269,4 +270,22 @@ const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
 const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] =
     "_DATASET_GRAPH_OUTPUT_NODE";
 
+namespace dataset {
+
+IteratorContext MakeIteratorContext(OpKernelContext* ctx) {
+  IteratorContext::Params params;
+  params.env = ctx->env();
+  params.runner = *(ctx->runner());
+  params.lib = ctx->function_library();
+  // Note: must use reinterpret_cast because function.h forward-declares Device.
+  DeviceBase* device =
+      reinterpret_cast<DeviceBase*>(ctx->function_library()->device());
+  params.allocator_getter = [device](AllocatorAttributes attrs) {
+    return device->GetAllocator(attrs);
+  };
+  return IteratorContext(params);
+}
+
+}  // namespace dataset
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 775d9f6eb6a4e24e74b44988ada95eed5239f725..8624af9bf56e7ef4c4ff1223e7d70394bc1b180d 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -619,6 +619,12 @@ Status GetDatasetFromVariantTensor(const Tensor& tensor,
 // The ownership of `dataset` is transferred to `tensor`.
 Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
 
+namespace dataset {
+
+IteratorContext MakeIteratorContext(OpKernelContext* ctx);
+
+}  // namespace dataset
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_FRAMEWORK_DATASET_H_
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index bdc1af9fdaeb2e2ba7605b6f67ea55fa0bb7977a..647c66099cfa99e0d7f83782f9ba77a8fb021b1c 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -504,7 +504,7 @@ string Print(const NodeDef& n) {
   std::vector<string> dep;
   for (StringPiece s : n.input()) {
     if (str_util::ConsumePrefix(&s, "^")) {
-      dep.push_back(s.ToString());
+      dep.push_back(std::string(s));
     } else {
       dat.push_back(s);
     }
diff --git a/tensorflow/core/framework/node_def_builder.cc b/tensorflow/core/framework/node_def_builder.cc
index f9cf6ce87359d6e4df03306629b53a73e5673181..8e00bfe4f894202919444c166245189a4bca4409 100644
--- a/tensorflow/core/framework/node_def_builder.cc
+++ b/tensorflow/core/framework/node_def_builder.cc
@@ -24,22 +24,23 @@ limitations under the License.
 namespace tensorflow {
 
 NodeDefBuilder::NodeOut::NodeOut(StringPiece n, int i, DataType dt)
-    : node(n.ToString()), index(i), data_type(dt) {}
+    : node(std::string(n)), index(i), data_type(dt) {}
 
 NodeDefBuilder::NodeOut::NodeOut() {
   // uninitialized, call Reset() before use.
 }
 
 void NodeDefBuilder::NodeOut::Reset(StringPiece n, int i, DataType dt) {
-  node = n.ToString();
+  node = std::string(n);
   index = i;
   data_type = dt;
 }
 
 NodeDefBuilder::NodeDefBuilder(StringPiece name, StringPiece op_name,
                                const OpRegistryInterface* op_registry) {
-  node_def_.set_name(name.ToString());
-  const Status status = op_registry->LookUpOpDef(op_name.ToString(), &op_def_);
+  node_def_.set_name(std::string(name));
+  const Status status =
+      op_registry->LookUpOpDef(std::string(op_name), &op_def_);
   if (status.ok()) {
     Initialize();
   } else {
@@ -50,7 +51,7 @@ NodeDefBuilder::NodeDefBuilder(StringPiece name, StringPiece op_name,
 
 NodeDefBuilder::NodeDefBuilder(StringPiece name, const OpDef* op_def)
     : op_def_(op_def) {
-  node_def_.set_name(name.ToString());
+  node_def_.set_name(std::string(name));
   Initialize();
 }
 
@@ -170,7 +171,7 @@ void NodeDefBuilder::AddInput(StringPiece src_node, int src_index) {
   } else if (src_index > 0) {
     node_def_.add_input(strings::StrCat(src_node, ":", src_index));
   } else {
-    node_def_.add_input(src_node.ToString());
+    node_def_.add_input(std::string(src_node));
   }
 }
 
@@ -193,12 +194,12 @@ void NodeDefBuilder::VerifyInputRef(const OpDef::ArgDef* input_arg,
 }
 
 NodeDefBuilder& NodeDefBuilder::ControlInput(StringPiece src_node) {
-  control_inputs_.push_back(src_node.ToString());
+  control_inputs_.push_back(std::string(src_node));
   return *this;
 }
 
 NodeDefBuilder& NodeDefBuilder::Device(StringPiece device_spec) {
-  node_def_.set_device(device_spec.ToString());
+  node_def_.set_device(std::string(device_spec));
   return *this;
 }
 
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index bad92ca9b3d8c981a5dc56485d218179190e83d0..a816c151407ed2a7e8ced3a1834fdc5e8d829317 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -245,7 +245,7 @@ DEFINE_GET_ATTR(NameAttrList, func, "func", emplace_back, v, ;);
 #undef DEFINE_GET_ATTR
 
 bool HasNodeAttr(const NodeDef& node_def, StringPiece attr_name) {
-  return node_def.attr().find(attr_name.ToString()) != node_def.attr().end();
+  return node_def.attr().find(std::string(attr_name)) != node_def.attr().end();
 }
 
 static const string& kEmptyString = *new string();
@@ -378,15 +378,20 @@ Status OutputTypeForNode(const NodeDef& node_def, const OpDef& op_def,
                                  node_def.name());
 }
 
+Status OutputTypesForNode(const NodeDef& node_def, const OpDef& op_def,
+                          DataTypeVector* outputs) {
+  for (const auto& arg : op_def.output_arg()) {
+    TF_RETURN_IF_ERROR(AddArgToSig(node_def, arg, outputs));
+  }
+  return Status::OK();
+}
+
 Status InOutTypesForNode(const NodeDef& node_def, const OpDef& op_def,
                          DataTypeVector* inputs, DataTypeVector* outputs) {
   for (const auto& arg : op_def.input_arg()) {
     TF_RETURN_IF_ERROR(AddArgToSig(node_def, arg, inputs));
   }
-  for (const auto& arg : op_def.output_arg()) {
-    TF_RETURN_IF_ERROR(AddArgToSig(node_def, arg, outputs));
-  }
-  return Status::OK();
+  return OutputTypesForNode(node_def, op_def, outputs);
 }
 
 Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
@@ -639,7 +644,7 @@ Status AttachDef(const Status& status, const Node& node) {
 
 void AddNodeAttr(StringPiece name, const AttrValue& value, NodeDef* node_def) {
   node_def->mutable_attr()->insert(
-      AttrValueMap::value_type(name.ToString(), value));
+      AttrValueMap::value_type(std::string(name), value));
 }
 
 #define ADD_NODE_ATTR(T)                                           \
@@ -677,7 +682,7 @@ ADD_NODE_ATTR(gtl::ArraySlice<NameAttrList>)
 #undef ADD_NODE_ATTR
 
 void AddAttr(StringPiece name, const AttrValue& value, AttrValueMap* map) {
-  map->insert(AttrValueMap::value_type(name.ToString(), value));
+  map->insert(AttrValueMap::value_type(std::string(name), value));
 }
 
 #define ADD_ATTR(T)                                            \
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index b8a1e84f2e79d6537f58e9ac15ff8e1a22f877c7..ce7818a31c689eb6b36933fbb8eb7525b14ea21f 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -247,6 +247,10 @@ Status InputTypeForNode(const NodeDef& node_def, const OpDef& op_def,
 // REQUIRES: ValidateOpDef(op_def).ok()
 Status OutputTypeForNode(const NodeDef& node_def, const OpDef& op_def,
                          int output_port, DataType* output_type);
+// Computes the output types for a specific node.
+// REQUIRES: ValidateOpDef(op_def).ok()
+Status OutputTypesForNode(const NodeDef& node_def, const OpDef& op_def,
+                          DataTypeVector* outputs);
 // Computes the input and output types for a specific node.
 // REQUIRES: ValidateOpDef(op_def).ok()
 Status InOutTypesForNode(const NodeDef& node_def, const OpDef& op_def,
diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc
index 403bd0b5e22a314309ad0994a879d588c124fe54..91eb6c0672d93e229a31424795ec54b5a68b3067 100644
--- a/tensorflow/core/framework/op_def_builder.cc
+++ b/tensorflow/core/framework/op_def_builder.cc
@@ -527,7 +527,7 @@ void FinalizeDoc(const string& text, OpDef* op_def,
 }  // namespace
 
 OpDefBuilder::OpDefBuilder(StringPiece op_name) {
-  op_def()->set_name(op_name.ToString());  // NOLINT
+  op_def()->set_name(std::string(op_name));  // NOLINT
 }
 
 OpDefBuilder& OpDefBuilder::Attr(StringPiece spec) {
@@ -584,7 +584,7 @@ OpDefBuilder& OpDefBuilder::Deprecated(int version, StringPiece explanation) {
   } else {
     OpDeprecation* deprecation = op_def()->mutable_deprecation();
     deprecation->set_version(version);
-    deprecation->set_explanation(explanation.ToString());
+    deprecation->set_explanation(std::string(explanation));
   }
   return *this;
 }
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 7f23272871abe96dfa2fd7240bfc82015178bda6..4b56d807df6bca6806dab5a1be79399bf6830d82 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op_gen_lib.h"
 
+#include <algorithm>
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -185,7 +186,7 @@ static bool FindMultiline(StringPiece line, size_t colon, string* end) {
   while (str_util::ConsumePrefix(&line, " ")) {
   }
   if (str_util::ConsumePrefix(&line, "<<")) {
-    *end = line.ToString();
+    *end = std::string(line);
     return true;
   }
   return false;
@@ -306,9 +307,6 @@ void InitApiDefFromOpDef(const OpDef& op_def, ApiDef* api_def) {
 
   auto* endpoint = api_def->add_endpoint();
   endpoint->set_name(op_def.name());
-  if (op_def.has_deprecation()) {
-    endpoint->set_deprecation_version(op_def.deprecation().version());
-  }
 
   for (const auto& op_in_arg : op_def.input_arg()) {
     auto* api_in_arg = api_def->add_in_arg();
diff --git a/tensorflow/core/framework/op_gen_lib_test.cc b/tensorflow/core/framework/op_gen_lib_test.cc
index 857b1c8dbcac66899f98bb4f2ef87f65f7442f6b..e0e77c74495d62d0d0d2bc1c75d50fb1963bdcfd 100644
--- a/tensorflow/core/framework/op_gen_lib_test.cc
+++ b/tensorflow/core/framework/op_gen_lib_test.cc
@@ -189,7 +189,6 @@ TEST(OpGenLibTest, ApiDefInitializedFromOpDef) {
 visibility: VISIBLE
 endpoint {
   name: "testop"
-  deprecation_version: 123
 }
 in_arg {
   name: "arg_a"
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index ca91d68f79f9e3d2bf658e8cb240ed6de93ff983..c71bcb26abc025e1ab5b990403dde5396f1ef508 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -923,7 +923,7 @@ void OpKernelContext::clear_recorded_memory() {
 struct KernelRegistration {
   KernelRegistration(const KernelDef& d, StringPiece c,
                      kernel_factory::OpKernelRegistrar::Factory f)
-      : def(d), kernel_class_name(c.ToString()), factory(f) {}
+      : def(d), kernel_class_name(std::string(c)), factory(f) {}
   const KernelDef def;
   const string kernel_class_name;
   const kernel_factory::OpKernelRegistrar::Factory factory;
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index c84ea3b034cc20329b20af111f6b08ceebbfb80b..621da5b83861956b50e863e2faab5b68a18995bb 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -338,6 +338,9 @@ class ResourceHandleOp : public OpKernel {
  private:
   string container_;
   string name_;
+  mutex mutex_;
+  Tensor resource_;
+  std::atomic<bool> initialized_{false};
 };
 
 // Registers a kernel for an op which produces a handle to a resource of the
@@ -511,10 +514,20 @@ ResourceHandleOp<T>::ResourceHandleOp(OpKernelConstruction* context)
 
 template <typename T>
 void ResourceHandleOp<T>::Compute(OpKernelContext* ctx) {
-  Tensor* output = nullptr;
-  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
-  output->scalar<ResourceHandle>()() =
-      MakeResourceHandle<T>(ctx, container_, name_);
+  if (!initialized_.load()) {
+    mutex_lock ml(mutex_);
+    // Checking again to see if another thread has initialized the resource.
+    if (!initialized_.load()) {
+      AllocatorAttributes attr;
+      attr.set_on_host(true);
+      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}),
+                                             &resource_, attr));
+      resource_.scalar<ResourceHandle>()() =
+          MakeResourceHandle<T>(ctx, container_, name_);
+      initialized_.store(true);
+    }
+  }
+  ctx->set_output(0, resource_);
 }
 
 }  //  end namespace tensorflow
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 2b995e8b5e8c845078de6f0f82f204f6fa781ee8..b02bc3adbed9e7e4d47d0f2e2094d8e062dfd9a7 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -605,13 +605,20 @@ Status InferenceContext::Subshape(ShapeHandle s, int64 start,
   return Subshape(s, start, std::numeric_limits<int64>::max() /* end */, out);
 }
 
-Status InferenceContext::Subshape(ShapeHandle s, int64 start_in, int64 end_in,
+Status InferenceContext::Subshape(ShapeHandle s, int64 start, int64 end,
                                   ShapeHandle* out) {
-  int64 start = start_in;
-  int64 end = end_in;
+  return Subshape(s, start, end, 1 /* stride */, out);
+}
+
+Status InferenceContext::Subshape(ShapeHandle s, int64 start, int64 end,
+                                  int64 stride, ShapeHandle* out) {
+  int64 start_in = start;
+  int64 end_in = end;
+
   const int32 rank = Rank(s);
-  if (start == 0 && ((RankKnown(s) && end >= rank) ||
-                     end == std::numeric_limits<int64>::max())) {
+  if (start == 0 && stride == 1 &&
+      ((RankKnown(s) && end >= rank) ||
+       end == std::numeric_limits<int64>::max())) {
     *out = s;
     return Status::OK();
   }
@@ -621,6 +628,9 @@ Status InferenceContext::Subshape(ShapeHandle s, int64 start_in, int64 end_in,
 
   if (start > rank) start = rank;
   if (end > rank) end = rank;
+
+  if (stride < 0 && start == rank) --start;
+
   if (start < 0) {
     start = rank + start;
     if (start < 0) {
@@ -638,16 +648,23 @@ Status InferenceContext::Subshape(ShapeHandle s, int64 start_in, int64 end_in,
                                      ", for shape with rank ", rank);
     }
   }
-  if (start > end) {
+  if (stride > 0 && start > end) {
     *out = nullptr;
     return errors::InvalidArgument(
         "Subshape must have computed start <= end, but is ", start, " and ",
         end, " (computed from start ", start_in, " and end ", end_in,
         " over shape with rank ", rank, ")");
+  } else if (stride < 0 && start < end) {
+    *out = nullptr;
+    return errors::InvalidArgument(
+        "Subshape must have computed start >= end since stride is negative, "
+        "but is ",
+        start, " and ", end, " (computed from start ", start_in, " and end ",
+        end_in, " over shape with rank ", rank, " and stride", stride, ")");
   }
+
   std::vector<DimensionHandle> dims;
-  dims.reserve(end - start);
-  for (int i = start; i < end; ++i) {
+  for (int i = start; stride > 0 ? i < end : i > end; i += stride) {
     dims.push_back(Dim(s, i));
   }
   return ReturnCreatedShape(dims, out);
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index 9431a62abefd1a98aa95df2b58f343dfad2c7bee..3f3729dcf97e4df81a22f6e2ab7ceee6377562a5 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -434,6 +434,13 @@ class InferenceContext {
   Status Subshape(ShapeHandle s, int64 start, int64 end,
                   ShapeHandle* out) TF_MUST_USE_RESULT;
 
+  // Returns in <*out> a sub-shape of <s>, with dimensions [start:end:stride].
+  // <start> and <end> can be negative, to index from the end of the shape.
+  // <start> and <end> are set to the rank of <s> if > rank of <s>.
+  // <stride> can be negative, to reverse the <s>.
+  Status Subshape(ShapeHandle s, int64 start, int64 end, int64 stride,
+                  ShapeHandle* out) TF_MUST_USE_RESULT;
+
   // Returns in <*out> the result of appending the dimensions of <s2> to those
   // of <s1>.
   Status Concatenate(ShapeHandle s1, ShapeHandle s2,
diff --git a/tensorflow/core/framework/shape_inference_testutil.h b/tensorflow/core/framework/shape_inference_testutil.h
index 2a99af7659d9be0dbab505fc7147e7fcc15d67c9..f6656b3b4563886473fbba3bade71a943d931ca5 100644
--- a/tensorflow/core/framework/shape_inference_testutil.h
+++ b/tensorflow/core/framework/shape_inference_testutil.h
@@ -32,7 +32,7 @@ class Tensor;
 
 struct ShapeInferenceTestOp {
   typedef std::pair<string, DataType> ShapeAndType;
-  explicit ShapeInferenceTestOp(StringPiece name) : name(name.ToString()) {}
+  explicit ShapeInferenceTestOp(StringPiece name) : name(std::string(name)) {}
   string name;
   NodeDef node_def;
   std::vector<const Tensor*> input_tensors;
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index d5a45c73c37bf0807e9437a4c886ca0d96dc5c67..167e0eaa6e187c9a340ad18bf7a2e02da646512a 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -619,7 +619,7 @@ void Tensor::CheckTypeAndIsAligned(DataType expected_dtype) const {
   CHECK_EQ(dtype(), expected_dtype)
       << DataTypeString(expected_dtype) << " expected, got "
       << DataTypeString(dtype());
-  CHECK(IsAligned()) << "CheckTypeAndIsAligned";
+  CHECK(IsAligned()) << "ptr = " << base<void>();
 }
 
 void Tensor::CheckIsAlignedAndSingleElement() const {
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index b613effd18bbbaf107a56b518859024db1c9bbb2..80e168df972c4cb662ff479c16b7172a1fbea598 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -1147,29 +1147,29 @@ TEST(Tensor, FailureToAllocate) {
 
 // On the alignment.
 //
-// As of 2015/8, tensorflow::Tensor allocates its buffer with 32-byte
+// As of 2018/5, tensorflow::Tensor allocates its buffer with 64-byte
 // alignment. Tensor::tensor/flat/vec/matrix methods requires the
 // buffer satisfies Eigen::Aligned (e.g., 16-bytes aligned usually,
-// and 32-bytes for AVX). Tensor::Slice requires the caller to ensure
-// its result is aligned if the caller intends to use those methods.
-// In this test case, we simply make sure each slice is 32-byte
-// aligned: sizeof(float) * 4 * 2 = 32.
+// 32-bytes for AVX, and 64-bytes for AVX512). Tensor::Slice requires
+// the caller to ensure its result is aligned if the caller intends
+// to use those methods. In this test case, we simply make sure each
+// slice is 64-byte aligned: sizeof(float) * 4 * 36 = 576.  576 % 64 = 0.
 TEST(Tensor, Slice_Basic) {
   Tensor saved;
   {  // General
-    Tensor x(DT_FLOAT, TensorShape({10, 4, 34}));
+    Tensor x(DT_FLOAT, TensorShape({10, 4, 36}));
     // Fills in known values.
     for (int i = 0; i < 10; ++i) {
       x.Slice(i, i + 1).flat<float>().setConstant(i * 1.f);
     }
     // A simple slice along dim0.
     Tensor y = x.Slice(4, 8);
-    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 34})));
+    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 4, 36})));
     auto tx = x.tensor<float, 3>();
     auto ty = y.tensor<float, 3>();
     for (int i = 0; i < 4; ++i) {
       for (int j = 0; j < 4; ++j) {
-        for (int k = 0; k < 34; ++k) {
+        for (int k = 0; k < 36; ++k) {
           EXPECT_EQ(ty(i, j, k), 4.0 + i);
           EXPECT_EQ(&tx(4 + i, j, k), &ty(i, j, k));
         }
@@ -1186,7 +1186,7 @@ TEST(Tensor, Slice_Basic) {
     auto tz = z.tensor<float, 3>();
     EXPECT_EQ(1, z.dim_size(0));
     for (int j = 0; j < 4; ++j) {
-      for (int k = 0; k < 34; ++k) {
+      for (int k = 0; k < 36; ++k) {
         EXPECT_EQ(tz(0, j, k), 6.0);
       }
     }
@@ -1198,16 +1198,16 @@ TEST(Tensor, Slice_Basic) {
     EXPECT_EQ(1, saved.dim_size(0));
     auto tsaved = saved.tensor<float, 3>();
     for (int j = 0; j < 4; ++j) {
-      for (int k = 0; k < 34; ++k) {
+      for (int k = 0; k < 36; ++k) {
         EXPECT_EQ(tsaved(0, j, k), 6.0);
       }
     }
   }
   {  // Empty
-    Tensor x(DT_FLOAT, TensorShape({10, 0, 34}));
+    Tensor x(DT_FLOAT, TensorShape({10, 0, 36}));
     x.flat<float>().setRandom();
     Tensor y = x.Slice(4, 8);
-    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 34})));
+    EXPECT_TRUE(y.shape().IsSameSize(TensorShape({4, 0, 36})));
   }
 
   {
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index eeb6c60f717523f9ad35e24011b420980744ecc8..71d0637dc238428a5cf5a3cdb2b2634f5684349a 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -695,7 +695,7 @@ Status Graph::AddWhileContext(StringPiece frame_name,
                               std::vector<OutputTensor> body_outputs,
                               WhileContext** result) {
   auto pair = while_ctxs_.insert(std::pair<string, WhileContext>(
-      frame_name.ToString(),
+      std::string(frame_name),
       WhileContext(frame_name, std::move(enter_nodes), std::move(exit_nodes),
                    cond_output, std::move(body_inputs),
                    std::move(body_outputs))));
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index c678283fce1768888cb7730aac8f306336c78c4b..2fd32c0bd4319ffd345769c0ce6504c20679f0f6 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -489,7 +489,7 @@ Status GraphConstructor::InitFromEdges() {
           num_control_edges++;
         } else {
           TensorId id(ParseTensorName(input_name));
-          if (next_iteration_nodes_.find(id.first.ToString()) !=
+          if (next_iteration_nodes_.find(std::string(id.first)) !=
               next_iteration_nodes_.end()) {
             has_loop_back_edge = true;
           }
@@ -811,7 +811,7 @@ void GraphConstructor::UniquifyNames(
     // We require that UniquifyNames() is called on all NodeDefs in topological
     // order. This guarantees that node_def's inputs will already be uniquified
     // if necessary.
-    auto iter = uniquified_names_.find(id.first.ToString());
+    auto iter = uniquified_names_.find(std::string(id.first));
     if (iter == uniquified_names_.end()) continue;
     id.first = iter->second;
     node_def->set_input(i, id.ToString());
@@ -830,7 +830,7 @@ void GraphConstructor::UpdateUniquifiedColocationNames() {
     for (int i = 0; i < coloc_values.size(); ++i) {
       StringPiece val(coloc_values[i]);
       if (str_util::ConsumePrefix(&val, kColocationGroupPrefix)) {
-        const auto& name_pair = uniquified_names_.find(val.ToString());
+        const auto& name_pair = uniquified_names_.find(std::string(val));
         if (name_pair == uniquified_names_.end()) continue;
         updated = true;
         coloc_values[i] =
@@ -856,7 +856,7 @@ bool GraphConstructor::NameExistsInGraphDef(StringPiece name) {
 }
 
 string GraphConstructor::FindUniqueName(StringPiece original_name) {
-  string name = original_name.ToString();
+  string name = std::string(original_name);
   int count = 0;
   // Check that any generated names don't collide with imported NodeDefs (as
   // well as nodes in g_).
@@ -989,7 +989,7 @@ Status GraphConstructor::Convert() {
             src_node->num_outputs(), " outputs");
       }
 
-      inputs.emplace_back(id.first.ToString(), src_node, src_index);
+      inputs.emplace_back(std::string(id.first), src_node, src_index);
     }
 
     if (has_data_back_edge && !IsMerge(*node_def)) {
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index b513778de9c86e12c8c80b51fbb4e5801f3514e2..c54b4fa269eaf240fa47d80b5e46991bcbcb8e6f 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -157,7 +157,7 @@ class GraphConstructorTest : public ::testing::Test {
     }
     StringPiece loc(value[0]);
     return str_util::ConsumePrefix(&loc, kColocationGroupPrefix)
-               ? loc.ToString()
+               ? std::string(loc)
                : "";
   }
 
diff --git a/tensorflow/core/graph/graph_def_builder.cc b/tensorflow/core/graph/graph_def_builder.cc
index 7a58347bd1ba44d822f5c52d2686e4b3c6e43d9b..dd84c4f7c7269dd212bcfb29085079e5d19e3403 100644
--- a/tensorflow/core/graph/graph_def_builder.cc
+++ b/tensorflow/core/graph/graph_def_builder.cc
@@ -44,12 +44,12 @@ GraphDefBuilder::Options GraphDefBuilder::Options::WithControlInputs(
 }
 GraphDefBuilder::Options GraphDefBuilder::Options::WithNameImpl(
     StringPiece name) {
-  name_ = name.ToString();
+  name_ = std::string(name);
   return *this;
 }
 GraphDefBuilder::Options GraphDefBuilder::Options::WithDeviceImpl(
     StringPiece device) {
-  device_ = device.ToString();
+  device_ = std::string(device);
   return *this;
 }
 GraphDefBuilder::Options GraphDefBuilder::Options::WithControlInputImpl(
diff --git a/tensorflow/core/graph/graph_def_builder.h b/tensorflow/core/graph/graph_def_builder.h
index 776a74c6d8821e53a26d73399105f55189f227df..0d6aae43556920027a2d1a8a19b23b6a3243fa3c 100644
--- a/tensorflow/core/graph/graph_def_builder.h
+++ b/tensorflow/core/graph/graph_def_builder.h
@@ -128,7 +128,7 @@ class GraphDefBuilder {
     Options WithControlInputsImpl(gtl::ArraySlice<Node*> control_inputs);
     template <class T>
     Options WithAttrImpl(StringPiece name, T&& value) {
-      attrs_.emplace_back(name.ToString(), AttrValue());
+      attrs_.emplace_back(std::string(name), AttrValue());
       SetAttrValue(std::forward<T>(value), &attrs_.back().second);
       return *this;
     }
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 877e4f1b44e005b310667f48dcc0bfd0d0a7e1d5..1b1941f9c19cf64420be0f088c99f00768c15b53 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -785,7 +785,7 @@ Status TopologicalSortNodesWithTimePriority(
   for (int n = 0; n < gdef->node_size(); ++n) {
     const NodeDef* ndef = &gdef->node(n);
     for (int i = 0; i < ndef->input_size(); ++i) {
-      node_to_output_nodes[ParseTensorName(ndef->input(i)).first.ToString()]
+      node_to_output_nodes[std::string(ParseTensorName(ndef->input(i)).first)]
           .push_back(ndef);
     }
     int64 start_time;
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 72a13d4da7a70a7dc22df8ef382575ac78ad78d3..b9667998d6143b24fee4bc6d23527bf5135331a1 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // If Op has been specifically assigned to a non-CPU device, then No.
     if (!n->assigned_device_name().empty() &&
-        !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) {
       result = false;
       reason = "Op has been assigned a runtime device that is not CPU.";
     }
 
     // If user has specifically assigned this op to a non-CPU device, then No.
     if (!n->def().device().empty() &&
-        !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) {
       result = false;
       reason = "User has assigned a device that is not CPU.";
     }
@@ -2865,9 +2865,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return false;
   }
 
-  // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized 
-  // path. The unoptimized path is slow. Thus we dont rewrite the node 
-  // and use default Eigen. But for depth_radius=2, MKL DNN optimized 
+  // If the depth_radius of LRN is not 2, then MKL DNN takes unoptimized
+  // path. The unoptimized path is slow. Thus we dont rewrite the node
+  // and use default Eigen. But for depth_radius=2, MKL DNN optimized
   // path is taken, i.e., eigen node is rewritten by MKl DNN node.
   static bool LrnRewrite(const Node* n) {
     CHECK_NOTNULL(n);
@@ -2876,13 +2876,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     CHECK_EQ(GetNodeAttr(n->def(), "depth_radius", &depth_radius).ok(), true);
 
     // if the depth_radius of LRN is not 2, don't rewrite the node by MKL DNN
-    // and use eigen node instead 
+    // and use eigen node instead
     if (depth_radius == 2) {
       return true;
     }
     VLOG(1) << "LrnRewrite: The model sets depth_radius as not 2 which"
             << "case is not optimized by Intel MKL, thus using Eigen op"
-            << "for LRN " ; 
+            << "for LRN ";
 
     return false;
   }
@@ -3015,6 +3015,35 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                                 std::vector<NodeBuilder::NodeOut>* ws_tensors,
                                 bool* are_ws_tensors_added);
 
+  // Helper function used by FixMklMetaDataEdges. Fixes the metadata edge
+  // pointed by 'e_metadata' corresponding to the data edge 'e_data' in graph
+  // 'g'. Returns true is fixup was done; otherwise, it returns false.
+  bool FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g,
+    const Edge* e_data, const Edge* e_metadata);
+
+  // Are the input Mkl metadata edges for node 'n' in graph 'g' correctly
+  // connected? If not, then fix them. This is needed because a graph may have
+  // some input Mkl metadata edges incorrectly setup after node merge and
+  // rewrite passes. This could happen because GetReversePostOrder function may
+  // not provide topologically sorted order if a graph contains cycles. The
+  // function returns true if at least one Mkl metadata edge for node 'n' was
+  // fixed. Otherwise, it returns false.
+  //
+  // Example:
+  //
+  // X = MklConv2D(_, _, _)
+  // Y = MklConv2DWithBias(_, _, _, _, _, _)
+  // Z = MklAdd(X, Y, DummyMklTensor, Y:1)
+  //
+  // For a graph such as shown above, note that 3rd argument of MklAdd contains
+  // DummyMklTensor. Actually, it should be getting the Mkl metadata from
+  // MklConv2D op (specifically, X:2). This incorrect plumbing could be possible
+  // (although rare) if the Mkl NodeMerge + NodeRewrite passes visit Z before X
+  // (possible if X, Y, Z are part of a loop.) This function fixes the Mkl
+  // metadata edges only - it does not rewrite nodes nor does it modify the Mkl
+  // data edges (1st and 2nd arguments of MklAdd).
+  bool FixMklMetaDataEdges(std::unique_ptr<Graph>* g, Node* n);
+
   // Functions specific to operators to copy attributes
   // We need operator-specific function to copy attributes because the framework
   // does not provide any generic function for it.
@@ -4241,6 +4270,92 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   return nullptr;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+//              Post-rewrite Mkl metadata fixup pass
+///////////////////////////////////////////////////////////////////////////////
+bool MklLayoutRewritePass::FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g,
+    const Edge* e_data, const Edge* e_metadata) {
+  if (g == nullptr || e_data == nullptr || e_metadata == nullptr) {
+    return false;
+  }
+
+  Node* n_data = e_data->src();
+  int n_data_op_slot = e_data->src_output();
+  int n_metadata_op_slot = GetTensorMetaDataIndex(n_data_op_slot,
+                                                  n_data->num_outputs());
+
+  // If the source of meta edge is a constant node (producing dummy Mkl metadata
+  // tensor), then we will need to fix.
+  if (IsConstant(e_metadata->src())) {
+    Node* e_metadata_dst = e_metadata->dst();
+    int e_metadata_in_slot = e_metadata->dst_input();
+    CHECK_NOTNULL((*g)->AddEdge(n_data, n_metadata_op_slot,
+                  e_metadata_dst, e_metadata_in_slot));
+
+    (*g)->RemoveEdge(e_metadata);
+    return true;
+  }
+
+  return false;
+}
+
+bool MklLayoutRewritePass::FixMklMetaDataEdges(std::unique_ptr<Graph>* g,
+    Node* n) {
+  bool result = false;
+
+  // If graph node is not Mkl node, then return.
+  DataType T = DT_INVALID;
+  if (!GetNodeAttr(n->def(), "T", &T).ok() ||
+      !mkl_op_registry::IsMklOp(n->type_string(), T)) {
+    return result;
+  }
+
+  // If it is Mkl node, then check if the input edges to this node that carry
+  // Mkl metadata are linked up correctly with the source node.
+
+  // For Mkl nodes, we generate twice the number of input tensors (n for Mkl
+  // data tensors + n for Mkl metadata tensors). We need to check for correct
+  // connection of n metadata tensors only.
+  int num_data_inputs = n->num_inputs() / 2;
+  for (int idx = 0; idx < num_data_inputs; idx++) {
+    // Get the edge connecting input slot with index (idx).
+    const Edge* e = nullptr;
+    TF_CHECK_OK(n->input_edge(idx, &e));
+
+    // If e is control edge, then skip.
+    if (e->IsControlEdge()) {
+      continue;
+    }
+
+    // Check that the source node for edge 'e' is Mkl node. If it is not an Mkl
+    // node, then we don't need to do anything.
+    Node* e_src = e->src();
+    if (GetNodeAttr(e_src->def(), "T", &T).ok() &&
+        mkl_op_registry::IsMklOp(e_src->type_string(), T)) {
+      // Source node for edge 'e' is Mkl node.
+      // Destination node and destination input slot of e is node 'n' and 'idx'
+      // resp.
+      CHECK_EQ(e->dst(), n);
+      CHECK_EQ(e->dst_input(), idx);
+
+      // Let's get edge that carries Mkl metadata corresponding to Mkl data edge
+      // 'e'. For that, let's first get the input slot of 'n' where the meta
+      // edge will feed the value.
+      int e_meta_in_slot = GetTensorMetaDataIndex(e->dst_input(),
+                                                  n->num_inputs());
+      const Edge* e_meta = nullptr;
+      TF_CHECK_OK(n->input_edge(e_meta_in_slot, &e_meta));
+
+      // Let's check if we need to fix this meta edge.
+      if (FixMklMetaDataEdgeIfNeeded(g, e, e_meta)) {
+        result = true;
+      }
+    }
+  }
+
+  return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 //              Run function for the pass
 ///////////////////////////////////////////////////////////////////////////////
@@ -4307,6 +4422,25 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
 
   DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g);
 
+  order.clear();
+  GetReversePostOrder(**g, &order);  // This will give us topological sort.
+  for (Node* n : order) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+      continue;
+    }
+    if (FixMklMetaDataEdges(g, n)) {
+      string node_name = n->name();
+      string op_name = n->type_string();
+
+      VLOG(1) << "MklLayoutRewritePass: fixed metadata edges for node "
+              << node_name << " with op " << op_name;
+      result = true;
+    }
+  }
+  DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite+Fixup)",
+            &**g);
+
   return result;
 }
 
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 5e2a465e22c7cbe45cbea40ea7a11491e2b2ad24..7645b4a7f0afa5fe0e6edc88a0259f92e064e82c 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -2022,6 +2022,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'BiasAdd'"
@@ -2051,6 +2052,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(_MklConv2D);DMT/_0(Const);DMT/_1(Const)|"
@@ -2069,6 +2071,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'Input'}"
@@ -2095,6 +2098,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'Input'}"
@@ -2125,6 +2129,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'BiasAdd'"
@@ -2151,6 +2156,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Positive) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C'] }"
       "node { name: 'E' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2178,6 +2184,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative1) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C'] }"
       "node { name: 'E' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2204,6 +2211,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative2) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C'] }"
       "node { name: 'E' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2233,6 +2241,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative3) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
       "node { name: 'E' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
@@ -2272,6 +2281,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'BiasAdd'"
@@ -2289,6 +2299,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['F', 'B', 'E']}"
       "node { name: 'Z' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
@@ -2319,6 +2330,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['B', 'C'] }");
@@ -2341,6 +2353,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Conv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2348,6 +2361,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D'] }");
@@ -2370,6 +2384,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_HALF } }"
       " input: ['B', 'C'] }");
@@ -2389,6 +2404,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'D'] }");
@@ -2411,6 +2427,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['B', 'A', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'D'] }");
@@ -2477,6 +2494,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive2) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'M', 'N']}"
       "node { name: 'D' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
@@ -2529,6 +2547,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'F' op: 'Conv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2536,6 +2555,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['C', 'D']}"
       "node { name: 'G' op: 'Const' "
       " attr { key: 'dtype' value { type: DT_INT32 } }"
@@ -2572,6 +2592,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D']}"
@@ -2634,6 +2655,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'F' op: 'Conv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -2641,6 +2663,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['C', 'D']}"
       "node { name: 'G' op: 'Const' "
       " attr { key: 'dtype' value { type: DT_INT32 } }"
@@ -2678,6 +2701,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D']}"
@@ -3274,6 +3298,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['B', 'C'] }",
@@ -3296,6 +3321,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
       "node { name: 'E' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
@@ -3323,6 +3349,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A', 'B', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'D'] }",
@@ -3491,6 +3518,37 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
             "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1");
 }
 
+/////////////////////////////////////////////////////////////////////
+//         Post-rewrite fixup pass test
+
+TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B', 'M', 'N']}"
+      "node { name: 'D' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_UINT8 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_UINT8 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'E' op: '_MklAdd'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A', 'D', 'D']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Const);E(_MklAdd);"
+            "M(_MklInput);N(_MklInput)|A->C;A->E:1;B->C:1;C->E;C:2->E:2;"
+            "D->E:3;M->C:2;N->C:3");
+}
+
 /////////////////////////////////////////////////////////////////////
 
 static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index 114962c0e4f2969fe539d5b168aaf62d577a7024..03f3bbd6634b8a4a4fab5411fcb02b3ab8611d70 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -30,7 +30,7 @@ NodeBuilder::NodeOut::NodeOut(Node* n, int32 i)  // NOLINT(runtime/explicit)
       dt(SafeGetOutput(node, i, &error)) {}
 
 NodeBuilder::NodeOut::NodeOut(StringPiece n, int32 i, DataType t)
-    : node(nullptr), error(false), name(n.ToString()), index(i), dt(t) {}
+    : node(nullptr), error(false), name(std::string(n)), index(i), dt(t) {}
 
 NodeBuilder::NodeOut::NodeOut()
     : node(nullptr), error(true), index(0), dt(DT_FLOAT) {}
diff --git a/tensorflow/core/graph/while_context.cc b/tensorflow/core/graph/while_context.cc
index 10a2b67f37817454b5322759c91aa18e983c5b9d..1b38aac35db9f5c16cc5068e19416838a2645978 100644
--- a/tensorflow/core/graph/while_context.cc
+++ b/tensorflow/core/graph/while_context.cc
@@ -23,7 +23,7 @@ WhileContext::WhileContext(StringPiece frame_name,
                            OutputTensor cond_output,
                            std::vector<OutputTensor> body_inputs,
                            std::vector<OutputTensor> body_outputs)
-    : frame_name_(frame_name.ToString()),
+    : frame_name_(std::string(frame_name)),
       enter_nodes_(std::move(enter_nodes)),
       exit_nodes_(std::move(exit_nodes)),
       cond_output_(cond_output),
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index d8c4489a5799b1f43cb48639b5a0926e59bd0e75..69b759473588e7c39bf74a151db3500032bdf71a 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -816,6 +816,60 @@ class SymbolicShapeRefiner {
           c->output_tensors_as_shapes.resize(1);
           c->output_tensors_as_shapes[0] = result;
         }
+      } else if (IsStridedSlice(node)) {
+        ShapeHandle input = ic->input_tensors_as_shapes()[0];
+        bool valid = ic->RankKnown(input);
+        const Tensor* slice_begin = ic->input_tensor(1);
+        valid &= slice_begin != nullptr && slice_begin->NumElements() == 1;
+        const Tensor* slice_end = ic->input_tensor(2);
+        valid &= slice_end != nullptr && slice_end->NumElements() == 1;
+        const Tensor* slice_stride = ic->input_tensor(3);
+        valid &= slice_stride != nullptr && slice_stride->NumElements() == 1;
+
+        if (node.attr().count("ellipsis_mask") > 0 &&
+            node.attr().at("ellipsis_mask").i() != 0) {
+          valid = false;
+        }
+        if (node.attr().count("new_axis_mask") > 0 &&
+            node.attr().at("new_axis_mask").i() != 0) {
+          valid = false;
+        }
+        if (node.attr().count("shrink_axis_mask") > 0 &&
+            node.attr().at("shrink_axis_mask").i() != 0) {
+          valid = false;
+        }
+        int begin_mask = 0;
+        if (node.attr().count("begin_mask") > 0) {
+          begin_mask = node.attr().at("begin_mask").i();
+        }
+        int end_mask = 0;
+        if (node.attr().count("end_mask") > 0) {
+          end_mask = node.attr().at("end_mask").i();
+        }
+        if (begin_mask < 0 || begin_mask > 1 || end_mask < 0 || end_mask > 1) {
+          valid = false;
+        }
+        if (valid) {
+          int64 begin = 0;
+          if (begin_mask == 0) {
+            begin = slice_begin->dtype() == DT_INT32
+                        ? slice_begin->flat<int32>()(0)
+                        : slice_begin->flat<int64>()(0);
+          }
+          int64 end = std::numeric_limits<int64>::max();
+          if (end_mask == 0) {
+            end =
+                (slice_end->dtype() == DT_INT32 ? slice_end->flat<int32>()(0)
+                                                : slice_end->flat<int64>()(0));
+          }
+          int64 stride = slice_stride->dtype() == DT_INT32
+                             ? slice_stride->flat<int32>()(0)
+                             : slice_stride->flat<int64>()(0);
+          ShapeHandle result;
+          TF_RETURN_IF_ERROR(ic->Subshape(input, begin, end, stride, &result));
+          c->output_tensors_as_shapes.resize(1);
+          c->output_tensors_as_shapes[0] = result;
+        }
       }
     }
 
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index a53f6414c307e0c74dd0d130aa3f60ee36a5e08a..3e44b222fdb99b93b725bad8f4e6074b864d643a 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -952,6 +952,39 @@ TEST_F(GraphPropertiesTest, Performance) {
   TF_CHECK_OK(properties.InferStatically(false));
 }
 
+TEST_F(GraphPropertiesTest, StridedSlicesOfShapes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a =
+      ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  auto shp = ops::Shape(s.WithOpName("shape"), {a});
+
+  Output index1 = ops::Const(s.WithOpName("index1"), 0, {1});
+  Output index2 = ops::Const(s.WithOpName("index2"), 1, {1});
+  Output index3 = ops::Const(s.WithOpName("index3"), 2, {1});
+
+  Output b = ops::StridedSlice(s.WithOpName("b"), shp, index1, index2, index2);
+  Output c = ops::StridedSlice(s.WithOpName("c"), shp, index2, index3, index2);
+
+  Output zero = ops::Const(s.WithOpName("zero"), 0.0f, {});
+  Output o1 = ops::Fill(s.WithOpName("o1"), b, zero);
+  Output o2 = ops::Fill(s.WithOpName("o2"), c, zero);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+  const auto shape_a = properties.GetOutputProperties("a").at(0).shape();
+  const auto shape_o1 = properties.GetOutputProperties("o1").at(0).shape();
+  const auto shape_o2 = properties.GetOutputProperties("o2").at(0).shape();
+  EXPECT_EQ(2, shape_a.dim_size());
+  EXPECT_EQ(1, shape_o1.dim_size());
+  EXPECT_EQ(1, shape_o2.dim_size());
+  EXPECT_EQ(shape_a.dim(0).size(), shape_o1.dim(0).size());
+  EXPECT_EQ(shape_a.dim(1).size(), shape_o2.dim(0).size());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index b35873ce385974178967ef3700e2309cf90c1d10..b8e337582c93a8c9f178f1bd441d408c4250dc4d 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -27,10 +27,16 @@ namespace grappler {
 
 constexpr int kOpsPerMac = 2;
 constexpr char kConst[] = "Const";
+constexpr char kGuaranteeConst[] = "GuaranteeConst";
 constexpr char kConv2d[] = "Conv2D";
 constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
 constexpr char kConv2dBackpropInput[] = "Conv2DBackpropInput";
 constexpr char kFusedConv2dBiasActivation[] = "FusedConv2DBiasActivation";
+constexpr char kDepthwiseConv2dNative[] = "DepthwiseConv2dNative";
+constexpr char kDepthwiseConv2dNativeBackpropFilter[] =
+    "DepthwiseConv2dNativeBackpropFilter";
+constexpr char kDepthwiseConv2dNativeBackpropInput[] =
+    "DepthwiseConv2dNativeBackpropInput";
 constexpr char kMatMul[] = "MatMul";
 constexpr char kSparseMatMul[] = "SparseMatMul";
 constexpr char kPlaceholder[] = "Placeholder";
@@ -123,33 +129,6 @@ int64 GetOutputSize(const int64 input, const int64 filter, const int64 stride,
   }
 }
 
-// Return a minimum shape if the shape is unknown. If known, return the original
-// shape.
-TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape,
-                                      int rank, bool* found_unknown_shapes) {
-  auto shape = original_shape;
-  if (shape.unknown_rank() || shape.dim_size() < rank) {
-    *found_unknown_shapes = true;
-    TensorShapeProto::Dim dim;
-    VLOG(2) << "Use minimum shape because the rank is unknown.";
-    // The size of each dimension is at least 1, if unknown.
-    dim.set_size(1);
-    for (int i = 0; i < rank; i++) {
-      *shape.add_dim() = dim;
-    }
-  } else {
-    for (int i = 0; i < shape.dim_size(); i++) {
-      if (shape.dim(i).size() < 0) {
-        *found_unknown_shapes = true;
-        VLOG(2) << "Use minimum dim size 1 because the shape is unknown.";
-        // The size of each dimension is at least 1, if unknown.
-        shape.mutable_dim(i)->set_size(1);
-      }
-    }
-  }
-  return shape;
-}
-
 // Return the output element count of a binary element-wise op considering
 // broadcasting.
 int64 CwiseOutputElementCount(const TensorShapeProto& input_shape_1,
@@ -181,6 +160,33 @@ int64 CwiseOutputElementCount(const TensorShapeProto& input_shape_1,
 
 }  // namespace
 
+// Return a minimum shape if the shape is unknown. If known, return the original
+// shape.
+TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape,
+                                      int rank, bool* found_unknown_shapes) {
+  auto shape = original_shape;
+  if (shape.unknown_rank() || shape.dim_size() < rank) {
+    *found_unknown_shapes = true;
+    TensorShapeProto::Dim dim;
+    VLOG(2) << "Use minimum shape because the rank is unknown.";
+    // The size of each dimension is at least 1, if unknown.
+    dim.set_size(1);
+    for (int i = 0; i < rank; i++) {
+      *shape.add_dim() = dim;
+    }
+  } else {
+    for (int i = 0; i < shape.dim_size(); i++) {
+      if (shape.dim(i).size() < 0) {
+        *found_unknown_shapes = true;
+        VLOG(2) << "Use minimum dim size 1 because the shape is unknown.";
+        // The size of each dimension is at least 1, if unknown.
+        shape.mutable_dim(i)->set_size(1);
+      }
+    }
+  }
+  return shape;
+}
+
 OpLevelCostEstimator::OpLevelCostEstimator() {
   // Syntactic sugar to build and return a lambda that takes an OpInfo and
   // returns a cost.
@@ -200,11 +206,20 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
       {kFusedConv2dBiasActivation,
        wrap(&OpLevelCostEstimator::PredictFusedConv2DBiasActivation)},
+      // reuse Conv2D for DepthwiseConv2dNative because the caculation is the
+      // same although the actual meaning of the parameters are different. See
+      // comments in PredictConv2D and related functions
+      {kDepthwiseConv2dNative, wrap(&OpLevelCostEstimator::PredictConv2D)},
+      {kDepthwiseConv2dNativeBackpropFilter,
+       wrap(&OpLevelCostEstimator::PredictConv2DBackpropFilter)},
+      {kDepthwiseConv2dNativeBackpropInput,
+       wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
 
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
+      {kGuaranteeConst, wrap(&OpLevelCostEstimator::PredictNoOp)},
 
       {kGather, wrap(&OpLevelCostEstimator::PredictGatherOrSlice)},
       {kGatherV2, wrap(&OpLevelCostEstimator::PredictGatherOrSlice)},
@@ -537,18 +552,30 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
 int64 OpLevelCostEstimator::CountConv2DOperations(
     const OpInfo& op_features, ConvolutionDimensions* conv_info,
     bool* found_unknown_shapes) const {
-  if (op_features.op() != kConv2d) {
-    LOG(ERROR) << "Invalid Operation";
-    return 0;
-  }
+  DCHECK(op_features.op() == kConv2d ||
+         op_features.op() == kDepthwiseConv2dNative)
+      << "Invalid Operation: not Conv2D nor DepthwiseConv2dNative";
+
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
       op_features.inputs(0).shape(), op_features.inputs(1).shape(), op_features,
       found_unknown_shapes);
 
+  //  in DepthwiseConv2dNative conv_dims.oz is actually the channel depth
+  //  multiplier; The effective output channel depth oz_effective is
+  //  conv_dims.iz * conv_dims.oz. thus # ops = N x H x W x oz_effective x 2RS.
+  //  Compare to Conv2D where # ops =  N x H x W x iz x oz x 2RS,
+  //  oz = oz_effective,  then Conv2D_ops / Depthwise_conv2d_native_ops = iz.
   int64 ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  ops *= conv_dims.iz * conv_dims.oz;
+  if (op_features.op() == kConv2d) {
+    ops *= conv_dims.iz * conv_dims.oz;
+  } else {
+    // To ensure output tensor dims to be correct for DepthwiseConv2DNative,
+    // although ops are the same as Conv2D.
+    conv_dims.oz *= conv_dims.iz;
+    ops *= conv_dims.oz;
+  }
   ops *= kOpsPerMac;
 
   if (conv_info != nullptr) {
@@ -795,7 +822,10 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
     bool* found_unknown_shapes) const {
   int64 ops = 0;
 
-  DCHECK_EQ(kConv2dBackpropInput, op_features.op());
+  DCHECK(op_features.op() == kConv2dBackpropInput ||
+         op_features.op() == kDepthwiseConv2dNativeBackpropInput)
+      << "Invalid Operation: not kConv2dBackpropInput nor"
+         "kDepthwiseConv2dNativeBackpropInput";
 
   if (op_features.inputs_size() < 2) {
     *found_unknown_shapes = true;
@@ -828,10 +858,16 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
   ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  ops *= conv_dims.iz * conv_dims.oz;
+  if (op_features.op() == kConv2dBackpropInput) {
+    ops *= conv_dims.iz * conv_dims.oz;
+  } else {
+    // conv_dims always use forward path definition regardless
+    conv_dims.oz *= conv_dims.iz;
+    ops *= conv_dims.oz;
+  }
   ops *= kOpsPerMac;
 
-  VLOG(1) << "Operations for Conv2DBackpropInput " << ops;
+  VLOG(1) << "Operations for" << op_features.op() << "  " << ops;
 
   if (returned_conv_dims != nullptr) {
     *returned_conv_dims = conv_dims;
@@ -843,7 +879,11 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
     const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
     bool* found_unknown_shapes) const {
   int64 ops = 0;
-  DCHECK_EQ(kConv2dBackpropFilter, op_features.op());
+
+  DCHECK(op_features.op() == kConv2dBackpropFilter ||
+         op_features.op() == kDepthwiseConv2dNativeBackpropFilter)
+      << "Invalid Operation: not kConv2dBackpropFilter nor"
+         "kDepthwiseConv2dNativeBackpropFilter";
 
   TensorShapeProto filter_shape;
   bool shape_found = false;
@@ -875,10 +915,15 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
   ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  ops *= conv_dims.iz * conv_dims.oz;
+  if (op_features.op() == kConv2dBackpropFilter) {
+    ops *= conv_dims.iz * conv_dims.oz;
+  } else {
+    // conv_dims always use forward path definition regardless
+    conv_dims.oz *= conv_dims.iz;
+    ops *= conv_dims.oz;
+  }
   ops *= kOpsPerMac;
-
-  VLOG(1) << "Operations for Conv2DBackpropFilter" << ops;
+  VLOG(1) << "Operations for" << op_features.op() << "  " << ops;
 
   if (returned_conv_dims != nullptr) {
     *returned_conv_dims = conv_dims;
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 35649f7ee959a292dbf68246221bc98c52f2db37..d384f5727965bc382377c155c515e4f8f006169d 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -30,6 +30,8 @@ namespace grappler {
 
 bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto,
                                         TensorShapeProto* tensor_shape_proto);
+TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape,
+                                      int rank, bool* found_unknown_shapes);
 
 class OpLevelCostEstimator {
  public:
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 13ea43bed692828f00e89b7f964c3abcdcdb6483..b2c021b73ac4c37d7afc493777fd248055a83970 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -128,6 +128,23 @@ OpContext DescribeConvolution(int batch, int ix, int iy, int iz1, int iz2,
   return op_context;
 }
 
+// Describe DepthwiseConvolution constructs an OpContext for a
+// DepthwiseConv2dNative applied to an input
+// tensor with shape (batch, ix, iy, iz1) and a kernel tensor with shape
+// (kx, ky, iz2, cm). cm is channel multiplier
+
+OpContext DescribeDepthwiseConv2dNative(int batch, int ix, int iy, int iz1,
+                                        int iz2, int kx, int ky, int cm) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("DepthwiseConv2dNative");
+
+  DescribeTensor4D(batch, ix, iy, iz1, op_context.op_info.add_inputs());
+  DescribeTensor4D(kx, ky, iz2, cm, op_context.op_info.add_inputs());
+
+  return op_context;
+}
+
 // DescribeFusedConv2DBiasActivation constructs an OpContext for a
 // FusedConv2DBiasActivation applied to a convolution input tensor with shape
 // (batch, ix, iy, iz1), a kernel tensor with shape (kx, ky, iz2, oz), a
@@ -505,6 +522,15 @@ TEST_F(OpLevelCostEstimatorTest, Conv2DExecutionTime) {
   EXPECT_FALSE(cost.inaccurate);
 }
 
+TEST_F(OpLevelCostEstimatorTest, DepthwiseConv2dNativeExecutionTime) {
+  auto cost =
+      PredictCosts(DescribeDepthwiseConv2dNative(16, 19, 19, 48, 48, 5, 5, 3));
+  EXPECT_EQ(Costs::Duration(112340), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(4158720), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(4271060), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
 TEST_F(OpLevelCostEstimatorTest, DummyExecutionTime) {
   auto cost = PredictCosts(DescribeBinaryOp("Dummy", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index e633ecf78989f75db8d21b18552c7bc6ef433f60..92581942cbdbb0b76679297c48bc081c73fda833 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -106,6 +106,8 @@ bool IsConv2DBackpropInput(const NodeDef& node) {
   return node.op() == "Conv2DBackpropInput";
 }
 
+bool IsConv3D(const NodeDef& node) { return node.op() == "Conv3D"; }
+
 bool IsDepthwiseConv2dNative(const NodeDef& node) {
   return node.op() == "DepthwiseConv2dNative";
 }
@@ -408,6 +410,21 @@ bool IsPersistent(const NodeDef& node) {
   return IsConstant(node) || IsVariable(node);
 }
 
+bool MaybeHasRefInput(const NodeDef& node) {
+  const OpDef* op_def;
+  Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
+  if (!status.ok()) {
+    return true;
+  }
+  // Nodes such as Assign or AssignAdd modify one of their inputs.
+  for (const auto& input : op_def->input_arg()) {
+    if (input.is_ref()) {
+      return true;
+    }
+  }
+  return false;
+}
+
 bool IsFreeOfSideEffect(const NodeDef& node) {
   // Placeholders must be preserved to keep the graph feedable.
   if (IsPlaceholder(node)) {
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index f6105d710e41c066026eab1a097509a292039ecc..9d91ba1ba53acb5af6ee4e78ade1fb37cb66283e 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -48,6 +48,7 @@ bool IsConstant(const NodeDef& node);
 bool IsConv2D(const NodeDef& node);
 bool IsConv2DBackpropFilter(const NodeDef& node);
 bool IsConv2DBackpropInput(const NodeDef& node);
+bool IsConv3D(const NodeDef& node);
 bool IsDepthwiseConv2dNative(const NodeDef& node);
 bool IsDepthwiseConv2dNativeBackpropFilter(const NodeDef& node);
 bool IsDepthwiseConv2dNativeBackpropInput(const NodeDef& node);
@@ -166,6 +167,10 @@ bool IsPersistent(const NodeDef& node);
 
 bool IsFreeOfSideEffect(const NodeDef& node);
 
+// Returns true if the takes a tensor reference as input, or if looking up its
+// OpDef failed.
+bool MaybeHasRefInput(const NodeDef& node);
+
 bool ModifiesFrameInfo(const NodeDef& node);
 
 // Returns true if the op is known to write to one or more of its inputs.
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 5b5e1e024e8cfaae638c453b469ae4881bcd3df8..104a0428ce83705abf47f2124e1618ddbb9d08fe 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -96,6 +96,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
+        ":symbolic_shapes",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -214,7 +215,6 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core/grappler/utils:frame",
     ],
 )
 
@@ -267,7 +267,6 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core/grappler/utils:frame",
         "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
@@ -508,7 +507,6 @@ cc_library(
         ":arithmetic_optimizer",
         ":auto_parallel",
         ":constant_folding",
-        ":custom_graph_optimizer",
         ":custom_graph_optimizer_registry",
         ":debug_stripper",
         ":dependency_optimizer",
@@ -518,6 +516,8 @@ cc_library(
         ":loop_optimizer",
         ":memory_optimizer",
         ":model_pruner",
+        ":remapper",
+        ":shape_optimizer",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -604,6 +604,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
@@ -628,6 +629,79 @@ tf_cuda_cc_test(
     ],
 )
 
+cc_library(
+    name = "shape_optimizer",
+    srcs = ["shape_optimizer.cc"],
+    hdrs = [
+        "shape_optimizer.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_optimizer",
+        ":symbolic_shapes",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:frame",
+    ],
+)
+
+tf_cc_test(
+    name = "shape_optimizer_test",
+    srcs = ["shape_optimizer_test.cc"],
+    deps = [
+        ":shape_optimizer",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
+cc_library(
+    name = "remapper",
+    srcs = ["remapper.cc"],
+    hdrs = [
+        "remapper.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":constant_folding",
+        ":graph_optimizer",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:graph_properties",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "remapper_test",
+    srcs = ["remapper_test.cc"],
+    deps = [
+        ":remapper",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
 cc_library(
     name = "symbolic_shapes",
     srcs = ["symbolic_shapes.cc"],
@@ -635,6 +709,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ] + tf_protos_grappler(),
 )
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index adfae2e1a34eb8836e7f3cfcb0223e65a7873470..060e4200af230267e6c3d18d2fd1495c785bb673 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
@@ -38,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/tensor_coding.h"
@@ -254,6 +256,17 @@ NodeDef* GetTailOfValuePreservingChain(
                         is_value_preserving_non_branching);
 }
 
+NodeDef* GetTailOfIdempotentChain(
+    const NodeDef& node, const NodeMap& node_map,
+    const std::unordered_set<string>& nodes_to_preserve) {
+  auto is_idempotent_non_branching = [&](const NodeDef& node) {
+    return nodes_to_preserve.find(node.name()) == nodes_to_preserve.end() &&
+           IsIdempotent(node) && NumNonControlOutputs(node, node_map) == 1;
+  };
+  return GetTailOfChain(node, node_map, /*follow_control_input=*/false,
+                        is_idempotent_non_branching);
+}
+
 // Graph optimizer context extension specific to ArithmeticOptimizer.
 struct ArithmeticOptimizerContext {
   explicit ArithmeticOptimizerContext(SetVector<NodeDef*>* nodes_to_simplify)
@@ -270,7 +283,7 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
                                     const ArithmeticOptimizerContext ctx_ext)
       : GraphOptimizerStage("ArithmeticOptimizer", name, ctx),
         ctx_ext_(ctx_ext) {}
-  virtual ~ArithmeticOptimizerStage() = default;
+  ~ArithmeticOptimizerStage() override = default;
 
  protected:
   // Simplification graph rewrite can create additional nodes that are inputs
@@ -1149,21 +1162,24 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
 class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
  public:
   explicit RemoveIdentityTranspose(const GraphOptimizerContext& ctx,
-                                   const ArithmeticOptimizerContext& ctx_ext)
-      : ArithmeticOptimizerStage("RemoveIdentityTranspose", ctx, ctx_ext) {}
+                                   const ArithmeticOptimizerContext& ctx_ext,
+                                   RewriterConfig::Toggle opt_level)
+      : ArithmeticOptimizerStage("RemoveIdentityTranspose", ctx, ctx_ext),
+        opt_level_(opt_level) {}
   ~RemoveIdentityTranspose() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
     return IsTranspose(*node) || IsConjugateTranspose(*node);
   }
 
-  // TODO(rmlarsen): Forward control dependencies on the bypassed
-  // transpose nodes.
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
     TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+    NodeDef* tail = node;
+    tail = GetTailOfIdempotentChain(*tail, *ctx().node_map,
+                                    *ctx().nodes_to_preserve);
+    NodeDef* first_transpose;
+    TF_RETURN_IF_ERROR(GetInputNode(tail->input(0), &first_transpose));
 
-    NodeDef* input;
-    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input));
     NodeDef* node_perm;
     TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &node_perm));
     if (!IsConstant(*node_perm)) {
@@ -1171,17 +1187,30 @@ class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
     }
     std::vector<int64> node_perm_values;
     TF_RETURN_IF_ERROR(GetPermutation(*node_perm, &node_perm_values));
-    if (input->op() == node->op()) {
+    if (first_transpose->op() == node->op()) {
       // Remove pairs of transposes that cancel each other.
-      NodeDef* input_perm;
-      TF_RETURN_IF_ERROR(GetInputNode(input->input(1), &input_perm));
-      if (!IsConstant(*input_perm)) {
+      NodeDef* first_transpose_perm;
+      TF_RETURN_IF_ERROR(
+          GetInputNode(first_transpose->input(1), &first_transpose_perm));
+      if (!IsConstant(*first_transpose_perm)) {
         return Status::OK();
       }
-      std::vector<int64> input_perm_values;
-      TF_RETURN_IF_ERROR(GetPermutation(*input_perm, &input_perm_values));
-      if (AreInversePermutations(node_perm_values, input_perm_values)) {
-        *simplified_node_name = input->input(0);
+      std::vector<int64> first_transpose_perm_values;
+      TF_RETURN_IF_ERROR(
+          GetPermutation(*first_transpose_perm, &first_transpose_perm_values));
+      if (AreInversePermutations(node_perm_values,
+                                 first_transpose_perm_values)) {
+        if (tail == node) {
+          // Bypass adjacent pair.
+          *simplified_node_name = first_transpose->input(0);
+        } else {
+          // Bypass pair connected through chain.
+          tail->set_input(0, first_transpose->input(0));
+          ctx().node_map->UpdateInput(tail->name(), first_transpose->name(),
+                                      first_transpose->input(0));
+          ForwardControlDependencies(tail, {first_transpose});
+          *simplified_node_name = node->input(0);
+        }
       }
     } else {
       // Remove simple identity transposes.
@@ -1231,6 +1260,8 @@ class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
     }
     return true;
   }
+
+  RewriterConfig::Toggle opt_level_;
 };
 
 // Remove redundant Bitcasts.
@@ -1349,6 +1380,47 @@ class RemoveNegationStage : public ArithmeticOptimizerStage {
   }
 };
 
+class RemoveLogicalNotStage : public ArithmeticOptimizerStage {
+ public:
+  explicit RemoveLogicalNotStage(const GraphOptimizerContext& ctx,
+                                 const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveLogicalNot", ctx, ctx_ext) {}
+  ~RemoveLogicalNotStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsLogicalNot(*node) && !IsInPreserveSet(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    const string node_name = node->name();
+    NodeDef* input;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input));
+    if (IsInPreserveSet(*input) ||
+        NumNonControlOutputs(*input, *ctx().node_map) > 1) {
+      return Status::OK();
+    }
+    string new_op;
+    if (IsEqual(*input)) {
+      new_op = "NotEqual";
+    } else if (IsNotEqual(*input)) {
+      new_op = "Equal";
+    } else if (IsLess(*input)) {
+      new_op = "GreaterEqual";
+    } else if (IsLessEqual(*input)) {
+      new_op = "Greater";
+    } else if (IsGreater(*input)) {
+      new_op = "LessEqual";
+    } else if (IsGreaterEqual(*input)) {
+      new_op = "Less";
+    }
+    if (!new_op.empty()) {
+      input->set_op(new_op);
+      *simplified_node_name = input->name();
+    }
+    return Status::OK();
+  }
+};
+
 // This optimization hoists the common prefix of unary ops of the inputs to
 // concat out of the concat, for example:
 //    Concat([Exp(Sin(x)), Exp(Sin(y)), Exp(Sin(z))])
@@ -1752,7 +1824,7 @@ class SqrtDivToRsqrtMulStage : public ArithmeticOptimizerStage {
 class UniqueNodes {
  public:
   NodeDef* FindOrAddRepresentative(NodeDef* node) {
-    std::size_t sig = ComputeSignature(*node);
+    uint64 sig = ComputeSignature(*node);
     std::vector<NodeDef*>& candidates = rep_[sig];
     for (auto& candidate : candidates) {
       if (SameNode(*candidate, *node)) {
@@ -1764,26 +1836,25 @@ class UniqueNodes {
   }
 
  private:
-  std::size_t ComputeSignature(const NodeDef& node) const;
+  uint64 ComputeSignature(const NodeDef& node) const;
   bool SameNode(const NodeDef& node1, const NodeDef& node2) const;
 
-  std::unordered_map<std::size_t, std::vector<NodeDef*>> rep_;
+  std::unordered_map<uint64, std::vector<NodeDef*>> rep_;
 };
 
-std::size_t UniqueNodes::ComputeSignature(const NodeDef& node) const {
-  std::size_t h = std::hash<string>{}(node.op());
-  h ^= std::hash<string>{}(node.device());
+uint64 UniqueNodes::ComputeSignature(const NodeDef& node) const {
+  uint64 h = Hash64(node.op());
+  h = Hash64Combine(Hash64(node.device()), h);
+
   for (const auto& input : node.input()) {
     int pos;
     string node_name = ParseNodeName(input, &pos);
-    h ^= std::hash<string>{}(node_name);
-    h ^= static_cast<std::size_t>(pos);
+    h = Hash64CombineUnordered(Hash64(node_name), h);
+    h = Hash64CombineUnordered(std::hash<int>()(pos), h);
   }
   for (const auto& attr : node.attr()) {
-    h ^= std::hash<string>{}(attr.first);
-    string tmp;
-    attr.second.AppendToString(&tmp);
-    h ^= std::hash<string>{}(tmp);
+    h = Hash64CombineUnordered(Hash64(attr.first), h);
+    h = Hash64CombineUnordered(FastAttrValueHash(attr.second), h);
   }
   return h;
 }
@@ -1839,17 +1910,8 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   }
   for (const auto& attr1 : node1.attr()) {
     auto it = node2.attr().find(attr1.first);
-    if (it == node2.attr().end()) {
-      return false;
-    }
-    const auto& attr2 = *it;
-    string val1;
-    attr1.second.AppendToString(&val1);
-    string val2;
-    attr2.second.AppendToString(&val2);
-    if (val1 != val2) {
-      return false;
-    }
+    if (it == node2.attr().end()) return false;
+    if (!FastAreAttrValuesEqual(attr1.second, it->second)) return false;
   }
 
   return true;
@@ -2233,6 +2295,9 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         new_square_node->set_input(i - 1, new_square_node->input(i));
       }
       new_square_node->mutable_input()->RemoveLast();
+      for (const string& input : new_square_node->input()) {
+        node_map_->AddOutput(NodeName(input), new_square_node->name());
+      }
       return new_square_node->name();
     }
   }
@@ -2398,13 +2463,15 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
   if (options_.minimize_broadcasts && can_use_shapes)
     pipeline.AddStage<MinimizeBroadcasts>(ctx, ctx_ext);
   if (options_.remove_identity_transpose && can_use_shapes)
-    pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext);
+    pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext, opt_level_);
   if (options_.remove_redundant_bitcast)
     pipeline.AddStage<RemoveRedundantBitcastStage>(ctx, ctx_ext);
   if (options_.remove_redundant_cast)
     pipeline.AddStage<RemoveRedundantCastStage>(ctx, ctx_ext);
   if (options_.remove_negation)
     pipeline.AddStage<RemoveNegationStage>(ctx, ctx_ext);
+  if (options_.remove_logical_not)
+    pipeline.AddStage<RemoveLogicalNotStage>(ctx, ctx_ext);
   if (options_.hoist_cwise_unary_chains)
     pipeline.AddStage<HoistCWiseUnaryChainsStage>(ctx, ctx_ext);
   if (options_.convert_sqrt_div_to_rsqrt_mul)
@@ -2491,7 +2558,8 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
   TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph_));
 
   graph_properties_.reset(new GraphProperties(optimized_item));
-  const Status status = graph_properties_->InferStatically(false);
+  const bool assume_valid_feeds = opt_level_ == RewriterConfig::AGGRESSIVE;
+  const Status status = graph_properties_->InferStatically(assume_valid_feeds);
   const bool can_use_shapes = status.ok();
   if (!can_use_shapes) {
     VLOG(1) << "Shape inference failed." << status.error_message();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 3f9feac55f62f010985294a59ec47c889f9b70bb..8e1b3eda3b8f18b9c4403ac857c637fccaf220e5 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -65,9 +65,10 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
     bool remove_negation = true;
-    bool hoist_cwise_unary_chains = true;
+    bool hoist_cwise_unary_chains = false;
     bool convert_sqrt_div_to_rsqrt_mul = false;
     bool remove_idempotent = true;
+    bool remove_logical_not = true;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index e109e66633167569ce91fd40e824952c5da6b118..64fdc8a83b1ad104d14d2998dfb3c23df05b0ece 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -177,6 +177,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     DisableAllStages(optimizer);
     optimizer->options_.remove_idempotent = true;
   }
+
+  void EnableOnlyRemoveLogicalNot(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_logical_not = true;
+  }
 };
 
 TEST_F(ArithmeticOptimizerTest, NoOp) {
@@ -696,6 +701,9 @@ TEST_F(ArithmeticOptimizerTest, HoistFactorDiv) {
         item.fetch = {"id"};
         TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+        auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+        EXPECT_EQ(1, tensors_expected.size());
+
         ArithmeticOptimizer optimizer;
         EnableOnlyHoistCommonFactor(&optimizer);
 
@@ -734,6 +742,13 @@ TEST_F(ArithmeticOptimizerTest, HoistFactorDiv) {
           EXPECT_EQ("id", id_node->name());
           EXPECT_EQ(HoistDivName("add"), id_node->input(0));
         }
+        auto tensors = EvaluateNodes(output, item.fetch);
+        EXPECT_EQ(1, tensors.size());
+        if (use_ints) {
+          test::ExpectTensorEqual<int32>(tensors_expected[0], tensors[0]);
+        } else {
+          test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+        }
       }
     }
   }
@@ -954,6 +969,67 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
+TEST_F(ArithmeticOptimizerTest, NotAssumeValidFeeds) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output inputs =
+      ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({4, 3, 28, 28}));
+  Output target_shape = ops::Const(s, {4, 3, 28, 28}, {4});
+  Output reshape = ops::Reshape(s, inputs, target_shape);
+  Output outputs = ops::Identity(s.WithOpName("outputs"), reshape);
+
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 3, 28, 28}));
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  item.feed = {{"Placeholder", x_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
+
+  item.graph.Swap(&output);
+  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+
+  // The reshape is preserved because the shape of the placeholder can be
+  // different from the shape of the actual feed.
+  EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
+
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(ArithmeticOptimizerTest, AssumeValidFeedsInAggressiveMode) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output inputs =
+      ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({4, 3, 28, 28}));
+  Output target_shape = ops::Const(s, {4, 3, 28, 28}, {4});
+  Output reshape = ops::Reshape(s, inputs, target_shape);
+  Output outputs = ops::Identity(s.WithOpName("outputs"), reshape);
+
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 3, 28, 28}));
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  item.feed = {{"Placeholder", x_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  GraphDef output;
+  TF_EXPECT_OK(ArithmeticOptimizer(RewriterConfig::AGGRESSIVE)
+                   .Optimize(nullptr, item, &output));
+
+  item.graph.Swap(&output);
+  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+
+  EXPECT_EQ(0, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
 TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   // Reshape from [-1,3,28,28] to [8,-1,28,28] is not identity, because it can
   // be from [4,3,28,28] to [8,6,28,28].
@@ -1112,7 +1188,7 @@ TEST_F(ArithmeticOptimizerTest, RemoveIdentityTransposes) {
       ops::RandomUniform(s.WithOpName("inputs"), inputs_shape, DT_FLOAT);
   Output perm1 = ops::Const(s.WithOpName("perm1"), {0, 2, 3, 1}, {4});
   Output perm2 = ops::Const(s.WithOpName("perm2"), {0, 3, 1, 2}, {4});
-  Output perm3 = ops::Const(s.WithOpName("perm2"), {0, 1, 2, 3}, {4});
+  Output perm3 = ops::Const(s.WithOpName("perm3"), {0, 1, 2, 3}, {4});
   Output transpose1 = ops::Transpose(s.WithOpName("transpose1"), inputs, perm1);
   Output transpose2 =
       ops::Transpose(s.WithOpName("transpose2"), transpose1, perm2);
@@ -1156,6 +1232,11 @@ TEST_F(ArithmeticOptimizerTest, RemoveIdentityTransposesMultipleOutputs) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({8, 12, 28, 28}));
+  item.feed = {{"inputs", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyRemoveIdentityTranspose(&optimizer);
@@ -1168,6 +1249,10 @@ TEST_F(ArithmeticOptimizerTest, RemoveIdentityTransposesMultipleOutputs) {
       EXPECT_EQ(node.input(2), "Split:2");
     }
   }
+
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, RemoveTransposesWithControlDependency) {
@@ -1184,6 +1269,11 @@ TEST_F(ArithmeticOptimizerTest, RemoveTransposesWithControlDependency) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 3}));
+  item.feed = {{"Placeholder", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyRemoveIdentityTranspose(&optimizer);
@@ -1194,6 +1284,10 @@ TEST_F(ArithmeticOptimizerTest, RemoveTransposesWithControlDependency) {
   EXPECT_EQ(2, outputs_node->input_size());
   EXPECT_EQ(outputs_node->input(0), "outputs_const");
   EXPECT_EQ(outputs_node->input(1), "^Placeholder");
+
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, NotRemoveTransposes) {
@@ -1220,6 +1314,47 @@ TEST_F(ArithmeticOptimizerTest, NotRemoveTransposes) {
   EXPECT_EQ(6, output.node_size());
 }
 
+TEST_F(ArithmeticOptimizerTest, RemoveIdentityTransposesThroughChain) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output inputs_shape =
+      ops::Const(s.WithOpName("inputs_shape"), {8, 3, 28, 28}, {4});
+  Output inputs =
+      ops::RandomUniform(s.WithOpName("inputs"), inputs_shape, DT_FLOAT);
+  Output perm1 = ops::Const(s.WithOpName("perm1"), {0, 2, 3, 1}, {4});
+  Output perm2 = ops::Const(s.WithOpName("perm2"), {0, 3, 1, 2}, {4});
+  Output transpose1 = ops::Transpose(
+      s.WithOpName("transpose1").WithControlDependencies(perm2), inputs, perm1);
+  Output identity = ops::Identity(s.WithOpName("id"), transpose1);
+  Output transpose2 =
+      ops::Transpose(s.WithOpName("transpose2"), identity, perm2);
+  Output id1 = ops::Identity(s.WithOpName("id1"), transpose2);
+
+  GrapplerItem item;
+  item.fetch = {"id1"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveIdentityTranspose(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  std::set<string> nodes_after_optimization;
+  for (const NodeDef& node : output.node()) {
+    nodes_after_optimization.insert(node.name());
+    if (node.name() == "id") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("inputs", node.input(0));
+      EXPECT_EQ("^perm2", node.input(1));
+    }
+    if (node.name() == "id1") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("id", node.input(0));
+    }
+  }
+  EXPECT_EQ(nodes_after_optimization,
+            std::set<string>({"id", "id1", "inputs_shape", "inputs", "perm2"}));
+}
+
 TEST_F(ArithmeticOptimizerTest, FoldMulToTransposeConv) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs = ops::Placeholder(s.WithOpName("inputs"), DT_FLOAT,
@@ -1440,6 +1575,11 @@ TEST_F(ArithmeticOptimizerTest, CombineBitcasts) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto x_t = GenerateRandomTensor<DT_UINT8>(TensorShape({2, 3}));
+  item.feed = {{"inputs", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyRemoveRedundantBitcast(&optimizer);
@@ -1451,6 +1591,10 @@ TEST_F(ArithmeticOptimizerTest, CombineBitcasts) {
   EXPECT_EQ(3, output.node_size());
   EXPECT_EQ(1, CountOpNodes(output, "Bitcast"));
   EXPECT_TRUE(IsNodesDirectlyConnected(node_map, "inputs", "bc2"));
+
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int8>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, CombineAndRemoveBitcasts) {
@@ -1465,6 +1609,11 @@ TEST_F(ArithmeticOptimizerTest, CombineAndRemoveBitcasts) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto x_t = GenerateRandomTensor<DT_INT8>(TensorShape({2, 3}));
+  item.feed = {{"inputs", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyRemoveRedundantBitcast(&optimizer);
@@ -1476,6 +1625,10 @@ TEST_F(ArithmeticOptimizerTest, CombineAndRemoveBitcasts) {
   EXPECT_EQ(2, output.node_size());
   EXPECT_EQ(0, CountOpNodes(output, "Bitcast"));
   EXPECT_TRUE(IsNodesDirectlyConnected(node_map, "inputs", "outputs"));
+
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int8>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, RemoveRedundantCast) {
@@ -1489,6 +1642,11 @@ TEST_F(ArithmeticOptimizerTest, RemoveRedundantCast) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto x_t = GenerateRandomTensor<DT_INT8>(TensorShape({2, 3}));
+  item.feed = {{"inputs", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyRemoveRedundantCast(&optimizer);
@@ -1500,6 +1658,10 @@ TEST_F(ArithmeticOptimizerTest, RemoveRedundantCast) {
   EXPECT_EQ(2, output.node_size());
   EXPECT_EQ(0, CountOpNodes(output, "Cast"));
   EXPECT_TRUE(IsNodesDirectlyConnected(node_map, "inputs", "outputs"));
+
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int8>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfIdenticalShape) {
@@ -1519,6 +1681,14 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfIdenticalShape) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto b_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto c_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  std::vector<std::pair<string, Tensor>> feed = {
+      {"a", a_t}, {"b", b_t}, {"c", c_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyAddToAddNCombining(&optimizer);
@@ -1552,6 +1722,10 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfIdenticalShape) {
   ASSERT_NE(updated_outputs, nullptr);
 
   EXPECT_EQ(collapsed_add->name(), updated_outputs->input(0));
+
+  auto tensors = EvaluateNodes(output, item.fetch, feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MultiplePasses) {
@@ -1576,6 +1750,17 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MultiplePasses) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto b_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto c_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto y_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto z_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  std::vector<std::pair<string, Tensor>> feed = {
+      {"a", a_t}, {"b", b_t}, {"c", c_t}, {"x", x_t}, {"y", y_t}, {"z", z_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyAddToAddNCombining(&optimizer);
@@ -1625,6 +1810,10 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MultiplePasses) {
   EXPECT_EQ(2, updated_mul->input_size());
   EXPECT_EQ(collapsed_left->name(), updated_mul->input(0));
   EXPECT_EQ(collapsed_right->name(), updated_mul->input(1));
+
+  auto tensors = EvaluateNodes(output, item.fetch, feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddInputMultipleTimes) {
@@ -1642,6 +1831,14 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddInputMultipleTimes) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto b_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto c_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  std::vector<std::pair<string, Tensor>> feed = {
+      {"a", a_t}, {"b", b_t}, {"c", c_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyAddToAddNCombining(&optimizer);
@@ -1670,6 +1867,10 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddInputMultipleTimes) {
   EXPECT_EQ("b", collapsed_add->input(1));
   EXPECT_EQ("b", collapsed_add->input(2));
   EXPECT_EQ("c", collapsed_add->input(3));
+
+  auto tensors = EvaluateNodes(output, item.fetch, feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfSymbolicallyEqualShape) {
@@ -1693,6 +1894,11 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfSymbolicallyEqualShape) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  std::vector<std::pair<string, Tensor>> feed = {{"input", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyAddToAddNCombining(&optimizer);
@@ -1724,6 +1930,10 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfSymbolicallyEqualShape) {
   const NodeDef* updated_outputs = node_map.GetNode("outputs");
   ASSERT_NE(updated_outputs, nullptr);
   EXPECT_EQ(collapsed_add->name(), updated_outputs->input(0));
+
+  auto tensors = EvaluateNodes(output, item.fetch, feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCast) {
@@ -1748,6 +1958,17 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCast) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32}));
+  auto b_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32, 32}));
+  auto c_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32, 32, 32}));
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32}));
+  auto y_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32, 32}));
+  auto z_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32, 32, 32}));
+  std::vector<std::pair<string, Tensor>> feed = {
+      {"a", a_t}, {"b", b_t}, {"c", c_t}, {"x", x_t}, {"y", y_t}, {"z", z_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyAddToAddNCombining(&optimizer);
@@ -1820,18 +2041,22 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCast) {
   const NodeDef* updated_outputs = node_map.GetNode("outputs");
   ASSERT_NE(updated_outputs, nullptr);
   EXPECT_EQ(outer_add_name, updated_outputs->input(0));
+
+  auto tensors = EvaluateNodes(output, item.fetch, feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCastWithSymbolicShapes) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
   // We have a small input with one unknown dimension
-  auto small = ops::Variable(s.WithOpName("small"), {-1, 1, 1}, DT_FLOAT);
+  auto small = ops::Variable(s.WithOpName("small"), {-1, 1, 1}, DT_DOUBLE);
 
   // And second input which is larger, but has the same unknown dimension
   // device spec prevents this node from rewriting
-  auto d = "/job:do_not_rewrite_me";
-  auto v = ops::Variable(s.WithOpName("v"), {1, 32, 32}, DT_FLOAT);
+  auto d = "/device:CPU:0";
+  auto v = ops::Variable(s.WithOpName("v"), {1, 32, 32}, DT_DOUBLE);
   auto large = ops::Add(s.WithOpName("large").WithDevice(d), small, v);
 
   // [a, c] have {?, 1, 1} shape, [b] has {?, 32, 32}
@@ -1849,6 +2074,12 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCastWithSymbolicShapes) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto s_t = GenerateRandomTensor<DT_DOUBLE>(TensorShape({8, 1, 1}));
+  auto v_t = GenerateRandomTensor<DT_DOUBLE>(TensorShape({1, 32, 32}));
+  std::vector<std::pair<string, Tensor>> feed = {{"small", s_t}, {"v", v_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyAddToAddNCombining(&optimizer);
@@ -1887,6 +2118,10 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCastWithSymbolicShapes) {
   const NodeDef* updated_outputs = node_map.GetNode("outputs");
   ASSERT_NE(updated_outputs, nullptr);
   EXPECT_EQ(outer_add_name, updated_outputs->input(0));
+
+  auto tensors = EvaluateNodes(output, item.fetch, feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<double>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, RemoveNegation) {
@@ -1911,6 +2146,12 @@ TEST_F(ArithmeticOptimizerTest, RemoveNegation) {
   item.fetch = {"add_all"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto y_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  std::vector<std::pair<string, Tensor>> feed = {{"x", x_t}, {"y", y_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyRemoveNegation(&optimizer);
@@ -1959,6 +2200,10 @@ TEST_F(ArithmeticOptimizerTest, RemoveNegation) {
     }
   }
   EXPECT_EQ(5, found);
+
+  auto tensors = EvaluateNodes(output, item.fetch, feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, ConvertSqrtDivToRsqrtMul) {
@@ -2014,6 +2259,14 @@ TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_SimpleSwap) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32}));
+  auto b_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32, 32}));
+  auto c_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32}));
+  std::vector<std::pair<string, Tensor>> feed = {
+      {"a", a_t}, {"b", b_t}, {"c", c_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyMinimizeBroadcasts(&optimizer);
@@ -2038,16 +2291,20 @@ TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_SimpleSwap) {
   ASSERT_NE(mul2_node, nullptr);
   EXPECT_EQ("mul1", mul2_node->input(0));
   EXPECT_EQ("b", mul2_node->input(1));
+
+  auto tensors = EvaluateNodes(output, item.fetch, feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_FlattenTallGraph) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
-  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
-  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
-  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
-  auto d = ops::Variable(s.WithOpName("d"), {32}, DT_FLOAT);
-  auto e = ops::Variable(s.WithOpName("e"), {32}, DT_FLOAT);
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_DOUBLE);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_DOUBLE);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_DOUBLE);
+  auto d = ops::Variable(s.WithOpName("d"), {32}, DT_DOUBLE);
+  auto e = ops::Variable(s.WithOpName("e"), {32}, DT_DOUBLE);
 
   auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
   auto mul2 = ops::Mul(s.WithOpName("mul2"), mul1, c);
@@ -2060,6 +2317,16 @@ TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_FlattenTallGraph) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto a_t = GenerateRandomTensor<DT_DOUBLE>(TensorShape({32}));
+  auto b_t = GenerateRandomTensor<DT_DOUBLE>(TensorShape({32, 32}));
+  auto c_t = GenerateRandomTensor<DT_DOUBLE>(TensorShape({32}));
+  auto d_t = GenerateRandomTensor<DT_DOUBLE>(TensorShape({32}));
+  auto e_t = GenerateRandomTensor<DT_DOUBLE>(TensorShape({32}));
+  std::vector<std::pair<string, Tensor>> feed = {
+      {"a", a_t}, {"b", b_t}, {"c", c_t}, {"d", d_t}, {"e", e_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyMinimizeBroadcasts(&optimizer);
@@ -2099,6 +2366,10 @@ TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_FlattenTallGraph) {
   ASSERT_NE(mul4_node, nullptr);
   EXPECT_EQ("mul3", mul4_node->input(0));
   EXPECT_EQ("b", mul4_node->input(1));
+
+  auto tensors = EvaluateNodes(output, item.fetch, feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<double>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_BuildTreeUp) {
@@ -2120,6 +2391,15 @@ TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_BuildTreeUp) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32}));
+  auto b_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32}));
+  auto c_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32}));
+  auto d_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({32, 32}));
+  std::vector<std::pair<string, Tensor>> feed = {
+      {"a", a_t}, {"b", b_t}, {"c", c_t}, {"D", d_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
   ArithmeticOptimizer optimizer;
   EnableOnlyMinimizeBroadcasts(&optimizer);
@@ -2151,6 +2431,10 @@ TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_BuildTreeUp) {
   ASSERT_NE(mul3_node, nullptr);
   EXPECT_EQ("D", mul3_node->input(0));
   EXPECT_EQ("mul1", mul3_node->input(1));
+
+  auto tensors = EvaluateNodes(output, item.fetch, feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, HoistCWiseUnaryFromConcat) {
@@ -2458,5 +2742,103 @@ TEST_F(ArithmeticOptimizerTest, RemoveIdempotent) {
   }
 }
 
+TEST_F(ArithmeticOptimizerTest, RemoveLogicalNot) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 3.14f, {32});
+  Output b = ops::Const(s.WithOpName("b"), -3.14f, {32});
+  Output eq = ops::Equal(s.WithOpName("eq"), a, b);
+  Output neq = ops::NotEqual(s.WithOpName("neq"), a, b);
+  Output lt = ops::Less(s.WithOpName("lt"), a, b);
+  Output le = ops::LessEqual(s.WithOpName("le"), a, b);
+  Output gt = ops::Greater(s.WithOpName("gt"), a, b);
+  Output ge = ops::GreaterEqual(s.WithOpName("ge"), a, b);
+  // not_eq is reserved
+  Output not_eq1 = ops::LogicalNot(s.WithOpName("not_eq1"), eq);
+  Output not_neq = ops::LogicalNot(s.WithOpName("not_neq"), neq);
+  Output not_lt = ops::LogicalNot(s.WithOpName("not_lt"), lt);
+  Output not_le = ops::LogicalNot(s.WithOpName("not_le"), le);
+  Output not_gt = ops::LogicalNot(s.WithOpName("not_gt"), gt);
+  Output not_ge = ops::LogicalNot(s.WithOpName("not_ge"), ge);
+  Output id_not_eq = ops::Identity(s.WithOpName("id_not_eq"), not_eq1);
+  Output id_not_neq = ops::Identity(s.WithOpName("id_not_neq"), not_neq);
+  Output id_not_lt = ops::Identity(s.WithOpName("id_not_lt"), not_lt);
+  Output id_not_le = ops::Identity(s.WithOpName("id_not_le"), not_le);
+  Output id_not_gt = ops::Identity(s.WithOpName("id_not_gt"), not_gt);
+  Output id_not_ge = ops::Identity(s.WithOpName("id_not_ge"), not_ge);
+
+  GrapplerItem item;
+  item.fetch = {"id_not_eq", "id_not_neq", "id_not_lt",
+                "id_not_le", "id_not_gt",  "id_not_ge"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveLogicalNot(&optimizer);
+  OptimizeTwice(&optimizer, &item, &output);
+  LOG(INFO) << output.DebugString();
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "id_not_eq") {
+      EXPECT_EQ("eq", node.input(0));
+      ++found;
+    }
+    if (node.name() == "id_not_neq") {
+      EXPECT_EQ("neq", node.input(0));
+      ++found;
+    }
+    if (node.name() == "id_not_lt") {
+      EXPECT_EQ("lt", node.input(0));
+      ++found;
+    }
+    if (node.name() == "id_not_le") {
+      EXPECT_EQ("le", node.input(0));
+      ++found;
+    }
+    if (node.name() == "id_not_gt") {
+      EXPECT_EQ("gt", node.input(0));
+      ++found;
+    }
+    if (node.name() == "id_not_ge") {
+      EXPECT_EQ("ge", node.input(0));
+      ++found;
+    }
+
+    if (node.name() == "eq") {
+      EXPECT_EQ("NotEqual", node.op());
+      ++found;
+    }
+    if (node.name() == "neq") {
+      EXPECT_EQ("Equal", node.op());
+      ++found;
+    }
+    if (node.name() == "lt") {
+      EXPECT_EQ("GreaterEqual", node.op());
+      ++found;
+    }
+    if (node.name() == "le") {
+      EXPECT_EQ("Greater", node.op());
+      ++found;
+    }
+    if (node.name() == "gt") {
+      EXPECT_EQ("LessEqual", node.op());
+      ++found;
+    }
+    if (node.name() == "ge") {
+      EXPECT_EQ("Less", node.op());
+      ++found;
+    }
+  }
+  EXPECT_EQ(12, found);
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(tensors.size(), tensors_expected.size());
+  EXPECT_EQ(tensors.size(), item.fetch.size());
+  for (int i = 0; i < item.fetch.size(); ++i) {
+    test::ExpectTensorEqual<bool>(tensors_expected[i], tensors[i]);
+  }
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 47d882768634c478989d98586bfbf6746c41ed85..8bdb164b0371c0326b268d5649565d3b51d25645 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
+
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -30,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -1329,22 +1331,56 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
 // Returns true iff this reduction can be reduced to an identity (i.e if the set
 // of dimensions to reduce along is empty). This happens often in the gradient
 // graphs.
-bool ConstantFolding::IsSimplifiableReduction(const NodeDef& node) const {
+bool ConstantFolding::IsSimplifiableReduction(
+    const NodeDef& node, const GraphProperties& properties) const {
   if (IsReduction(node)) {
     CHECK_LE(2, node.input_size());
     const NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
     if (IsReallyConstant(*reductions_indices)) {
       TensorVector output;
+      auto outputs_cleanup = gtl::MakeCleanup([&output] {
+        for (const auto& out : output) {
+          delete out.tensor;
+        }
+      });
       Status s = EvaluateNode(*reductions_indices, TensorVector(), &output);
       if (!s.ok()) {
         return false;
       }
       CHECK_EQ(1, output.size());
       int output_size = output[0]->NumElements();
-      delete output[0].tensor;
       if (output_size == 0) {
         return true;
       }
+      if (node.attr().count("keep_dims") > 0 &&
+          node.attr().at("keep_dims").b()) {
+        const auto& props = properties.GetInputProperties(node.name());
+        if (!props.empty()) {
+          const TensorShapeProto& input_shape = props[0].shape();
+          if (!input_shape.unknown_rank()) {
+            bool simplifiable = true;
+            for (int i = 0; i < output[0]->NumElements(); ++i) {
+              int64 dim;
+              if (output[0]->dtype() == DT_INT32) {
+                dim = output[0]->flat<int32>()(i);
+              } else {
+                dim = output[0]->flat<int64>()(i);
+              }
+              if (dim < 0) {
+                dim += input_shape.dim_size();
+              }
+              if (dim < 0 || dim >= input_shape.dim_size() ||
+                  input_shape.dim(dim).size() != 1) {
+                simplifiable = false;
+                break;
+              }
+            }
+            if (simplifiable) {
+              return true;
+            }
+          }
+        }
+      }
     }
   }
   return false;
@@ -1514,6 +1550,13 @@ void ConstantFolding::ReplaceOperationWithIdentity(
 void ConstantFolding::ReplaceOperationWithSnapshot(
     int input_to_forward, const GraphProperties& properties, NodeDef* node,
     GraphDef* graph) {
+  // If the graph contains no ops that mutate their inputs, we can
+  // use Identity insted of Snapshot.
+  if (!graph_contains_assign_or_inplace_op_) {
+    ReplaceOperationWithIdentity(input_to_forward, properties, node, graph);
+    return;
+  }
+
   const DataType dtype = GetDataTypeFromNodeOrProps(*node, properties);
   if (dtype == DT_INVALID) return;
 
@@ -1559,9 +1602,13 @@ void ConstantFolding::ReplaceSubtractionFromZeroByNegation(NodeDef* node,
 
 Status ConstantFolding::ReplaceOperationWithConstant(
     double value, const GraphProperties& properties,
-    const TensorShapeProto& shape, NodeDef* node, GraphDef* graph) {
+    const TensorShapeProto& shape, NodeDef* node, GraphDef* graph,
+    bool* success) {
   const DataType dtype = GetDataTypeFromNodeOrProps(*node, properties);
-  if (dtype == DT_INVALID) return Status::OK();
+  if (dtype == DT_INVALID) {
+    *success = false;
+    return Status::OK();
+  }
 
   AttrValue tensor_attr;
   TF_RETURN_IF_ERROR(
@@ -1580,905 +1627,1198 @@ Status ConstantFolding::ReplaceOperationWithConstant(
     node_map_->UpdateInput(node->name(), node->input(i), ctrl_dep);
     node->set_input(i, ctrl_dep);
   }
-  graph_modified_ = true;
+  *success = true;
   return Status::OK();
 }
 
 Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
                                       GraphProperties* properties,
                                       bool use_shape_info) {
-  const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
   for (int i = 0; i < optimized_graph->node_size(); ++i) {
-    NodeDef* node = optimized_graph->mutable_node(i);
+    TF_RETURN_IF_ERROR(SimplifyNode(optimized_graph->mutable_node(i),
+                                    optimized_graph, properties,
+                                    use_shape_info));
+  }
+  return Status::OK();
+}
 
-    if (IsSplit(*node) && node->attr().at("num_split").i() == 1) {
-      ReplaceOperationWithIdentity(1, *properties, node, optimized_graph);
-      continue;
+Status ConstantFolding::SimplifyNode(NodeDef* node, GraphDef* optimized_graph,
+                                     GraphProperties* properties,
+                                     bool use_shape_info) {
+  if (IsSplit(*node) && node->attr().at("num_split").i() == 1) {
+    ReplaceOperationWithIdentity(1, *properties, node, optimized_graph);
+    return Status::OK();
+  }
+
+  if (IsSplitV(*node) && node->attr().at("num_split").i() == 1) {
+    ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
+    return Status::OK();
+  }
+
+  // Remove Shuffle or Transpose op over dimensions of size 1.
+  if (use_shape_info && (IsShuffle(*node) || IsTranspose(*node)) &&
+      properties->GetInputProperties(node->name()).size() >= 2) {
+    const auto& shape = properties->GetInputProperties(node->name())[0].shape();
+    if (shape.unknown_rank()) {
+      // Not optimizable.
+      return Status::OK();
     }
+    const auto& p = properties->GetInputProperties(node->name())[1];
+    if (TensorShape::IsValid(p.shape()) && p.has_value()) {
+      Tensor perm(p.dtype(), p.shape());
+      if (!perm.FromProto(p.value())) {
+        return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                       p.value().DebugString());
+      }
+      std::vector<int> permutation;
+      for (int j = 0; j < perm.NumElements(); ++j) {
+        if (perm.dtype() == DT_INT64) {
+          permutation.push_back(perm.vec<int64>()(j));
+        } else {
+          permutation.push_back(perm.vec<int>()(j));
+        }
+      }
+      if (permutation.size() != shape.dim_size()) {
+        // Number of elements in perm should be same as dim_size. Skip if not.
+        return Status::OK();
+      }
+      // The node is replaceable iff
+      // dim_size == 0 || all dims have size 1 ||
+      // all dims with > 1 size are not permuted.
+      bool replaceable = true;
+      for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
+        replaceable &= shape.dim(j).size() == 1 || j == permutation[j];
+      }
+      if (replaceable) {
+        ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
+        return Status::OK();
+      }
+    }
+  }
 
-    if (IsSplitV(*node) && node->attr().at("num_split").i() == 1) {
+  // Remove RandomShuffle op if it is scalar or first dimension is of size 1.
+  if (use_shape_info && IsRandomShuffle(*node) &&
+      !properties->GetInputProperties(node->name()).empty()) {
+    const auto& shape = properties->GetInputProperties(node->name())[0].shape();
+    // The node is replaceable iff
+    // unknown_rank == false && (dim_size == 0 || first dim is of size 1)
+    if (!shape.unknown_rank() &&
+        (shape.dim_size() == 0 || shape.dim(0).size() == 1)) {
       ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
-      continue;
+      return Status::OK();
     }
+  }
 
-    // Remove Shuffle or Transpose op over dimensions of size 1.
-    if (use_shape_info && (IsShuffle(*node) || IsTranspose(*node)) &&
-        properties->GetInputProperties(node->name()).size() >= 2) {
-      const auto& shape =
-          properties->GetInputProperties(node->name())[0].shape();
-      if (shape.unknown_rank()) {
-        // Not optimizable.
-        continue;
-      }
-      const auto& p = properties->GetInputProperties(node->name())[1];
-      if (TensorShape::IsValid(p.shape()) && p.has_value()) {
-        Tensor perm(p.dtype(), p.shape());
-        if (!perm.FromProto(p.value())) {
-          return errors::InvalidArgument("Cannot parse tensor from proto: ",
-                                         p.value().DebugString());
-        }
-        std::vector<int> permutation;
-        for (int j = 0; j < perm.NumElements(); ++j) {
-          if (perm.dtype() == DT_INT64) {
-            permutation.push_back(perm.vec<int64>()(j));
-          } else {
-            permutation.push_back(perm.vec<int>()(j));
-          }
-        }
-        if (permutation.size() != shape.dim_size()) {
-          // Number of elements in perm should be same as dim_size. Skip if not.
-          continue;
-        }
-        // The node is replaceable iff
-        // dim_size == 0 || all dims have size 1 ||
-        // all dims with > 1 size are not permuted.
-        bool replaceable = true;
-        for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
-          replaceable &= shape.dim(j).size() == 1 || j == permutation[j];
-        }
-        if (replaceable) {
-          ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
-          continue;
+  // Remove Reverse op over dimensions with size 1.
+  if (use_shape_info && node->op() == "ReverseV2" &&
+      properties->GetInputProperties(node->name()).size() >= 2) {
+    const auto& shape = properties->GetInputProperties(node->name())[0].shape();
+    if (shape.unknown_rank()) {
+      // Not optimizable.
+      return Status::OK();
+    }
+    const auto& a = properties->GetInputProperties(node->name())[1];
+    if (TensorShape::IsValid(a.shape()) && a.has_value()) {
+      Tensor axis(a.dtype(), a.shape());
+      if (!axis.FromProto(a.value())) {
+        return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                       a.value().DebugString());
+      }
+      std::set<int> target_axes;
+      for (int j = 0; j < axis.NumElements(); ++j) {
+        // value of axis can be negative.
+        if (axis.dtype() == DT_INT64) {
+          target_axes.insert((axis.vec<int64>()(j) + shape.dim_size()) %
+                             shape.dim_size());
+        } else {
+          target_axes.insert((axis.vec<int>()(j) + shape.dim_size()) %
+                             shape.dim_size());
         }
       }
-    }
 
-    // Remove RandomShuffle op if it is scalar or first dimension is of size 1.
-    if (use_shape_info && IsRandomShuffle(*node) &&
-        !properties->GetInputProperties(node->name()).empty()) {
-      const auto& shape =
-          properties->GetInputProperties(node->name())[0].shape();
       // The node is replaceable iff
-      // unknown_rank == false && (dim_size == 0 || first dim is of size 1)
-      if (!shape.unknown_rank() &&
-          (shape.dim_size() == 0 || shape.dim(0).size() == 1)) {
+      // unknown_rank == false &&
+      // (dim_size == 0 || all dims have size 1 ||
+      //  all dims with > 1 size are not in target_axes)
+      bool replaceable = !shape.unknown_rank();
+      for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
+        replaceable &= shape.dim(j).size() == 1 ||
+                       target_axes.find(j) == target_axes.end();
+      }
+      if (replaceable) {
         ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
-        continue;
+        return Status::OK();
       }
     }
+  }
 
-    // Remove Reverse op over dimensions with size 1.
-    if (use_shape_info && node->op() == "ReverseV2" &&
-        properties->GetInputProperties(node->name()).size() >= 2) {
-      const auto& shape =
-          properties->GetInputProperties(node->name())[0].shape();
-      if (shape.unknown_rank()) {
-        // Not optimizable.
-        continue;
+  if (use_shape_info && IsSlice(*node) &&
+      properties->GetInputProperties(node->name()).size() == 3) {
+    const auto& input = properties->GetInputProperties(node->name())[0];
+    const auto& b = properties->GetInputProperties(node->name())[1];
+    const auto& s = properties->GetInputProperties(node->name())[2];
+    if (TensorShape::IsValid(b.shape()) && b.has_value() &&
+        TensorShape::IsValid(s.shape()) && s.has_value()) {
+      Tensor begin(b.dtype(), b.shape());
+      if (!begin.FromProto(b.value())) {
+        return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                       b.value().DebugString());
       }
-      const auto& a = properties->GetInputProperties(node->name())[1];
-      if (TensorShape::IsValid(a.shape()) && a.has_value()) {
-        Tensor axis(a.dtype(), a.shape());
-        if (!axis.FromProto(a.value())) {
-          return errors::InvalidArgument("Cannot parse tensor from proto: ",
-                                         a.value().DebugString());
-        }
-        std::set<int> target_axes;
-        for (int j = 0; j < axis.NumElements(); ++j) {
-          // value of axis can be negative.
-          if (axis.dtype() == DT_INT64) {
-            target_axes.insert((axis.vec<int64>()(j) + shape.dim_size()) %
-                               shape.dim_size());
-          } else {
-            target_axes.insert((axis.vec<int>()(j) + shape.dim_size()) %
-                               shape.dim_size());
-          }
-        }
-
-        // The node is replaceable iff
-        // unknown_rank == false &&
-        // (dim_size == 0 || all dims have size 1 ||
-        //  all dims with > 1 size are not in target_axes)
-        bool replaceable = !shape.unknown_rank();
-        for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
-          replaceable &= shape.dim(j).size() == 1 ||
-                         target_axes.find(j) == target_axes.end();
+      Tensor size(s.dtype(), s.shape());
+      if (!size.FromProto(s.value())) {
+        return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                       s.value().DebugString());
+      }
+      // The node is replaceable iff unknown_rank == false &&
+      // begin == 0 && (size == -1 || size == input_shape) for all dimensions
+      bool replaceable = !input.shape().unknown_rank();
+      for (int j = 0; replaceable && j < input.shape().dim_size(); ++j) {
+        if (begin.dtype() == DT_INT32) {
+          replaceable &= begin.vec<int>()(j) == 0;
+        } else {
+          replaceable &= begin.vec<int64>()(j) == 0;
         }
-        if (replaceable) {
-          ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
-          continue;
+        if (size.dtype() == DT_INT32) {
+          replaceable &= (size.vec<int>()(j) == -1 ||
+                          size.vec<int>()(j) == input.shape().dim(j).size());
+        } else {
+          replaceable &= (size.vec<int64>()(j) == -1 ||
+                          size.vec<int64>()(j) == input.shape().dim(j).size());
         }
       }
+      if (replaceable) {
+        ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
+        return Status::OK();
+      }
     }
+  }
 
-    if (use_shape_info && IsSlice(*node) &&
-        properties->GetInputProperties(node->name()).size() == 3) {
-      const auto& input = properties->GetInputProperties(node->name())[0];
-      const auto& b = properties->GetInputProperties(node->name())[1];
-      const auto& s = properties->GetInputProperties(node->name())[2];
-      if (TensorShape::IsValid(b.shape()) && b.has_value() &&
-          TensorShape::IsValid(s.shape()) && s.has_value()) {
-        Tensor begin(b.dtype(), b.shape());
-        if (!begin.FromProto(b.value())) {
-          return errors::InvalidArgument("Cannot parse tensor from proto: ",
-                                         b.value().DebugString());
-        }
-        Tensor size(s.dtype(), s.shape());
-        if (!size.FromProto(s.value())) {
-          return errors::InvalidArgument("Cannot parse tensor from proto: ",
-                                         s.value().DebugString());
-        }
-        // The node is replaceable iff unknown_rank == false &&
-        // begin == 0 && (size == -1 || size == input_shape) for all dimensions
-        bool replaceable = !input.shape().unknown_rank();
-        for (int j = 0; replaceable && j < input.shape().dim_size(); ++j) {
-          if (begin.dtype() == DT_INT32) {
-            replaceable &= begin.vec<int>()(j) == 0;
-          } else {
-            replaceable &= begin.vec<int64>()(j) == 0;
-          }
-          if (size.dtype() == DT_INT32) {
-            replaceable &= (size.vec<int>()(j) == -1 ||
-                            size.vec<int>()(j) == input.shape().dim(j).size());
-          } else {
-            replaceable &=
-                (size.vec<int64>()(j) == -1 ||
-                 size.vec<int64>()(j) == input.shape().dim(j).size());
-          }
-        }
-        if (replaceable) {
-          ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
-          continue;
-        }
+  if (use_shape_info && IsStridedSlice(*node) &&
+      properties->GetInputProperties(node->name()).size() == 4) {
+    if (node->attr().at("new_axis_mask").i() != 0 ||
+        node->attr().at("shrink_axis_mask").i() != 0) {
+      // Skip nodes with new/shrink axis mask, since they involve dimension
+      // changes.
+      return Status::OK();
+    }
+    const auto& input = properties->GetInputProperties(node->name())[0];
+    for (int j = 0; j < input.shape().dim_size(); ++j) {
+      // Skip if input shape is not fully determined.
+      if (input.shape().dim(j).size() < 0) {
+        return Status::OK();
       }
     }
-
-    if (use_shape_info && IsTile(*node) &&
-        properties->GetInputProperties(node->name()).size() == 2) {
-      const auto& m = properties->GetInputProperties(node->name())[1];
-      if (TensorShape::IsValid(m.shape()) && m.has_value()) {
-        Tensor multiplies(m.dtype(), m.shape());
-        if (!multiplies.FromProto(m.value())) {
-          return errors::InvalidArgument("Cannot parse tensor from proto: ",
-                                         m.value().DebugString());
-        }
-        // The node is replaceable iff all values in multiplies are 1.
-        bool replaceable = true;
-        if (multiplies.dtype() == DT_INT32) {
-          for (int j = 0; replaceable && j < multiplies.vec<int>().size();
-               ++j) {
-            replaceable &= multiplies.vec<int>()(j) == 1;
-          }
-        } else {
-          for (int j = 0; replaceable && j < multiplies.vec<int64>().size();
-               ++j) {
-            replaceable &= multiplies.vec<int64>()(j) == 1;
-          }
+    const auto& b = properties->GetInputProperties(node->name())[1];
+    const auto& e = properties->GetInputProperties(node->name())[2];
+    const auto& s = properties->GetInputProperties(node->name())[3];
+    if (TensorShape::IsValid(b.shape()) && b.has_value() &&
+        TensorShape::IsValid(e.shape()) && e.has_value() &&
+        TensorShape::IsValid(s.shape()) && s.has_value()) {
+      Tensor begin(b.dtype(), b.shape());
+      if (!begin.FromProto(b.value())) {
+        return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                       b.value().DebugString());
+      }
+      Tensor end(e.dtype(), e.shape());
+      if (!end.FromProto(e.value())) {
+        return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                       e.value().DebugString());
+      }
+      Tensor strides(s.dtype(), s.shape());
+      if (!strides.FromProto(s.value())) {
+        return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                       s.value().DebugString());
+      }
+      int begin_mask = node->attr().at("begin_mask").i();
+      int end_mask = node->attr().at("end_mask").i();
+      std::set<int> expanded_ellipsis_indices;
+      int ellipsis_index = -1;
+      for (int j = 0; j < input.shape().dim_size(); ++j) {
+        // find the ellipsis_mask. If not found, insert one in the end if
+        // necessary.
+        if (node->attr().at("ellipsis_mask").i() & 1 << j ||
+            (ellipsis_index == -1 && j >= strides.NumElements())) {
+          ellipsis_index = j;
         }
-        if (replaceable) {
-          ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
-          continue;
+        // insert the indices that are immediately after ellipsis_index if
+        // necessary.
+        if (ellipsis_index != -1 &&
+            input.shape().dim_size() >
+                strides.NumElements() + j - ellipsis_index) {
+          expanded_ellipsis_indices.insert(j);
         }
       }
-    }
 
-    if (use_shape_info && IsPad(*node) &&
-        properties->GetInputProperties(node->name()).size() >= 2) {
-      const auto& p = properties->GetInputProperties(node->name())[1];
-      if (TensorShape::IsValid(p.shape()) && p.has_value()) {
-        Tensor paddings(p.dtype(), p.shape());
-        if (!paddings.FromProto(p.value())) {
-          return errors::InvalidArgument("Cannot parse tensor from proto: ",
-                                         p.value().DebugString());
-        }
-        // The node is replaceable iff all values in paddings are 0.
-        bool replaceable = true;
-        // The operation requires it to be int32 value so we don't check for
-        // 1nt64.
-        const auto flatten = paddings.flat<int32>();
-        for (int j = 0; replaceable && j < flatten.size(); ++j) {
-          replaceable &= flatten(j) == 0;
-        }
-        if (replaceable) {
-          ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
+      // The node is replaceable iff unknown_rank == false &&
+      // ((begin_mask is set || begin == 0) && (end_mask is set || end == dim)
+      //  && strides == 1) for all dimensions.
+      bool replaceable = !input.shape().unknown_rank();
+      for (int j = 0; replaceable && j < input.shape().dim_size(); ++j) {
+        if (expanded_ellipsis_indices.find(j) !=
+            expanded_ellipsis_indices.end()) {
+          // ellipsis_mask is effective on current dimension.
           continue;
         }
+        // when we have ellipsis_mask in between, input.shape().dim_size() will
+        // be greater than strides.NumElements(), since we will insert
+        // as many as expanded_ellipsis_indices.size() axes during computation.
+        // We need to subtract this number from j.
+        int i = j;
+        if (ellipsis_index != -1 &&
+            j >= ellipsis_index + expanded_ellipsis_indices.size()) {
+          i = j - expanded_ellipsis_indices.size();
+        }
+        int b = begin.dtype() == DT_INT32 ? begin.vec<int>()(i)
+                                          : begin.vec<int64>()(i);
+        int e =
+            end.dtype() == DT_INT32 ? end.vec<int>()(i) : end.vec<int64>()(i);
+        int s = strides.dtype() == DT_INT32 ? strides.vec<int>()(i)
+                                            : strides.vec<int64>()(i);
+        replaceable &=
+            (begin_mask & 1 << i || b == 0) &&
+            (end_mask & 1 << i || e == input.shape().dim(j).size()) && s == 1;
+      }
+      if (replaceable) {
+        ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
+        return Status::OK();
       }
     }
+  }
 
-    if (use_shape_info && IsSqueeze(*node) &&
-        !properties->GetInputProperties(node->name()).empty()) {
-      // https://www.tensorflow.org/api_docs/python/tf/squeeze mentions it's
-      // error to squeeze a dimension that is not 1, so we only need to check
-      // whether the input has > 1 size for each dimension.
-      const auto& shape =
-          properties->GetInputProperties(node->name())[0].shape();
-      // The node is replaceable iff
-      // unknown_rank == false && (dim_size == 0 || all dims have size > 1)
-      bool replaceable = !shape.unknown_rank();
-      for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
-        replaceable &= shape.dim(j).size() > 1;
+  if (use_shape_info && IsTile(*node) &&
+      properties->GetInputProperties(node->name()).size() == 2) {
+    const auto& m = properties->GetInputProperties(node->name())[1];
+    if (TensorShape::IsValid(m.shape()) && m.has_value()) {
+      Tensor multiplies(m.dtype(), m.shape());
+      if (!multiplies.FromProto(m.value())) {
+        return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                       m.value().DebugString());
+      }
+      // The node is replaceable iff all values in multiplies are 1.
+      bool replaceable = true;
+      if (multiplies.dtype() == DT_INT32) {
+        for (int j = 0; replaceable && j < multiplies.vec<int>().size(); ++j) {
+          replaceable &= multiplies.vec<int>()(j) == 1;
+        }
+      } else {
+        for (int j = 0; replaceable && j < multiplies.vec<int64>().size();
+             ++j) {
+          replaceable &= multiplies.vec<int64>()(j) == 1;
+        }
       }
       if (replaceable) {
         ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
-        continue;
+        return Status::OK();
       }
     }
+  }
 
-    if (IsPack(*node) && NumNonControlInputs(*node) == 1 &&
-        !OptimizedNodeExists(*node, "_const_axis")) {
-      // Create constant axis node.
-      Tensor axis_t(DT_INT32, TensorShape({}));
-      NodeDef* axis_node = optimized_graph->add_node();
-      axis_node->set_name(OptimizedNodeName(*node, "_const_axis"));
-      const int axis = node->attr().at("axis").i();
-      if (!SetTensorValue(DT_INT32, axis, &axis_t).ok() ||
-          !CreateNodeDef(axis_node->name(), TensorValue(&axis_t), axis_node)
-               .ok()) {
-        continue;
-      }
-      // Add a control dependency to make sure axis_node is in the right frame.
-      const string ctrl_dep = ConstantFolding::AddControlDependency(
-          node->input(0), graph_, node_map_.get());
-      axis_node->add_input(ctrl_dep);
-      axis_node->set_device(node->device());
-      node->set_op("ExpandDims");
-      if (node->attr().count("axis") != 0) {
-        node->mutable_attr()->erase("axis");
+  if (use_shape_info && IsPad(*node) &&
+      properties->GetInputProperties(node->name()).size() >= 2) {
+    const auto& p = properties->GetInputProperties(node->name())[1];
+    if (TensorShape::IsValid(p.shape()) && p.has_value()) {
+      Tensor paddings(p.dtype(), p.shape());
+      if (!paddings.FromProto(p.value())) {
+        return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                       p.value().DebugString());
       }
-      if (node->attr().count("N") != 0) {
-        node->mutable_attr()->erase("N");
+      // The node is replaceable iff all values in paddings are 0.
+      bool replaceable = true;
+      // The operation requires it to be int32 value so we don't check for
+      // 1nt64.
+      const auto flatten = paddings.flat<int32>();
+      for (int j = 0; replaceable && j < flatten.size(); ++j) {
+        replaceable &= flatten(j) == 0;
       }
-      (*node->mutable_attr())["Tdim"].set_type(DT_INT32);
-      node->add_input(axis_node->name());
-      if (node->input_size() > 2) {
-        node->mutable_input()->SwapElements(1, node->input_size() - 1);
+      if (replaceable) {
+        ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
+        return Status::OK();
       }
-      graph_modified_ = true;
-      continue;
     }
+  }
 
-    // Move constants past Enter.
-    if (IsEnter(*node) && node->input_size() > 0) {
-      if (node->attr().count("is_constant") == 0 ||
-          !node->attr().at("is_constant").b()) {
-        continue;
-      }
-      const string& node_name = node->name();
-      const NodeDef* input = node_map_->GetNode(node->input(0));
-      if (input != nullptr && IsReallyConstant(*input) &&
-          !OptimizedNodeExists(*input, "_enter")) {
-        auto fanouts = node_map_->GetOutputs(node_name);
-        // Find non-constant nodes that consume the output of *node.
-        std::vector<NodeDef*> consumers;
-        for (NodeDef* fanout : fanouts) {
-          if (!IsConstant(*fanout)) {
-            for (int i = 0; i < fanout->input_size(); ++i) {
-              if (fanout->input(i) == node_name) {
-                consumers.push_back(fanout);
-                break;
-              }
-            }
-          }
-        }
-        if (!consumers.empty()) {
-          NodeDef* new_node = optimized_graph->add_node();
-          *new_node = *input;
-          new_node->set_name(OptimizedNodeName(*input, "_enter"));
-          new_node->set_device(node->device());
-          new_node->clear_input();
-          new_node->add_input(AsControlDependency(node_name));
-          node_map_->AddNode(new_node->name(), new_node);
-          node_map_->AddOutput(node_name, new_node->name());
-          for (NodeDef* consumer : consumers) {
-            for (int i = 0; i < consumer->input_size(); ++i) {
-              if (NodeName(consumer->input(i)) == node_name) {
-                node_map_->UpdateInput(consumer->name(), node_name,
-                                       new_node->name());
-                consumer->set_input(i, new_node->name());
-              }
-            }
-          }
-          graph_modified_ = true;
-          continue;
-        }
-      }
+  if (use_shape_info && IsSqueeze(*node) &&
+      !properties->GetInputProperties(node->name()).empty()) {
+    // https://www.tensorflow.org/api_docs/python/tf/squeeze mentions it's
+    // error to squeeze a dimension that is not 1, so we only need to check
+    // whether the input has > 1 size for each dimension.
+    const auto& shape = properties->GetInputProperties(node->name())[0].shape();
+    // The node is replaceable iff
+    // unknown_rank == false && (dim_size == 0 || all dims have size > 1)
+    bool replaceable = !shape.unknown_rank();
+    for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
+      replaceable &= shape.dim(j).size() > 1;
     }
+    if (replaceable) {
+      ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
+      return Status::OK();
+    }
+  }
+
+  if (IsPack(*node) && NumNonControlInputs(*node) == 1 &&
+      !OptimizedNodeExists(*node, "_const_axis")) {
+    // Create constant axis node.
+    Tensor axis_t(DT_INT32, TensorShape({}));
+    NodeDef* axis_node = optimized_graph->add_node();
+    axis_node->set_name(OptimizedNodeName(*node, "_const_axis"));
+    const int axis = node->attr().at("axis").i();
+    if (!SetTensorValue(DT_INT32, axis, &axis_t).ok() ||
+        !CreateNodeDef(axis_node->name(), TensorValue(&axis_t), axis_node)
+             .ok()) {
+      return Status::OK();
+    }
+    // Add a control dependency to make sure axis_node is in the right frame.
+    const string ctrl_dep = ConstantFolding::AddControlDependency(
+        node->input(0), graph_, node_map_.get());
+    axis_node->add_input(ctrl_dep);
+    axis_node->set_device(node->device());
+    node->set_op("ExpandDims");
+    if (node->attr().count("axis") != 0) {
+      node->mutable_attr()->erase("axis");
+    }
+    if (node->attr().count("N") != 0) {
+      node->mutable_attr()->erase("N");
+    }
+    (*node->mutable_attr())["Tdim"].set_type(DT_INT32);
+    node->add_input(axis_node->name());
+    if (node->input_size() > 2) {
+      node->mutable_input()->SwapElements(1, node->input_size() - 1);
+    }
+    graph_modified_ = true;
+    return Status::OK();
+  }
 
-    // Switch(x, x) will always feed false to its false branch and true to
-    // its true branch. By rewriting the graph a bit, we can propagate these
-    // constants down the two output branches, and just use control dependencies
-    // to trigger the selected one at runtime. For example,
-    //
-    //     +------+
-    // x-->|Switch|-->a  (in practice there may be multiple consumers of each
-    // x-->|      |-->b   output branch.)
-    //     +------+
-    //
-    // Is rewritten as
-    //
-    //     +------+
-    // x-->|Switch|-->Identity--^>Const(false)-->a
-    // x-->|      |-->Identity--^>Const(true)-->b
-    //     +------+
-    if (node->op() == "Switch" && node->input(0) == node->input(1) &&
-        !OptimizedNodeExists(*node, "_const_false") &&
-        !OptimizedNodeExists(*node, "_const_true")) {
-      bool already_optimized = true;
-      // If the optimization was already applied, the switch would have exactly
-      // one Identity node consuming each of its outputs, each without any
-      // non-control outputs.
-      auto fanouts = node_map_->GetOutputs(node->name());
-      if (fanouts.size() == 2) {
-        for (NodeDef* fanout : fanouts) {
-          if (!IsIdentity(*fanout) ||
-              NumNonControlOutputs(*fanout, *node_map_) > 0) {
-            already_optimized = false;
-            break;
+  // Move constants past Enter.
+  if (IsEnter(*node) && node->input_size() > 0) {
+    if (node->attr().count("is_constant") == 0 ||
+        !node->attr().at("is_constant").b()) {
+      return Status::OK();
+    }
+    const string& node_name = node->name();
+    const NodeDef* input = node_map_->GetNode(node->input(0));
+    if (input != nullptr && IsReallyConstant(*input) &&
+        !OptimizedNodeExists(*input, "_enter")) {
+      auto fanouts = node_map_->GetOutputs(node_name);
+      // Find non-constant nodes that consume the output of *node.
+      std::vector<NodeDef*> consumers;
+      for (NodeDef* fanout : fanouts) {
+        if (!IsConstant(*fanout)) {
+          for (int i = 0; i < fanout->input_size(); ++i) {
+            if (fanout->input(i) == node_name) {
+              consumers.push_back(fanout);
+              break;
+            }
           }
         }
       }
-      Tensor false_t(DT_BOOL, TensorShape({}));
-      Tensor true_t(DT_BOOL, TensorShape({}));
-      // Make sure we don't proceed if this switch node was already optimized.
-      if (!already_optimized && SetTensorValue(DT_BOOL, true, &true_t).ok() &&
-          SetTensorValue(DT_BOOL, false, &false_t).ok()) {
-        // Copy the set of consumers of the switch as they will be manipulated
-        // below.
-        const std::set<NodeDef*>& consumer_set =
-            node_map_->GetOutputs(node->name());
-        std::vector<NodeDef*> consumers(consumer_set.begin(),
-                                        consumer_set.end());
-        std::sort(consumers.begin(), consumers.end(),
-                  [](const NodeDef* n1, const NodeDef* n2) {
-                    return n1->name() < n2->name();
-                  });
-        // Create constant false & true nodes.
-        NodeDef* false_node = optimized_graph->add_node();
-        false_node->set_name(OptimizedNodeName(*node, "_const_false"));
-        if (!CreateNodeDef(false_node->name(), TensorValue(&false_t),
-                           false_node)
-                 .ok()) {
-          continue;
-        }
-        false_node->set_device(node->device());
-
-        NodeDef* true_node = optimized_graph->add_node();
-        true_node->set_name(OptimizedNodeName(*node, "_const_true"));
-        if (!CreateNodeDef(true_node->name(), TensorValue(&true_t), true_node)
-                 .ok()) {
-          continue;
-        }
-        true_node->set_device(node->device());
-
-        // Add controls from the switch ports to the constants, and connect the
-        // constants to the original switch outputs.
-        const string false_port = node->name();
-        const string true_port = strings::StrCat(node->name(), ":1");
-        const string false_ctrl_dep =
-            AddControlDependency(false_port, optimized_graph, node_map_.get());
-        false_node->add_input(false_ctrl_dep);
-        const string true_ctrl_dep =
-            AddControlDependency(true_port, optimized_graph, node_map_.get());
-        true_node->add_input(true_ctrl_dep);
-
-        node_map_->AddNode(false_node->name(), false_node);
-        node_map_->AddNode(true_node->name(), true_node);
-        node_map_->AddOutput(NodeName(false_ctrl_dep), false_node->name());
-        node_map_->AddOutput(NodeName(true_ctrl_dep), true_node->name());
-
+      if (!consumers.empty()) {
+        NodeDef* new_node = optimized_graph->add_node();
+        *new_node = *input;
+        new_node->set_name(OptimizedNodeName(*input, "_enter"));
+        new_node->set_device(node->device());
+        new_node->clear_input();
+        new_node->add_input(AsControlDependency(node_name));
+        node_map_->AddNode(new_node->name(), new_node);
+        node_map_->AddOutput(node_name, new_node->name());
         for (NodeDef* consumer : consumers) {
           for (int i = 0; i < consumer->input_size(); ++i) {
-            const string& input = consumer->input(i);
-            if (input == false_port) {
-              consumer->set_input(i, false_node->name());
-              node_map_->UpdateInput(consumer->name(), false_port,
-                                     false_node->name());
-            } else if (input == true_port) {
-              consumer->set_input(i, true_node->name());
-              node_map_->UpdateInput(consumer->name(), true_port,
-                                     true_node->name());
+            if (NodeName(consumer->input(i)) == node_name) {
+              node_map_->UpdateInput(consumer->name(), node_name,
+                                     new_node->name());
+              consumer->set_input(i, new_node->name());
             }
           }
         }
         graph_modified_ = true;
-        continue;
+        return Status::OK();
       }
     }
-    if (IsSimplifiableReduction(*node)) {
-      // Replace the reduction node with an identity node, that can be further
-      // optimized by the model pruner.
-      DataType output_type;
-      if (node->attr().count("T") > 0) {
-        output_type = node->attr().at("T").type();
-      } else {
-        // This is an 'any' or 'all' reduction. The output is always boolean.
-        output_type = DT_BOOL;
+  }
+
+  // Switch(x, x) will always feed false to its false branch and true to
+  // its true branch. By rewriting the graph a bit, we can propagate these
+  // constants down the two output branches, and just use control dependencies
+  // to trigger the selected one at runtime. For example,
+  //
+  //     +------+
+  // x-->|Switch|-->a  (in practice there may be multiple consumers of each
+  // x-->|      |-->b   output branch.)
+  //     +------+
+  //
+  // Is rewritten as
+  //
+  //     +------+
+  // x-->|Switch|-->Identity--^>Const(false)-->a
+  // x-->|      |-->Identity--^>Const(true)-->b
+  //     +------+
+  if (node->op() == "Switch" && node->input(0) == node->input(1) &&
+      !OptimizedNodeExists(*node, "_const_false") &&
+      !OptimizedNodeExists(*node, "_const_true")) {
+    bool already_optimized = true;
+    // If the optimization was already applied, the switch would have exactly
+    // one Identity node consuming each of its outputs, each without any
+    // non-control outputs.
+    auto fanouts = node_map_->GetOutputs(node->name());
+    if (fanouts.size() == 2) {
+      for (NodeDef* fanout : fanouts) {
+        if (!IsIdentity(*fanout) ||
+            NumNonControlOutputs(*fanout, *node_map_) > 0) {
+          already_optimized = false;
+          break;
+        }
       }
-      node->set_op("Identity");
-      node->clear_attr();
-      (*node->mutable_attr())["T"].set_type(output_type);
-      *node->mutable_input(1) = AsControlDependency(node->input(1));
-      graph_modified_ = true;
-      continue;
     }
-    if (use_shape_info && IsSimplifiableReshape(*node, *properties)) {
-      DataType output_type = node->attr().at("T").type();
-      node->set_op("Identity");
-      node->clear_attr();
-      (*node->mutable_attr())["T"].set_type(output_type);
-      *node->mutable_input(1) = AsControlDependency(node->input(1));
-      graph_modified_ = true;
-      continue;
-    }
-
-    const bool is_mul = IsMul(*node) || IsLogicalAnd(*node);
-    const bool is_matmul = IsMatMul(*node);
-    const bool is_add = IsAdd(*node) || IsBiasAdd(*node) || IsLogicalOr(*node);
-    const bool is_sub = IsSub(*node);
-    const bool is_any_div = IsAnyDiv(*node);
-    // Simplify arithmetic operations with ones or zeros.
-    if (use_shape_info &&
-        (is_mul || is_matmul || is_add || is_sub || is_any_div) &&
-        properties->HasInputProperties(node->name()) &&
-        properties->HasOutputProperties(node->name())) {
-      const NodeDef* x = node_map_->GetNode(node->input(0));
-      const NodeDef* y = node_map_->GetNode(node->input(1));
-      if (x == nullptr || y == nullptr) {
-        return errors::InvalidArgument("Invalid inputs to node: ",
-                                       node->DebugString());
-      }
-      const TensorShapeProto& output_shape =
-          properties->GetOutputProperties(node->name())[0].shape();
-
-      // Simplify element-wise multiplication by ones or addition/subtraction
-      // of zeros.
-      const TensorShapeProto& y_shape =
-          properties->GetInputProperties(node->name())[1].shape();
-      const bool x_is_zero = IsZeros(*x);
-      const bool x_is_one = x_is_zero ? false : IsOnes(*x);
-      const bool y_matches_output_shape = ShapesEqual(output_shape, y_shape);
-      if (y_matches_output_shape &&
-          ((is_mul && x_is_one) || (is_add && x_is_zero))) {
-        // 1 * y = y or 0 + y = y.
-        ReplaceOperationWithSnapshot(1, *properties, node, optimized_graph);
-        continue;
+    Tensor false_t(DT_BOOL, TensorShape({}));
+    Tensor true_t(DT_BOOL, TensorShape({}));
+    // Make sure we don't proceed if this switch node was already optimized.
+    if (!already_optimized && SetTensorValue(DT_BOOL, true, &true_t).ok() &&
+        SetTensorValue(DT_BOOL, false, &false_t).ok()) {
+      // Copy the set of consumers of the switch as they will be manipulated
+      // below.
+      const std::set<NodeDef*>& consumer_set =
+          node_map_->GetOutputs(node->name());
+      std::vector<NodeDef*> consumers(consumer_set.begin(), consumer_set.end());
+      std::sort(consumers.begin(), consumers.end(),
+                [](const NodeDef* n1, const NodeDef* n2) {
+                  return n1->name() < n2->name();
+                });
+      // Create constant false & true nodes.
+      NodeDef* false_node = optimized_graph->add_node();
+      false_node->set_name(OptimizedNodeName(*node, "_const_false"));
+      if (!CreateNodeDef(false_node->name(), TensorValue(&false_t), false_node)
+               .ok()) {
+        return Status::OK();
       }
+      false_node->set_device(node->device());
 
-      if (y_matches_output_shape && (is_sub && x_is_zero)) {
-        // Replace 0 - y with Neg(y).
-        ReplaceSubtractionFromZeroByNegation(node, optimized_graph);
-        continue;
+      NodeDef* true_node = optimized_graph->add_node();
+      true_node->set_name(OptimizedNodeName(*node, "_const_true"));
+      if (!CreateNodeDef(true_node->name(), TensorValue(&true_t), true_node)
+               .ok()) {
+        return Status::OK();
       }
-
-      // Replace 1 / y with Reciprocal op.
-      if (y_matches_output_shape && is_any_div && x_is_one) {
-        DataType type = node->attr().at("T").type();
-        if (DataTypeIsFloating(type) || DataTypeIsComplex(type)) {
-          ReplaceDivisionOfOnesByReciprocal(node, optimized_graph);
-          continue;
+      true_node->set_device(node->device());
+
+      // Add controls from the switch ports to the constants, and connect the
+      // constants to the original switch outputs.
+      const string false_port = node->name();
+      const string true_port = strings::StrCat(node->name(), ":1");
+      const string false_ctrl_dep =
+          AddControlDependency(false_port, optimized_graph, node_map_.get());
+      false_node->add_input(false_ctrl_dep);
+      const string true_ctrl_dep =
+          AddControlDependency(true_port, optimized_graph, node_map_.get());
+      true_node->add_input(true_ctrl_dep);
+
+      node_map_->AddNode(false_node->name(), false_node);
+      node_map_->AddNode(true_node->name(), true_node);
+      node_map_->AddOutput(NodeName(false_ctrl_dep), false_node->name());
+      node_map_->AddOutput(NodeName(true_ctrl_dep), true_node->name());
+
+      for (NodeDef* consumer : consumers) {
+        for (int i = 0; i < consumer->input_size(); ++i) {
+          const string& input = consumer->input(i);
+          if (input == false_port) {
+            consumer->set_input(i, false_node->name());
+            node_map_->UpdateInput(consumer->name(), false_port,
+                                   false_node->name());
+          } else if (input == true_port) {
+            consumer->set_input(i, true_node->name());
+            node_map_->UpdateInput(consumer->name(), true_port,
+                                   true_node->name());
+          }
         }
       }
+      graph_modified_ = true;
+      return Status::OK();
+    }
+  }
+  if (IsSimplifiableReduction(*node, *properties)) {
+    // Replace the reduction node with an identity node, that can be further
+    // optimized by the model pruner.
+    DataType output_type;
+    if (node->attr().count("T") > 0) {
+      output_type = node->attr().at("T").type();
+    } else {
+      // This is an 'any' or 'all' reduction. The output is always boolean.
+      output_type = DT_BOOL;
+    }
+    node->set_op("Identity");
+    node->clear_attr();
+    (*node->mutable_attr())["T"].set_type(output_type);
+    *node->mutable_input(1) = AsControlDependency(node->input(1));
+    graph_modified_ = true;
+    return Status::OK();
+  }
+  if (use_shape_info && IsSimplifiableReshape(*node, *properties)) {
+    DataType output_type = node->attr().at("T").type();
+    node->set_op("Identity");
+    node->clear_attr();
+    (*node->mutable_attr())["T"].set_type(output_type);
+    *node->mutable_input(1) = AsControlDependency(node->input(1));
+    graph_modified_ = true;
+    return Status::OK();
+  }
 
-      const TensorShapeProto& x_shape =
-          properties->GetInputProperties(node->name())[0].shape();
-      const bool y_is_zero = IsZeros(*y);
-      const bool y_is_one = y_is_zero ? false : IsOnes(*y);
-      const bool x_matches_output_shape = ShapesEqual(output_shape, x_shape);
-      if (x_matches_output_shape && (((is_mul || is_any_div) && y_is_one) ||
-                                     ((is_add || is_sub) && y_is_zero))) {
-        // x * 1 = x or x / 1 = x or x +/- 0 = x
-        ReplaceOperationWithSnapshot(0, *properties, node, optimized_graph);
-        continue;
-      }
+  bool arithmetic_simplification_succeed = false;
+  Status simplify_arithmetic_status = SimplifyArithmeticOperations(
+      optimized_graph, properties, node, use_shape_info,
+      &arithmetic_simplification_succeed);
+  if (!simplify_arithmetic_status.ok()) {
+    return simplify_arithmetic_status;
+  } else if (arithmetic_simplification_succeed) {
+    graph_modified_ = true;
+    return Status::OK();
+  }
 
-      // x OR true = true OR y = true.
-      const PartialTensorShape shp(output_shape);
-      if (shp.IsFullyDefined() && IsLogicalOr(*node) &&
-          (y_is_one || x_is_one)) {
-        TF_RETURN_IF_ERROR(ReplaceOperationWithConstant(
-            1, *properties, output_shape, node, optimized_graph));
-      }
-
-      // Simplify multiplication and matmul by zeros.
-      // Also optimize zeros divided by a tensor, but only if we are in
-      // aggressive mode, since we might get rid of divisions by zero.
-      bool optimize_zeros_divided_by_y =
-          is_any_div && x_is_zero && is_aggressive;
-      if ((x_is_zero || y_is_zero) &&
-          (is_mul || is_matmul || optimize_zeros_divided_by_y)) {
-        if (shp.IsFullyDefined()) {
-          TF_RETURN_IF_ERROR(ReplaceOperationWithConstant(
-              0, *properties, output_shape, node, optimized_graph));
-          continue;
-        }
-        // Even if an input shape is only partially known, we may known that it
-        // matches the output shape and thus forward the corresponding zero
-        // input.
-        if ((is_mul || is_any_div) && x_is_zero && x_matches_output_shape) {
-          ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
-          continue;
-        } else if (is_mul && y_is_zero && y_matches_output_shape) {
-          ReplaceOperationWithIdentity(1, *properties, node, optimized_graph);
-          continue;
-        }
-      }
+  if (ReduceDivToReciprocalMul(optimized_graph, node)) {
+    graph_modified_ = true;
+    return Status::OK();
+  }
+
+  if (ConstantPushDown(node)) {
+    graph_modified_ = true;
+    return Status::OK();
+  }
+
+  if (MulConvPushDown(node, *properties)) {
+    graph_modified_ = true;
+    return Status::OK();
+  }
+
+  if (PartialConstPropThroughIdentityN(node)) {
+    graph_modified_ = true;
+    return Status::OK();
+  }
+
+  if (PartialAssocOpConstFolding(optimized_graph, properties, node)) {
+    graph_modified_ = true;
+    return Status::OK();
+  }
+
+  if (PartialConcatConstFolding(optimized_graph, properties, node)) {
+    graph_modified_ = true;
+    return Status::OK();
+  }
+
+  return Status::OK();
+}
+
+Status ConstantFolding::SimplifyArithmeticOperations(
+    GraphDef* optimized_graph, GraphProperties* properties, NodeDef* node,
+    bool use_shape_info, bool* success) {
+  const bool is_mul = IsMul(*node) || IsLogicalAnd(*node);
+  const bool is_matmul = IsMatMul(*node);
+  const bool is_add = IsAdd(*node) || IsBiasAdd(*node) || IsLogicalOr(*node);
+  const bool is_sub = IsSub(*node);
+  const bool is_any_div = IsAnyDiv(*node);
+  // Simplify arithmetic operations with ones or zeros.
+  if (use_shape_info &&
+      (is_mul || is_matmul || is_add || is_sub || is_any_div) &&
+      properties->HasInputProperties(node->name()) &&
+      properties->HasOutputProperties(node->name())) {
+    const NodeDef* x = node_map_->GetNode(node->input(0));
+    const NodeDef* y = node_map_->GetNode(node->input(1));
+    if (x == nullptr || y == nullptr) {
+      return errors::InvalidArgument("Invalid inputs to node: ",
+                                     node->DebugString());
+    }
+    const TensorShapeProto& output_shape =
+        properties->GetOutputProperties(node->name())[0].shape();
+
+    // Simplify element-wise multiplication by ones or addition/subtraction
+    // of zeros.
+    const TensorShapeProto& y_shape =
+        properties->GetInputProperties(node->name())[1].shape();
+    const bool x_is_zero = IsZeros(*x);
+    const bool x_is_one = x_is_zero ? false : IsOnes(*x);
+    const bool y_matches_output_shape = ShapesEqual(output_shape, y_shape);
+    if (y_matches_output_shape &&
+        ((is_mul && x_is_one) || (is_add && x_is_zero))) {
+      // 1 * y = y or 0 + y = y.
+      ReplaceOperationWithSnapshot(1, *properties, node, optimized_graph);
+      *success = true;
+      return Status::OK();
     }
 
-    // Strength reduce floating point division by a constant Div(x, const) to
-    // multiplication by the reciprocal Mul(x, Reciprocal(const)). This in turn
-    // will be constant folded to Mul(x, 1.0/const).
-    if (node->input_size() >= 2 && (IsRealDiv(*node) || IsDiv(*node))) {
-      const string& const_input = node->input(1);
-      const NodeDef* denom = node_map_->GetNode(const_input);
-      CHECK(denom != nullptr);
-      if (!IsReallyConstant(*denom)) {
-        continue;
-      }
-      if (node->attr().count("T") == 0) {
-        continue;
-      }
+    if (y_matches_output_shape && (is_sub && x_is_zero)) {
+      // Replace 0 - y with Neg(y).
+      ReplaceSubtractionFromZeroByNegation(node, optimized_graph);
+      *success = true;
+      return Status::OK();
+    }
+
+    // Replace 1 / y with Reciprocal op.
+    if (y_matches_output_shape && is_any_div && x_is_one) {
       DataType type = node->attr().at("T").type();
-      if (IsDiv(*node) &&
-          !(DataTypeIsFloating(type) || DataTypeIsComplex(type))) {
-        continue;
+      if (DataTypeIsFloating(type) || DataTypeIsComplex(type)) {
+        ReplaceDivisionOfOnesByReciprocal(node, optimized_graph);
+        *success = true;
+        return Status::OK();
       }
-      // Insert new reciprocal op and change node from Div to Mul.
-      NodeDef* reciprocal_node = optimized_graph->add_node();
-      reciprocal_node->set_name(OptimizedNodeName(*node, "_recip"));
-      reciprocal_node->set_op("Reciprocal");
-      reciprocal_node->set_device(node->device());
-      node->set_op("Mul");
-      // Re-wire inputs and outputs.
-      reciprocal_node->add_input(const_input);
-      (*reciprocal_node->mutable_attr())["T"].set_type(type);
-      node->set_input(1, reciprocal_node->name());
-      node_map_->AddNode(reciprocal_node->name(), reciprocal_node);
-      node_map_->UpdateOutput(node->name(), const_input,
-                              reciprocal_node->name());
-      graph_modified_ = true;
-      continue;
     }
 
-    // Consider the transformation
-    //
-    //                      +                +       = parent
-    //                     / \              / \
-    //                    C   +    -- >    X   +     = children
-    //                       / \              / \
-    //                      X   Y            C   Y   = leaves
-    //
-    // where C is constant and X is non-constant, and '+' denotes an
-    // associative and commutative operator like addition or multiplication.
-    // This optimization pushes constants down in the tree to canonicalize it.
-    // Moreoever, in cases where the child node has a second constant input Y
-    // we will create a leaf node that can be folded, e.g.
-    //
-    //    Add(C1, Add(C2, X)) -> Add(X, Add(C1, C2)) -> Add(X, C1 + C2)
-    //
-    // TODO(rmlarsen): Handle non-associative/non-commutative operators like
-    // subtraction and division, as well as mixed subtraction/addition,
-    // division/multiplication.
-    // Don't touch BiasAdd since they can't handle vectors as their first
-    // inputs.
-    if (has_fetch_ && (IsAdd(*node) || is_mul) &&
-        NumNonControlInputs(*node) == 2) {
-      NodeDef* left_child = node_map_->GetNode(node->input(0));
-      NodeDef* right_child = node_map_->GetNode(node->input(1));
-      // One child must be constant, and the other the same op as the parent.
-      if (node->op() != left_child->op() && node->op() != right_child->op()) {
-        continue;
-      }
-      const bool left_child_is_constant = IsReallyConstant(*left_child);
-      const bool right_child_is_constant = IsReallyConstant(*right_child);
-      if (!left_child_is_constant && !right_child_is_constant) {
-        continue;
-      }
-      if (node->device() != left_child->device() ||
-          node->device() != right_child->device()) {
-        continue;
+    const TensorShapeProto& x_shape =
+        properties->GetInputProperties(node->name())[0].shape();
+    const bool y_is_zero = IsZeros(*y);
+    const bool y_is_one = y_is_zero ? false : IsOnes(*y);
+    const bool x_matches_output_shape = ShapesEqual(output_shape, x_shape);
+    if (x_matches_output_shape && (((is_mul || is_any_div) && y_is_one) ||
+                                   ((is_add || is_sub) && y_is_zero))) {
+      // x * 1 = x or x / 1 = x or x +/- 0 = x
+      ReplaceOperationWithSnapshot(0, *properties, node, optimized_graph);
+      *success = true;
+      return Status::OK();
+    }
+
+    // x OR true = true OR y = true.
+    bool updated_graph = false;
+    const PartialTensorShape shp(output_shape);
+    if (shp.IsFullyDefined() && IsLogicalOr(*node) && (y_is_one || x_is_one)) {
+      bool replace_succeed = false;
+      Status replace_op_status =
+          ReplaceOperationWithConstant(1, *properties, output_shape, node,
+                                       optimized_graph, &replace_succeed);
+      if (!replace_op_status.ok()) {
+        return replace_op_status;
+      } else if (replace_succeed) {
+        updated_graph = true;
+      }
+    }
+
+    // Simplify multiplication and matmul by zeros.
+    // Also optimize zeros divided by a tensor, but only if we are in
+    // aggressive mode, since we might get rid of divisions by zero.
+    const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
+    bool optimize_zeros_divided_by_y = is_any_div && x_is_zero && is_aggressive;
+    if ((x_is_zero || y_is_zero) &&
+        (is_mul || is_matmul || optimize_zeros_divided_by_y)) {
+      if (shp.IsFullyDefined()) {
+        bool replace_succeed = false;
+        Status replace_op_status =
+            ReplaceOperationWithConstant(0, *properties, output_shape, node,
+                                         optimized_graph, &replace_succeed);
+        if (!replace_op_status.ok()) {
+          return replace_op_status;
+        } else if (replace_succeed) {
+          *success = true;
+          return Status::OK();
+        }
       }
-      NodeDef* op_child_node =
-          left_child_is_constant ? right_child : left_child;
-      NodeDef* const_child_node =
-          left_child_is_constant ? left_child : right_child;
-      // Make sure that it is safe to change the value of the child node->
-      if (op_child_node->input_size() < 2 ||
-          nodes_to_preserve_.find(op_child_node->name()) !=
-              nodes_to_preserve_.end() ||
-          NumNonControlOutputs(*op_child_node, *node_map_) > 1) {
-        continue;
+      // Even if an input shape is only partially known, we may known that it
+      // matches the output shape and thus forward the corresponding zero
+      // input.
+      if ((is_mul || is_any_div) && x_is_zero && x_matches_output_shape) {
+        ReplaceOperationWithIdentity(0, *properties, node, optimized_graph);
+        *success = true;
+        return Status::OK();
+      } else if (is_mul && y_is_zero && y_matches_output_shape) {
+        ReplaceOperationWithIdentity(1, *properties, node, optimized_graph);
+        *success = true;
+        return Status::OK();
       }
+    }
+    if (updated_graph) {
+      *success = true;
+      return Status::OK();
+    }
+  }
+  *success = false;
+  return Status::OK();
+}
 
-      // Identify the nodes to swap.
-      NodeDef* left_leaf = node_map_->GetNode(op_child_node->input(0));
-      NodeDef* right_leaf = node_map_->GetNode(op_child_node->input(1));
-      const bool left_leaf_is_constant = IsReallyConstant(*left_leaf);
-      const bool right_leaf_is_constant = IsReallyConstant(*right_leaf);
-      if (left_leaf_is_constant && right_leaf_is_constant) {
-        // Child is already foldable, leave it alone.
-        continue;
-      }
-      const int non_const_leaf_input = left_leaf_is_constant ? 1 : 0;
-      const int parent_const_input = left_child_is_constant ? 0 : 1;
-      const auto& child_output = node_map_->GetOutputs(op_child_node->name());
-      if (child_output.find(const_child_node) != child_output.end()) {
-        // If there is a control edge from the child op to C, the transformation
-        // would create a cycle in the graph. We know that it must be a control
-        // edge. We can replace such a control edge with a control edge from A
-        // to C.
-        CHECK(MaybeRemoveControlInput(op_child_node->name(), const_child_node,
-                                      graph_, node_map_.get()));
-        NodeDef* other_leaf = left_leaf_is_constant ? left_leaf : right_leaf;
-        MaybeAddControlInput(other_leaf->name(), const_child_node, graph_,
-                             node_map_.get());
-      }
-
-      // Swap the constant child with a non-constant leaf node.
-      node_map_->UpdateInput(node->name(), node->input(parent_const_input),
-                             op_child_node->input(non_const_leaf_input));
-      node_map_->UpdateInput(op_child_node->name(),
-                             op_child_node->input(non_const_leaf_input),
-                             node->input(parent_const_input));
-      std::swap(*node->mutable_input(parent_const_input),
-                *op_child_node->mutable_input(non_const_leaf_input));
-      graph_modified_ = true;
-      continue;
+bool ConstantFolding::ReduceDivToReciprocalMul(GraphDef* optimized_graph,
+                                               NodeDef* node) {
+  // Strength reduce floating point division by a constant Div(x, const) to
+  // multiplication by the reciprocal Mul(x, Reciprocal(const)). This in turn
+  // will be constant folded to Mul(x, 1.0/const).
+  if (node->input_size() >= 2 && (IsRealDiv(*node) || IsDiv(*node))) {
+    const string& const_input = node->input(1);
+    const NodeDef* denom = node_map_->GetNode(const_input);
+    CHECK(denom != nullptr);
+    if (!IsReallyConstant(*denom)) {
+      return false;
+    }
+    if (node->attr().count("T") == 0) {
+      return false;
     }
+    DataType type = node->attr().at("T").type();
+    if (IsDiv(*node) &&
+        !(DataTypeIsFloating(type) || DataTypeIsComplex(type))) {
+      return false;
+    }
+    // Insert new reciprocal op and change node from Div to Mul.
+    NodeDef* reciprocal_node = optimized_graph->add_node();
+    reciprocal_node->set_name(OptimizedNodeName(*node, "_recip"));
+    reciprocal_node->set_op("Reciprocal");
+    reciprocal_node->set_device(node->device());
+    node->set_op("Mul");
+    // Re-wire inputs and outputs.
+    reciprocal_node->add_input(const_input);
+    (*reciprocal_node->mutable_attr())["T"].set_type(type);
+    node->set_input(1, reciprocal_node->name());
+    node_map_->AddNode(reciprocal_node->name(), reciprocal_node);
+    node_map_->UpdateOutput(node->name(), const_input, reciprocal_node->name());
+    return true;
+  }
 
-    // Partial constant propagation through IdentityN.
-    if (IsIdentityN(*node) && NumNonControlInputs(*node) > 0) {
-      const std::set<NodeDef*>& tmp = node_map_->GetOutputs(node->name());
-      const std::vector<NodeDef*> consumers(tmp.begin(), tmp.end());
-      bool updated_graph = false;
-      for (int input_idx = 0; input_idx < node->input_size(); ++input_idx) {
-        const string& input = node->input(input_idx);
-        if (IsControlInput(input)) {
-          break;
-        }
-        const NodeDef* input_node = node_map_->GetNode(NodeName(input));
-        if (input_node == nullptr) {
-          LOG(ERROR) << "Bad input: " << input;
-          break;
-        }
-        // Forward constant inputs to outputs and add a control dependency on
-        // the IdentityN node.
-        if (IsReallyConstant(*input_node)) {
-          // Update each consumer.
-          for (NodeDef* consumer : consumers) {
-            bool add_dep = false;
-            for (int consumer_input_idx = 0;
-                 consumer_input_idx < consumer->input_size();
-                 ++consumer_input_idx) {
-              const string& consumer_input =
-                  consumer->input(consumer_input_idx);
-              if (IsControlInput(consumer_input)) {
-                break;
-              }
-              int output_idx;
-              const string input_node_name =
-                  ParseNodeName(consumer_input, &output_idx);
-              if (input_node_name == node->name() && output_idx == input_idx) {
-                consumer->set_input(consumer_input_idx, input);
-                // We will keep the input from IdentityN through a control
-                // dependency, so we only need to add the consumer as an output
-                // for the constant input node.
-                node_map_->AddOutput(NodeName(input), consumer->name());
-                add_dep = true;
-              }
+  return false;
+}
+
+bool ConstantFolding::ConstantPushDown(NodeDef* node) {
+  // Consider the transformation
+  //
+  //                      +                +       = parent
+  //                     / \              / \
+  //                    C   +    -- >    X   +     = children
+  //                       / \              / \
+  //                      X   Y            C   Y   = leaves
+  //
+  // where C is constant and X is non-constant, and '+' denotes an
+  // associative and commutative operator like addition or multiplication.
+  // This optimization pushes constants down in the tree to canonicalize it.
+  // Moreoever, in cases where the child node has a second constant input Y
+  // we will create a leaf node that can be folded, e.g.
+  //
+  //    Add(C1, Add(C2, X)) -> Add(X, Add(C1, C2)) -> Add(X, C1 + C2)
+  //
+  // TODO(rmlarsen): Handle non-associative/non-commutative operators like
+  // subtraction and division, as well as mixed subtraction/addition,
+  // division/multiplication.
+  // Don't touch BiasAdd since they can't handle vectors as their first
+  // inputs.
+  if (has_fetch_ && (IsAdd(*node) || IsMul(*node)) &&
+      NumNonControlInputs(*node) == 2) {
+    NodeDef* left_child = node_map_->GetNode(node->input(0));
+    NodeDef* right_child = node_map_->GetNode(node->input(1));
+    // One child must be constant, and the other the same op as the parent.
+    if (node->op() != left_child->op() && node->op() != right_child->op()) {
+      return false;
+    }
+    const bool left_child_is_constant = IsReallyConstant(*left_child);
+    const bool right_child_is_constant = IsReallyConstant(*right_child);
+    if (!left_child_is_constant && !right_child_is_constant) {
+      return false;
+    }
+    if (node->device() != left_child->device() ||
+        node->device() != right_child->device()) {
+      return false;
+    }
+    NodeDef* op_child_node = left_child_is_constant ? right_child : left_child;
+    NodeDef* const_child_node =
+        left_child_is_constant ? left_child : right_child;
+    // Make sure that it is safe to change the value of the child node->
+    if (op_child_node->input_size() < 2 ||
+        nodes_to_preserve_.find(op_child_node->name()) !=
+            nodes_to_preserve_.end() ||
+        NumNonControlOutputs(*op_child_node, *node_map_) > 1) {
+      return false;
+    }
+
+    // Identify the nodes to swap.
+    NodeDef* left_leaf = node_map_->GetNode(op_child_node->input(0));
+    NodeDef* right_leaf = node_map_->GetNode(op_child_node->input(1));
+    const bool left_leaf_is_constant = IsReallyConstant(*left_leaf);
+    const bool right_leaf_is_constant = IsReallyConstant(*right_leaf);
+    if (left_leaf_is_constant && right_leaf_is_constant) {
+      // Child is already foldable, leave it alone.
+      return false;
+    }
+    const int non_const_leaf_input = left_leaf_is_constant ? 1 : 0;
+    const int parent_const_input = left_child_is_constant ? 0 : 1;
+    const auto& child_output = node_map_->GetOutputs(op_child_node->name());
+    if (child_output.find(const_child_node) != child_output.end()) {
+      // If there is a control edge from the child op to C, the transformation
+      // would create a cycle in the graph. We know that it must be a control
+      // edge. We can replace such a control edge with a control edge from A
+      // to C.
+      CHECK(MaybeRemoveControlInput(op_child_node->name(), const_child_node,
+                                    graph_, node_map_.get()));
+      NodeDef* other_leaf = left_leaf_is_constant ? left_leaf : right_leaf;
+      MaybeAddControlInput(other_leaf->name(), const_child_node, graph_,
+                           node_map_.get());
+    }
+
+    // Swap the constant child with a non-constant leaf node.
+    node_map_->UpdateInput(node->name(), node->input(parent_const_input),
+                           op_child_node->input(non_const_leaf_input));
+    node_map_->UpdateInput(op_child_node->name(),
+                           op_child_node->input(non_const_leaf_input),
+                           node->input(parent_const_input));
+    std::swap(*node->mutable_input(parent_const_input),
+              *op_child_node->mutable_input(non_const_leaf_input));
+    return true;
+  }
+  return false;
+}
+
+bool ConstantFolding::MulConvPushDown(NodeDef* node,
+                                      const GraphProperties& properties) {
+  // Push down multiplication on ConvND.
+  //                       *                  ConvND
+  //                     /   \                /    \
+  //                 ConvND  C2    -- >      X      *
+  //                  / \                          / \
+  //                 X  C1                       C1  C2
+  //
+  // where C1 and C2 are constants and X is non-constant.
+  if (IsMul(*node) && NumNonControlInputs(*node) == 2) {
+    NodeDef* mul_left_child = node_map_->GetNode(node->input(0));
+    NodeDef* mul_right_child = node_map_->GetNode(node->input(1));
+    // One child must be constant, and the second must be Conv op.
+    const bool left_child_is_constant = IsReallyConstant(*mul_left_child);
+    const bool right_child_is_constant = IsReallyConstant(*mul_right_child);
+    if (!left_child_is_constant && !right_child_is_constant) {
+      return false;
+    }
+    NodeDef* conv_node =
+        left_child_is_constant ? mul_right_child : mul_left_child;
+    if (!IsConv2D(*conv_node) && !IsConv3D(*conv_node)) {
+      return false;
+    }
+    if (node->device() != mul_left_child->device() ||
+        node->device() != mul_right_child->device()) {
+      return false;
+    }
+
+    // Make sure that it is safe to change the value of the convolution
+    // output.
+    if (conv_node->input_size() < 2 ||
+        NumNonControlOutputs(*conv_node, *node_map_) > 1 ||
+        nodes_to_preserve_.find(conv_node->name()) !=
+            nodes_to_preserve_.end()) {
+      return false;
+    }
+
+    // Identify the nodes to swap.
+    NodeDef* conv_left_child = node_map_->GetNode(conv_node->input(0));
+    NodeDef* conv_right_child = node_map_->GetNode(conv_node->input(1));
+    const bool conv_left_is_constant = IsReallyConstant(*conv_left_child);
+    const bool conv_right_is_constant = IsReallyConstant(*conv_right_child);
+    if (!conv_left_is_constant && !conv_right_is_constant) {
+      // At least one of the convolution inputs should be constant.
+      return false;
+    }
+    if (conv_left_is_constant && conv_right_is_constant) {
+      // Leverage regular constant folding to handle this.
+      return false;
+    }
+    const auto& mul_props = properties.GetOutputProperties(node->name());
+    const auto& conv_props = properties.GetOutputProperties(conv_node->name());
+    if (mul_props.empty() || conv_props.empty()) {
+      return false;
+    }
+    const auto& mul_shape = mul_props[0].shape();
+    const auto& conv_shape = conv_props[0].shape();
+    if (!ShapesSymbolicallyEqual(mul_shape, conv_shape)) {
+      return false;
+    }
+
+    const auto& input_props = properties.GetInputProperties(conv_node->name());
+    if (input_props.size() < 2) {
+      return false;
+    }
+    const auto& filter_shape = input_props[1].shape();
+
+    NodeDef* const_node =
+        left_child_is_constant ? mul_left_child : mul_right_child;
+    const auto& const_props =
+        properties.GetOutputProperties(const_node->name());
+    if (const_props.empty()) {
+      return false;
+    }
+    const auto& const_shape = const_props[0].shape();
+
+    TensorShapeProto new_filter_shape;
+    if (!ShapeAfterBroadcast(filter_shape, const_shape, &new_filter_shape)) {
+      return false;
+    }
+    if (!ShapesSymbolicallyEqual(filter_shape, new_filter_shape)) {
+      return false;
+    }
+
+    string mul_new_name =
+        AddPrefixToNodeName("merged_input", conv_node->name());
+    if (node_map_->NodeExists(mul_new_name)) {
+      return false;
+    }
+    // Make sure we don't introduce loops in the graph by removing control
+    // dependencies from the conv2d node to c2.
+    NodeDef* conv_const_node =
+        conv_left_is_constant ? conv_left_child : conv_right_child;
+    if (MaybeRemoveControlInput(conv_node->name(), const_node, graph_,
+                                node_map_.get())) {
+      // Add a control dep from c1 to c2 to ensure c2 is in the right frame
+      *const_node->add_input() = AsControlDependency(*conv_const_node);
+    }
+
+    conv_node->set_name(node->name());
+    node->set_name(mul_new_name);
+    if (conv_left_is_constant) {
+      node_map_->UpdateInput(conv_node->name(), node->input(0), mul_new_name);
+      conv_node->set_input(0, mul_new_name);
+    } else {
+      node_map_->UpdateInput(conv_node->name(), node->input(1), mul_new_name);
+      conv_node->set_input(1, mul_new_name);
+    }
+    if (left_child_is_constant) {
+      node->set_input(1, conv_const_node->name());
+    } else {
+      node->set_input(0, conv_const_node->name());
+    }
+    node_map_->AddNode(mul_new_name, node);
+
+    return true;
+  }
+  return false;
+}
+
+bool ConstantFolding::PartialConstPropThroughIdentityN(NodeDef* node) {
+  // Partial constant propagation through IdentityN.
+  if (IsIdentityN(*node) && NumNonControlInputs(*node) > 0) {
+    const std::set<NodeDef*>& tmp = node_map_->GetOutputs(node->name());
+    const std::vector<NodeDef*> consumers(tmp.begin(), tmp.end());
+    bool updated_graph = false;
+    for (int input_idx = 0; input_idx < node->input_size(); ++input_idx) {
+      const string& input = node->input(input_idx);
+      if (IsControlInput(input)) {
+        break;
+      }
+      const NodeDef* input_node = node_map_->GetNode(NodeName(input));
+      if (input_node == nullptr) {
+        LOG(ERROR) << "Bad input: " << input;
+        break;
+      }
+      // Forward constant inputs to outputs and add a control dependency on
+      // the IdentityN node.
+      if (IsReallyConstant(*input_node)) {
+        // Update each consumer.
+        for (NodeDef* consumer : consumers) {
+          bool add_dep = false;
+          for (int consumer_input_idx = 0;
+               consumer_input_idx < consumer->input_size();
+               ++consumer_input_idx) {
+            const string& consumer_input = consumer->input(consumer_input_idx);
+            if (IsControlInput(consumer_input)) {
+              break;
             }
-            if (add_dep) {
-              consumer->add_input(AsControlDependency(node->name()));
-              updated_graph = true;
+            int output_idx;
+            const string input_node_name =
+                ParseNodeName(consumer_input, &output_idx);
+            if (input_node_name == node->name() && output_idx == input_idx) {
+              consumer->set_input(consumer_input_idx, input);
+              // We will keep the input from IdentityN through a control
+              // dependency, so we only need to add the consumer as an output
+              // for the constant input node.
+              node_map_->AddOutput(NodeName(input), consumer->name());
+              add_dep = true;
             }
           }
+          if (add_dep) {
+            consumer->add_input(AsControlDependency(node->name()));
+            updated_graph = true;
+          }
         }
       }
+    }
 
-      if (updated_graph) {
-        for (NodeDef* consumer : consumers) {
-          DedupControlInputs(consumer);
-        }
-        graph_modified_ = true;
-        continue;
+    if (updated_graph) {
+      for (NodeDef* consumer : consumers) {
+        DedupControlInputs(consumer);
       }
+      return true;
     }
+  }
+  return false;
+}
 
-    // Partial constant folding for associative operators:
-    // Split AddN/AccumulateNV2 to enable partial
-    // folding of ops when more than one but not all inputs are constant.
-    // For AddN and AccumulateNV2, we may furthermore reorder inputs, since
-    // addition is commutative.
-    const int num_non_control_inputs = NumNonControlInputs(*node);
-    if (IsAggregate(*node) && IsCommutative(*node) &&
-        num_non_control_inputs > 2) {
-      const int num_control_inputs =
-          node->input_size() - num_non_control_inputs;
-      std::vector<int> const_inputs;
-      std::vector<int> nonconst_inputs;
-      for (int i = 0; i < node->input_size(); ++i) {
-        const string& input = node->input(i);
-        const NodeDef* input_node = node_map_->GetNode(NodeName(input));
-        CHECK(input_node != nullptr) << input;
-        if (!IsControlInput(input) && IsReallyConstant(*input_node)) {
-          const_inputs.push_back(i);
-        } else {
-          // Non-const and control inputs.
-          nonconst_inputs.push_back(i);
+bool ConstantFolding::PartialAssocOpConstFolding(GraphDef* optimized_graph,
+                                                 GraphProperties* properties,
+                                                 NodeDef* node) {
+  // Partial constant folding for associative operators:
+  // Split AddN/AccumulateNV2 to enable partial
+  // folding of ops when more than one but not all inputs are constant.
+  // For AddN and AccumulateNV2, we may furthermore reorder inputs, since
+  // addition is commutative.
+  const int num_non_control_inputs = NumNonControlInputs(*node);
+  if (IsAggregate(*node) && IsCommutative(*node) &&
+      num_non_control_inputs > 2) {
+    const int num_control_inputs = node->input_size() - num_non_control_inputs;
+    std::vector<int> const_inputs;
+    std::vector<int> nonconst_inputs;
+    for (int i = 0; i < node->input_size(); ++i) {
+      const string& input = node->input(i);
+      const NodeDef* input_node = node_map_->GetNode(NodeName(input));
+      CHECK(input_node != nullptr) << input;
+      if (!IsControlInput(input) && IsReallyConstant(*input_node)) {
+        const_inputs.push_back(i);
+      } else {
+        // Non-const and control inputs.
+        nonconst_inputs.push_back(i);
+      }
+    }
+    // Promote AccumulateNV2 with all constant inputs to AddN, since it is
+    // a fake node that cannot be constant folded by itself.
+    if (const_inputs.size() == num_non_control_inputs &&
+        node->op() == "AccumulateNV2") {
+      node->set_op("AddN");
+      node->mutable_attr()->erase("shape");
+      return true;
+    }
+    const string new_node_name = OptimizedNodeName(
+        *node, strings::StrCat("_partial_split_", const_inputs.size()));
+    if (1 < const_inputs.size() &&
+        const_inputs.size() < num_non_control_inputs &&
+        !node_map_->NodeExists(new_node_name)) {
+      NodeDef* added_node = optimized_graph->add_node();
+      *added_node = *node;
+      // Always use AddN for the constant node, since AccumulateNV2 is a fake
+      // node that cannot be constant folded, since it does not have a kernel.
+      added_node->set_op("AddN");
+      added_node->mutable_attr()->erase("shape");
+      added_node->set_name(new_node_name);
+      node_map_->AddNode(added_node->name(), added_node);
+      added_node->clear_input();
+      for (int i : const_inputs) {
+        added_node->add_input(node->input(i));
+        node_map_->UpdateOutput(NodeName(node->input(i)), node->name(),
+                                added_node->name());
+      }
+
+      // Overwrite the first const input with the added node.
+      node->set_input(const_inputs[0], added_node->name());
+      node_map_->AddOutput(added_node->name(), node->name());
+      nonconst_inputs.push_back(const_inputs[0]);
+      // Compact the remaining inputs to the original node.
+      std::sort(nonconst_inputs.begin(), nonconst_inputs.end());
+      int idx = 0;
+      for (int i : nonconst_inputs) {
+        if (idx != i) {
+          node->set_input(idx, node->input(i));
         }
+        ++idx;
       }
-      // Promote AccumulateNV2 with all constant inputs to AddN, since it is
-      // a fake node that cannot be constant folded by itself.
-      if (const_inputs.size() == num_non_control_inputs &&
-          node->op() == "AccumulateNV2") {
-        node->set_op("AddN");
-        node->mutable_attr()->erase("shape");
-        graph_modified_ = true;
-        continue;
-      }
-      const string new_node_name = OptimizedNodeName(
-          *node, strings::StrCat("_partial_split_", const_inputs.size()));
-      if (1 < const_inputs.size() &&
-          const_inputs.size() < num_non_control_inputs &&
-          !node_map_->NodeExists(new_node_name)) {
-        NodeDef* added_node = optimized_graph->add_node();
-        *added_node = *node;
-        // Always use AddN for the constant node, since AccumulateNV2 is a fake
-        // node that cannot be constant folded, since it does not have a kernel.
-        added_node->set_op("AddN");
-        added_node->mutable_attr()->erase("shape");
-        added_node->set_name(new_node_name);
-        node_map_->AddNode(added_node->name(), added_node);
-        added_node->clear_input();
-        for (int i : const_inputs) {
-          added_node->add_input(node->input(i));
-          node_map_->UpdateOutput(NodeName(node->input(i)), node->name(),
-                                  added_node->name());
-        }
+      node->mutable_input()->DeleteSubrange(nonconst_inputs.size(),
+                                            const_inputs.size() - 1);
+      (*node->mutable_attr())["N"].set_i(node->input_size() -
+                                         num_control_inputs);
+      properties->ClearInputProperties(node->name());
+      (*added_node->mutable_attr())["N"].set_i(const_inputs.size());
+      return true;
+    }
+  }
+  return false;
+}
 
-        // Overwrite the first const input with the added node.
-        node->set_input(const_inputs[0], added_node->name());
-        node_map_->AddOutput(added_node->name(), node->name());
-        nonconst_inputs.push_back(const_inputs[0]);
-        // Compact the remaining inputs to the original node.
-        std::sort(nonconst_inputs.begin(), nonconst_inputs.end());
-        int idx = 0;
-        for (int i : nonconst_inputs) {
-          if (idx != i) {
-            node->set_input(idx, node->input(i));
-          }
-          ++idx;
-        }
-        node->mutable_input()->DeleteSubrange(nonconst_inputs.size(),
-                                              const_inputs.size() - 1);
-        (*node->mutable_attr())["N"].set_i(node->input_size() -
-                                           num_control_inputs);
-        properties->ClearInputProperties(node->name());
-        (*added_node->mutable_attr())["N"].set_i(const_inputs.size());
-        graph_modified_ = true;
-        continue;
-      }
+bool ConstantFolding::PartialConcatConstFolding(GraphDef* optimized_graph,
+                                                GraphProperties* properties,
+                                                NodeDef* node) {
+  // Partial constant folding for Concat which is not commutative, so
+  // we have to preserve order and can only push consecutive runs of constant
+  // inputs into sub-nodes.
+  const int num_non_control_inputs = NumNonControlInputs(*node);
+  if (IsConcat(*node) && num_non_control_inputs > 3 &&
+      node->name().rfind("_partial_split_") == string::npos) {
+    int axis_arg = -1;
+    int begin = 0;
+    int end = num_non_control_inputs;
+    if (node->op() == "Concat") {
+      begin = 1;
+      axis_arg = 0;
+    } else if (node->op() == "ConcatV2") {
+      end = num_non_control_inputs - 1;
+      axis_arg = num_non_control_inputs - 1;
+    } else {
+      return false;
     }
 
-    // Partial constant folding for Concat which is not commutative, so
-    // we have to preserve order and can only push consecutive runs of constant
-    // inputs into sub-nodes.
-    if (IsConcat(*node) && num_non_control_inputs > 3 &&
-        node->name().rfind("_partial_split_") == string::npos) {
-      int axis_arg = -1;
-      int begin = 0;
-      int end = num_non_control_inputs;
-      if (node->op() == "Concat") {
-        begin = 1;
-        axis_arg = 0;
-      } else if (node->op() == "ConcatV2") {
-        end = num_non_control_inputs - 1;
-        axis_arg = num_non_control_inputs - 1;
-      } else {
-        continue;
-      }
+    const NodeDef* axis_arg_node =
+        node_map_->GetNode(NodeName(node->input(axis_arg)));
+    if (axis_arg_node == nullptr || !IsReallyConstant(*axis_arg_node)) {
+      // We cannot constant fold Concat unless we the axis argument is
+      // constant. Skip node.
+      return false;
+    }
 
-      const NodeDef* axis_arg_node =
-          node_map_->GetNode(NodeName(node->input(axis_arg)));
-      if (axis_arg_node == nullptr || !IsReallyConstant(*axis_arg_node)) {
-        // We cannot constant fold Concat unless we the axis argument is
-        // constant. Skip node.
-        continue;
+    // We search for consecutive runs of constant inputs in the range
+    // [begin:end[ and push then down into child nodes.
+    std::vector<std::pair<int, int>> constant_input_runs;
+    int first = begin;
+    int last = begin;
+    while (last < end) {
+      while (first < end && !IsReallyConstant(*node_map_->GetNode(
+                                NodeName(node->input(first))))) {
+        ++first;
       }
-
-      // We search for consecutive runs of constant inputs in the range
-      // [begin:end[ and push then down into child nodes.
-      std::vector<std::pair<int, int>> constant_input_runs;
-      int first = begin;
-      int last = begin;
-      while (last < end) {
-        while (first < end && !IsReallyConstant(*node_map_->GetNode(
-                                  NodeName(node->input(first))))) {
-          ++first;
-        }
-        // Invariant: node[first] is constant || first >= end.
-        last = first + 1;
-        while (last < end && IsReallyConstant(*node_map_->GetNode(
-                                 NodeName(node->input(last))))) {
-          ++last;
-        }
-        // Invariant: node[last] is not constant || last >= end
-        // Discard intervals shorter than 2 elements.
-        if (first < end && (last - first) > 1) {
-          constant_input_runs.emplace_back(first, last);
-        }
-        first = last;
+      // Invariant: node[first] is constant || first >= end.
+      last = first + 1;
+      while (last < end && IsReallyConstant(*node_map_->GetNode(
+                               NodeName(node->input(last))))) {
+        ++last;
       }
+      // Invariant: node[last] is not constant || last >= end
+      // Discard intervals shorter than 2 elements.
+      if (first < end && (last - first) > 1) {
+        constant_input_runs.emplace_back(first, last);
+      }
+      first = last;
+    }
 
-      // Skip if all inputs are constant, and let constant folding take over.
-      if (constant_input_runs.size() == 1 &&
-          constant_input_runs[0].first == begin &&
-          constant_input_runs[0].second == end) {
-        continue;
+    // Skip if all inputs are constant, and let constant folding take over.
+    if (constant_input_runs.size() == 1 &&
+        constant_input_runs[0].first == begin &&
+        constant_input_runs[0].second == end) {
+      return false;
+    }
+    std::set<int> inputs_to_delete;
+    for (auto interval : constant_input_runs) {
+      // Push the constant inputs in the interval to a child node than can be
+      // constant folded.
+      const string new_node_name = OptimizedNodeName(
+          *node, strings::StrCat("_partial_split_", interval.first));
+      if (node_map_->NodeExists(new_node_name)) {
+        break;
       }
-      std::set<int> inputs_to_delete;
-      for (auto interval : constant_input_runs) {
-        // Push the constant inputs in the interval to a child node than can be
-        // constant folded.
-        const string new_node_name = OptimizedNodeName(
-            *node, strings::StrCat("_partial_split_", interval.first));
-        if (node_map_->NodeExists(new_node_name)) {
-          break;
-        }
-        NodeDef* added_node = optimized_graph->add_node();
-        *added_node = *node;
-        added_node->set_name(new_node_name);
-        node_map_->AddNode(added_node->name(), added_node);
-        added_node->clear_input();
-        for (int i = interval.first; i < interval.second; ++i) {
-          added_node->add_input(node->input(i));
-          node_map_->UpdateOutput(NodeName(node->input(i)), node->name(),
-                                  added_node->name());
-          if (i != interval.first) {
-            inputs_to_delete.insert(i);
-          }
+      NodeDef* added_node = optimized_graph->add_node();
+      *added_node = *node;
+      added_node->set_name(new_node_name);
+      node_map_->AddNode(added_node->name(), added_node);
+      added_node->clear_input();
+      for (int i = interval.first; i < interval.second; ++i) {
+        added_node->add_input(node->input(i));
+        node_map_->UpdateOutput(NodeName(node->input(i)), node->name(),
+                                added_node->name());
+        if (i != interval.first) {
+          inputs_to_delete.insert(i);
         }
-        added_node->add_input(node->input(axis_arg));
-        (*added_node->mutable_attr())["N"].set_i(interval.second -
-                                                 interval.first);
-        node_map_->AddOutput(NodeName(node->input(axis_arg)),
-                             added_node->name());
-
-        // Overwrite the first constant input with the result of the added
-        // child node.
-        node->set_input(interval.first, added_node->name());
-        node_map_->AddOutput(added_node->name(), node->name());
-      }
-      if (!constant_input_runs.empty()) {
-        graph_modified_ = true;
-        if (!inputs_to_delete.empty()) {
-          // Fix up the inputs to the original node.
-          std::vector<string> tmp(node->input().begin(), node->input().end());
-          node->clear_input();
-          for (int i = 0; i < tmp.size(); ++i) {
-            if (inputs_to_delete.find(i) == inputs_to_delete.end()) {
-              node->add_input(tmp[i]);
-            }
+      }
+      added_node->add_input(node->input(axis_arg));
+      (*added_node->mutable_attr())["N"].set_i(interval.second -
+                                               interval.first);
+      node_map_->AddOutput(NodeName(node->input(axis_arg)), added_node->name());
+
+      // Overwrite the first constant input with the result of the added
+      // child node.
+      node->set_input(interval.first, added_node->name());
+      node_map_->AddOutput(added_node->name(), node->name());
+    }
+    if (!constant_input_runs.empty()) {
+      if (!inputs_to_delete.empty()) {
+        // Fix up the inputs to the original node.
+        std::vector<string> tmp(node->input().begin(), node->input().end());
+        node->clear_input();
+        for (int i = 0; i < tmp.size(); ++i) {
+          if (inputs_to_delete.find(i) == inputs_to_delete.end()) {
+            node->add_input(tmp[i]);
           }
-          (*node->mutable_attr())["N"].set_i(node->input_size() - 1);
-          properties->ClearInputProperties(node->name());
         }
-        continue;
+        (*node->mutable_attr())["N"].set_i(node->input_size() - 1);
+        properties->ClearInputProperties(node->name());
       }
+      return true;
     }
   }
-
-  return Status::OK();
+  return false;
 }
 
 Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
@@ -2536,6 +2876,14 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
     cpu_device_ = owned_device_.get();
   }
 
+  graph_contains_assign_or_inplace_op_ = false;
+  for (const NodeDef& node : item.graph.node()) {
+    if (ModifiesInputsInPlace(node) || MaybeHasRefInput(node)) {
+      graph_contains_assign_or_inplace_op_ = true;
+      break;
+    }
+  }
+
   has_fetch_ = !item.fetch.empty();
   GrapplerItem item_to_optimize = item;
   *optimized_graph = item.graph;
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index a694f1721ad416b1217b860a293a62440f9d980f..e477934f308b243ac6273fdd79d334445d4254c8 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -88,19 +88,57 @@ class ConstantFolding : public GraphOptimizer {
   Status ReplaceOperationWithConstant(double value,
                                       const GraphProperties& properties,
                                       const TensorShapeProto& shape,
-                                      NodeDef* node, GraphDef* graph);
+                                      NodeDef* node, GraphDef* graph,
+                                      bool* success);
   void ReplaceDivisionOfOnesByReciprocal(NodeDef* node, GraphDef* graph);
   Status FoldGraph(GraphDef* output);
 
-  bool IsSimplifiableReduction(const NodeDef& node) const;
+  bool IsSimplifiableReduction(const NodeDef& node,
+                               const GraphProperties& properties) const;
   bool IsSimplifiableReshape(const NodeDef& node,
                              const GraphProperties& properties) const;
   Status SimplifyGraph(GraphDef* output, GraphProperties* properties,
                        bool use_shape_info);
+  Status SimplifyNode(NodeDef* node, GraphDef* optimized_graph,
+                      GraphProperties* properties, bool use_shape_info);
 
   Status RunOptimizationPass(Cluster* cluster, const GrapplerItem& item,
                              GraphDef* output);
 
+  // Applies partial constant folding for Concat which is not commutative.
+  // Returns true if the transformation applied successfully.
+  bool PartialConcatConstFolding(GraphDef* optimized_graph,
+                                 GraphProperties* properties, NodeDef* node);
+
+  // Applies partial constant folding for associative operators AddN and
+  // AccumulateNV2. Returns true if the transformation applied successfully.
+  bool PartialAssocOpConstFolding(GraphDef* optimized_graph,
+                                  GraphProperties* properties, NodeDef* node);
+
+  // Applies partial constant propagation through IdentityN operator.
+  // Returns true if the transformation applied successfully.
+  bool PartialConstPropThroughIdentityN(NodeDef* node);
+
+  // Pushes down constants on '+' and '*' operators if applicable. Returns true
+  // the transformation applied successfully.
+  bool ConstantPushDown(NodeDef* node);
+
+  // Aggregate constants present around a conv operator. Returns true if the
+  // transformation was applied successfully.
+  bool MulConvPushDown(NodeDef* node, const GraphProperties& properties);
+
+  // Strength reduces floating point division by a constant Div(x, const) to
+  // multiplication by the reciprocal Mul(x, Reciprocal(const)).
+  bool ReduceDivToReciprocalMul(GraphDef* optimized_graph, NodeDef* node);
+
+  // Simplifies arithmetic operations with ones or zeros. Returns the status,
+  // and updates the success input argument that denotes if any simplification
+  // was applied.
+  Status SimplifyArithmeticOperations(GraphDef* optimized_graph,
+                                      GraphProperties* properties,
+                                      NodeDef* node, bool use_shape_info,
+                                      bool* success);
+
   // Points to an externally provided device or to owned_device_;
   RewriterConfig::Toggle opt_level_;
   DeviceBase* cpu_device_;
@@ -114,6 +152,7 @@ class ConstantFolding : public GraphOptimizer {
   std::unordered_set<string> feed_nodes_;
   bool has_fetch_;
   bool graph_modified_;
+  bool graph_contains_assign_or_inplace_op_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index f018b217e66365b3d4af2270573e1000041a84b8..9f051ca248b75f3578100ed74c242640d2ac849c 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -33,77 +33,88 @@ class ConstantFoldingTest : public GrapplerTest {
  protected:
   template <DataType DTYPE>
   void SimpleNeutralElementTest() {
-    typedef typename EnumToDataType<DTYPE>::Type T;
-    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-    Output x = ops::Placeholder(s.WithOpName("x"), DTYPE,
-                                ops::Placeholder::Shape(TensorShape({2, 2})));
-    Tensor zeros_t(DTYPE, TensorShape({2, 2}));
-    Tensor ones_t(DTYPE, TensorShape({2, 2}));
-    Tensor x_t(DTYPE, TensorShape({2, 2}));
-    for (int i = 0; i < 4; ++i) {
-      zeros_t.flat<T>()(i) = T(0);
-      ones_t.flat<T>()(i) = T(1);
-      x_t.flat<T>()(i) = T(i + 1);
-    }
-    Output zeros = ops::Const(s.WithOpName("zeros"), zeros_t);
-    Output ones = ops::Const(s.WithOpName("ones"), ones_t);
-    Output mul1;
-    Output mul2;
-    Output add1;
-    Output add2;
-    if (DTYPE == DT_BOOL) {
-      mul1 = ops::LogicalAnd(s.WithOpName("mul1"), x, zeros);
-      mul2 = ops::LogicalAnd(s.WithOpName("mul2"), x, ones);
-      add1 = ops::LogicalOr(s.WithOpName("add1"), x, zeros);
-      add2 = ops::LogicalOr(s.WithOpName("add2"), x, ones);
-    } else {
-      mul1 = ops::Mul(s.WithOpName("mul1"), x, zeros);
-      mul2 = ops::Mul(s.WithOpName("mul2"), x, ones);
-      add1 = ops::Add(s.WithOpName("add1"), x, zeros);
-      add1 = ops::Add(s.WithOpName("add2"), x, ones);
-    }
-    GrapplerItem item;
-    TF_CHECK_OK(s.ToGraphDef(&item.graph));
-    item.fetch = {"mul1", "mul2", "add1", "add2"};
-    ConstantFolding optimizer(nullptr /* cpu_device */);
-    GraphDef output;
-    Status status = optimizer.Optimize(nullptr, item, &output);
-    TF_EXPECT_OK(status);
-
-    EXPECT_EQ(7, output.node_size());
-    for (int i = 0; i < output.node_size(); ++i) {
-      const NodeDef& node = output.node(i);
-      const string& name = node.name();
-      if (name == "mul1") {
-        EXPECT_EQ("Const", node.op());
-        EXPECT_EQ("^x", node.input(0));
-        EXPECT_EQ("^zeros", node.input(1));
-      } else if (name == "mul2") {
-        EXPECT_EQ("Snapshot", node.op());
-        EXPECT_EQ("x", node.input(0));
-        EXPECT_EQ("^ones", node.input(1));
-      } else if (name == "add1") {
-        EXPECT_EQ("Snapshot", node.op());
-        EXPECT_EQ("x", node.input(0));
-        EXPECT_EQ("^zeros", node.input(1));
-      } else if (name == "add2") {
-        if (DTYPE == DT_BOOL) {
+    for (bool use_snapshot : {false, true}) {
+      typedef typename EnumToDataType<DTYPE>::Type T;
+      tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+      Output x = ops::Placeholder(s.WithOpName("x"), DTYPE,
+                                  ops::Placeholder::Shape(TensorShape({2, 2})));
+      Output v = ops::Variable(s.WithOpName("v"), {2, 2}, DTYPE);
+      Tensor zeros_t(DTYPE, TensorShape({2, 2}));
+      Tensor ones_t(DTYPE, TensorShape({2, 2}));
+      Tensor x_t(DTYPE, TensorShape({2, 2}));
+      for (int i = 0; i < 4; ++i) {
+        zeros_t.flat<T>()(i) = T(0);
+        ones_t.flat<T>()(i) = T(1);
+        x_t.flat<T>()(i) = T(i + 1);
+      }
+      Output zeros = ops::Const(s.WithOpName("zeros"), zeros_t);
+      Output ones = ops::Const(s.WithOpName("ones"), ones_t);
+      Output mul1;
+      Output mul2;
+      Output add1;
+      Output add2;
+      if (DTYPE == DT_BOOL) {
+        mul1 = ops::LogicalAnd(s.WithOpName("mul1"), x, zeros);
+        mul2 = ops::LogicalAnd(s.WithOpName("mul2"), x, ones);
+        add1 = ops::LogicalOr(s.WithOpName("add1"), x, zeros);
+        add2 = ops::LogicalOr(s.WithOpName("add2"), x, ones);
+      } else {
+        mul1 = ops::Mul(s.WithOpName("mul1"), x, zeros);
+        mul2 = ops::Mul(s.WithOpName("mul2"), x, ones);
+        add1 = ops::Add(s.WithOpName("add1"), x, zeros);
+        add1 = ops::Add(s.WithOpName("add2"), x, ones);
+      }
+      if (use_snapshot) {
+        // Add an op with ref input to prevent Snapshot from being
+        // turned into Identity.
+        ops::Assign(s.WithOpName("assign"), v, ones);
+      }
+      GrapplerItem item;
+      TF_CHECK_OK(s.ToGraphDef(&item.graph));
+      item.fetch = {"mul1", "mul2", "add1", "add2"};
+      ConstantFolding optimizer(nullptr /* cpu_device */);
+      GraphDef output;
+      Status status = optimizer.Optimize(nullptr, item, &output);
+      TF_EXPECT_OK(status);
+
+      EXPECT_EQ(7, output.node_size());
+      const string snapshot_or_identity =
+          use_snapshot ? "Snapshot" : "Identity";
+      for (int i = 0; i < output.node_size(); ++i) {
+        const NodeDef& node = output.node(i);
+        const string& name = node.name();
+        if (name == "mul1") {
           EXPECT_EQ("Const", node.op());
           EXPECT_EQ("^x", node.input(0));
+          EXPECT_EQ("^zeros", node.input(1));
+        } else if (name == "mul2") {
+          EXPECT_EQ(snapshot_or_identity, node.op());
+          EXPECT_EQ("x", node.input(0));
           EXPECT_EQ("^ones", node.input(1));
-        } else {
-          EXPECT_EQ("Add", node.op());
+        } else if (name == "add1") {
+          EXPECT_EQ(snapshot_or_identity, node.op());
           EXPECT_EQ("x", node.input(0));
-          EXPECT_EQ("ones", node.input(1));
+          EXPECT_EQ("^zeros", node.input(1));
+        } else if (name == "add2") {
+          if (DTYPE == DT_BOOL) {
+            EXPECT_EQ("Const", node.op());
+            EXPECT_EQ("^x", node.input(0));
+            EXPECT_EQ("^ones", node.input(1));
+          } else {
+            EXPECT_EQ("Add", node.op());
+            EXPECT_EQ("x", node.input(0));
+            EXPECT_EQ("ones", node.input(1));
+          }
         }
       }
-    }
-    auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
-    auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
-    EXPECT_EQ(4, tensors_expected.size());
-    EXPECT_EQ(4, tensors.size());
-    for (int i = 0; i < item.fetch.size(); ++i) {
-      test::ExpectTensorEqual<T>(tensors_expected[i], tensors[i]);
+      auto tensors_expected =
+          EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
+      auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
+      EXPECT_EQ(4, tensors_expected.size());
+      EXPECT_EQ(4, tensors.size());
+      for (int i = 0; i < item.fetch.size(); ++i) {
+        test::ExpectTensorEqual<T>(tensors_expected[i], tensors[i]);
+      }
     }
   }
 };
@@ -229,6 +240,77 @@ TEST_F(ConstantFoldingTest, AddTree) {
   }
 }
 
+TEST_F(ConstantFoldingTest, ConvPushDownTest) {
+  // Tests if the following rewrite is performed:
+  //
+  //         *                       Conv2D
+  //        / \                       / \
+  //       c  Conv2D        -->      x  (c * filter)
+  //           / \
+  //          x  filter
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  int input_depth = 3;
+  int filter_count = 5;
+  int filter_size = 2;
+  TensorShape filter_shape(
+      {filter_size, filter_size, input_depth, filter_count});
+  Tensor filter_values(DT_FLOAT, filter_shape);
+  for (int i = 0; i < filter_values.NumElements(); ++i) {
+    filter_values.flat<float>()(i) = std::sqrt(static_cast<float>(i));
+  }
+  Output filter =
+      ops::Const(s.WithOpName("filter"), Input::Initializer(filter_values));
+
+  int batch_size = 4;
+  int input_dim = 10;
+  TensorShape input_shape({batch_size, input_dim, input_dim, input_depth});
+  Output input = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                                  ops::Placeholder::Shape(input_shape));
+
+  Output conv =
+      ops::Conv2D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1}, "VALID");
+  Output c = ops::Const(s.WithOpName("c"), 3.0f, {1});
+  Output mul = ops::Mul(s.WithOpName("mul"), c, conv);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding fold(nullptr);
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  std::cout << output.DebugString() << std::endl;
+
+  EXPECT_EQ(5, output.node_size());
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "mul") {
+      found++;
+      EXPECT_EQ("Conv2D", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("conv/merged_input", node.input(1));
+    } else if (node.name() == "conv/merged_input") {
+      found++;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(0, node.input_size());
+    }
+  }
+  EXPECT_EQ(2, found);
+
+  // Check that const folded multiplication node has the expected value.
+  std::vector<string> fetch = {"mul"};
+  Tensor value(DT_FLOAT, input_shape);
+  for (int i = 0; i < value.NumElements(); ++i) {
+    value.flat<float>()(i) = i;
+  }
+  auto actual = EvaluateNodes(output, fetch, {{"x", value}});
+  auto expected = EvaluateNodes(item.graph, fetch, {{"x", value}});
+  test::ExpectTensorEqual<float>(expected[0], actual[0]);
+}
+
 TEST_F(ConstantFoldingTest, NeutralElement) {
   int kConst = 0;
   int kLike = 1;
@@ -309,11 +391,11 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
         EXPECT_EQ(ctrl_zeros_name, node.input(0));
         EXPECT_EQ("^y", node.input(1));
       } else if (name == "mul3") {
-        EXPECT_EQ("Snapshot", node.op());
+        EXPECT_EQ("Identity", node.op());
         EXPECT_EQ("x", node.input(0));
         EXPECT_EQ(ctrl_ones_name, node.input(1));
       } else if (name == "mul4") {
-        EXPECT_EQ("Snapshot", node.op());
+        EXPECT_EQ("Identity", node.op());
         EXPECT_EQ("y", node.input(0));
         EXPECT_EQ(ctrl_ones_name, node.input(1));
       } else if (name == "mul5") {
@@ -325,7 +407,7 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
         EXPECT_EQ("^zeros_1d", node.input(0));
         EXPECT_EQ("^y", node.input(1));
       } else if (name == "div1") {
-        EXPECT_EQ("Snapshot", node.op());
+        EXPECT_EQ("Identity", node.op());
         EXPECT_EQ("x", node.input(0));
         EXPECT_EQ(ctrl_ones_name, node.input(1));
       } else if (name == "div2") {
@@ -361,15 +443,15 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
         EXPECT_EQ(2, t.tensor_shape().dim(0).size());
         EXPECT_EQ(3, t.tensor_shape().dim(1).size());
       } else if (name == "add1") {
-        EXPECT_EQ("Snapshot", node.op());
+        EXPECT_EQ("Identity", node.op());
         EXPECT_EQ("x", node.input(0));
         EXPECT_EQ(ctrl_zeros_name, node.input(1));
       } else if (name == "add2") {
-        EXPECT_EQ("Snapshot", node.op());
+        EXPECT_EQ("Identity", node.op());
         EXPECT_EQ("y", node.input(0));
         EXPECT_EQ(ctrl_zeros_name, node.input(1));
       } else if (name == "bias_add1") {
-        EXPECT_EQ("Snapshot", node.op());
+        EXPECT_EQ("Identity", node.op());
         EXPECT_EQ("x", node.input(0));
         EXPECT_EQ("^zeros_1d", node.input(1));
       } else if (name == "bias_add2") {
@@ -378,7 +460,7 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
         EXPECT_EQ(zeros_name, node.input(0));
         EXPECT_EQ("bias", node.input(1));
       } else if (name == "sub1") {
-        EXPECT_EQ("Snapshot", node.op());
+        EXPECT_EQ("Identity", node.op());
         EXPECT_EQ("x", node.input(0));
         EXPECT_EQ(ctrl_zeros_name, node.input(1));
       } else if (name == "sub2") {
@@ -1801,6 +1883,118 @@ TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
   }
 }
 
+TEST_F(ConstantFoldingTest, StridedSliceWithSameDimensionRemoval) {
+  {  // no mask
+    tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+    auto in1 = ops::Variable(scope.WithOpName("in1"), {3, 5, 2}, DT_FLOAT);
+    auto begin = ops::Const(scope.WithOpName("begin"), {0, 0}, {2});
+    auto end = ops::Const(scope.WithOpName("end"), {3, 5}, {2});
+    auto strides = ops::Const(scope.WithOpName("strides"), {1, 1}, {2});
+    Output in2 = ops::Variable(scope.WithOpName("in2"), {4, 6, 2}, DT_FLOAT);
+    ops::StridedSlice s1(scope.WithOpName("s1"), in1, begin, end, strides);
+    ops::StridedSlice s2(scope.WithOpName("s2"), in2, begin, end, strides);
+
+    ops::Add out(scope.WithOpName("out"), s1, s2);
+
+    GrapplerItem item;
+    item.fetch = {"out"};
+    TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+    ConstantFolding optimizer(nullptr /* cpu_device */);
+    GraphDef got;
+    Status status = optimizer.Optimize(nullptr, item, &got);
+    TF_EXPECT_OK(status);
+
+    GraphDef want;
+    AddNode("in1", "VariableV2", {}, {}, &want);
+    AddNode("in2", "VariableV2", {}, {}, &want);
+    AddNode("begin", "Const", {}, {}, &want);
+    AddNode("end", "Const", {}, {}, &want);
+    AddNode("strides", "Const", {}, {}, &want);
+    AddNode("s1", "Identity",
+            {"in1", AsControlDependency("begin"), AsControlDependency("end"),
+             AsControlDependency("strides")},
+            {}, &want);
+    AddNode("s2", "StridedSlice", {"in2", "begin", "end", "strides"}, {},
+            &want);
+    AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+    CompareGraphs(want, got);
+
+    auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5, 2}));
+    auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6, 2}));
+    auto tensors_expected =
+        EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors_expected.size());
+    auto tensors =
+        EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
+  }
+  {  // with begin/end/ellipsis mask
+    tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+    // s1 = in1[:, ..., 0:5, 0:6]
+    auto in1 =
+        ops::Variable(scope.WithOpName("in1"), {2, 3, 4, 5, 6}, DT_FLOAT);
+    auto begin1 = ops::Const(scope.WithOpName("begin1"), {0, 0, 0}, {3});
+    auto end1 = ops::Const(scope.WithOpName("end1"), {0, 5, 6}, {3});
+    auto strides1 = ops::Const(scope.WithOpName("strides1"), {1, 1, 1}, {3});
+    ops::StridedSlice s1(
+        scope.WithOpName("s1"), in1, begin1, end1, strides1,
+        ops::StridedSlice::Attrs().BeginMask(1).EndMask(1).EllipsisMask(2));
+
+    Output in2 =
+        ops::Variable(scope.WithOpName("in2"), {5, 8, 5, 6, 9}, DT_FLOAT);
+    auto begin2 = ops::Const(scope.WithOpName("begin2"), {0, 0, 0, 0, 0}, {5});
+    auto end2 = ops::Const(scope.WithOpName("end2"), {2, 3, 4, 5, 6}, {5});
+    auto strides2 =
+        ops::Const(scope.WithOpName("strides2"), {1, 1, 1, 1, 1}, {5});
+    ops::StridedSlice s2(scope.WithOpName("s2"), in2, begin2, end2, strides2);
+
+    ops::Add out(scope.WithOpName("out"), s1, s2);
+
+    GrapplerItem item;
+    item.fetch = {"out"};
+    TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+    ConstantFolding optimizer(nullptr /* cpu_device */);
+    GraphDef got;
+    Status status = optimizer.Optimize(nullptr, item, &got);
+    TF_EXPECT_OK(status);
+
+    GraphDef want;
+    AddNode("in1", "VariableV2", {}, {}, &want);
+    AddNode("in2", "VariableV2", {}, {}, &want);
+    AddNode("begin1", "Const", {}, {}, &want);
+    AddNode("end1", "Const", {}, {}, &want);
+    AddNode("strides1", "Const", {}, {}, &want);
+    AddNode("s1", "Identity",
+            {"in1", AsControlDependency("begin1"), AsControlDependency("end1"),
+             AsControlDependency("strides1")},
+            {}, &want);
+    AddNode("begin2", "Const", {}, {}, &want);
+    AddNode("end2", "Const", {}, {}, &want);
+    AddNode("strides2", "Const", {}, {}, &want);
+    AddNode("s2", "StridedSlice", {"in2", "begin2", "end2", "strides2"}, {},
+            &want);
+    AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+    CompareGraphs(want, got);
+
+    auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 3, 4, 5, 6}));
+    auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 8, 5, 6, 9}));
+    auto tensors_expected =
+        EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors_expected.size());
+    auto tensors =
+        EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
+  }
+}
+
 TEST_F(ConstantFoldingTest, TileWithMultipliesBeingOne) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
@@ -1929,7 +2123,7 @@ TEST_F(ConstantFoldingTest, SqueezeWithAllDimesionsGreaterThanOne) {
 }
 
 TEST_F(ConstantFoldingTest, NoOpReduction) {
-  // Build a simple graph with a reduction that can be reduced to the
+  // Build a simple graph with reductions that can be reduced to the
   // identity.
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
@@ -1940,8 +2134,15 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
   Output p = ops::Prod(scope.WithOpName("p"), v, i);
   Output s = ops::Square(scope.WithOpName("s"), p);
 
+  Output v2 = ops::Variable(scope.WithOpName("v2"), {3, 5, 1}, DT_FLOAT);
+  Output c2 =
+      ops::Const(scope.WithOpName("c2").WithControlDependencies(v), 2, {1});
+  ops::Prod::Attrs attr;
+  attr = attr.KeepDims(true);
+  Output p2 = ops::Prod(scope.WithOpName("p2"), v2, c2, attr);
+
   GrapplerItem item;
-  item.fetch.push_back("s");
+  item.fetch = {"s", "p2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
   ConstantFolding optimizer(nullptr /* cpu_device */);
@@ -1949,24 +2150,33 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  bool found = false;
+  int found = 0;
   for (const auto& node : output.node()) {
     if (node.name() == "p") {
-      found = true;
+      found++;
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("v", node.input(0));
       EXPECT_EQ("^i", node.input(1));
+    } else if (node.name() == "p2") {
+      found++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("v2", node.input(0));
+      EXPECT_EQ("^c2", node.input(1));
     }
   }
-  EXPECT_TRUE(found);
+  EXPECT_EQ(2, found);
 
   auto v_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5, 7}));
-  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"v", v_t}});
-  EXPECT_EQ(1, tensors_expected.size());
-  auto tensors = EvaluateNodes(output, item.fetch, {{"v", v_t}});
-  EXPECT_EQ(1, tensors.size());
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5, 1}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"v", v_t}, {"v2", v2_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch, {{"v", v_t}, {"v2", v2_t}});
+  EXPECT_EQ(2, tensors.size());
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, NoOpReshape) {
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 7b7fd8115588a6dceb4a74d502e2883a84f57199..200454b5222c9520faabf83378c7a9d23b665436 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -126,9 +126,9 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
     return false;
   }
   const std::unordered_set<string> do_not_rewrite_ops{
-      "Assert",      "CheckNumerics",         "_Retval",
-      "_Arg",        "_ParallelConcatUpdate", "_TPUExecute",
-      "_TPUCompile", "ControlTrigger"};
+      "Assert",     "CheckNumerics",         "_Retval",
+      "_Arg",       "_ParallelConcatUpdate", "TPUExecute",
+      "TPUCompile", "ControlTrigger"};
   if (do_not_rewrite_ops.find(node.op()) != do_not_rewrite_ops.end()) {
     return false;
   }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 1bec9086f7151fb35a4c6bc4c76518503771c9a0..611d871eeabb87746abc1e83ea1c195a634217bd 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -14,10 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+
 #include <unordered_map>
+
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph_def_util.h"
@@ -74,11 +77,85 @@ string UniqueSpecializedFunctionName(const FunctionDef& func,
   return unique_name;
 }
 
+// Specialized function instantiation type parameters, body parameters, and
+// const inputs.
+struct FunctionSpecializationSignature {
+  string func_name;
+  std::unordered_map<string, DataType> type_parameters;
+  std::unordered_map<string, AttrValue> body_parameters;
+  std::unordered_map<int, string> const_inputs;
+
+  bool operator==(const FunctionSpecializationSignature& other) const {
+    bool equals = func_name == other.func_name &&
+                  type_parameters == other.type_parameters &&
+                  const_inputs == other.const_inputs;
+
+    if (!equals) return false;
+
+    // Equality is not defined for AttrValue.
+    if (body_parameters.size() != other.body_parameters.size()) return false;
+
+    for (const auto& lhs : body_parameters) {
+      auto it = other.body_parameters.find(lhs.first);
+      if (it == other.body_parameters.end()) return false;
+      if (!FastAreAttrValuesEqual(lhs.second, (*it).second)) return false;
+    }
+
+    return true;
+  }
+
+  struct Hash {
+    uint64 operator()(FunctionSpecializationSignature const& s) const {
+      uint64 h = Hash64(s.func_name);
+
+      // Use std::map for deterministic iteration order.
+
+      std::map<string, DataType> types(s.type_parameters.begin(),
+                                       s.type_parameters.end());
+      for (const auto& pair : types) {
+        AttrValue attr_value;
+        attr_value.set_type(pair.second);
+        h = Hash64Combine(Hash64(pair.first), h);
+        h = Hash64Combine(AttrValueHash(attr_value), h);
+      }
+
+      std::map<string, AttrValue> body(s.body_parameters.begin(),
+                                       s.body_parameters.end());
+      for (const auto& pair : body) {
+        h = Hash64Combine(Hash64(pair.first), h);
+        h = Hash64Combine(FastAttrValueHash(pair.second), h);
+      }
+
+      std::map<int, string> inputs(s.const_inputs.begin(),
+                                   s.const_inputs.end());
+      for (const auto& pair : inputs) {
+        h = Hash64Combine(std::hash<int>()(pair.first), h);
+        h = Hash64Combine(Hash64(pair.second), h);
+      }
+
+      return h;
+    }
+  };
+};
+
+struct FunctionSpecialization {
+  string specialized_func_name;
+  std::unordered_set<string> const_inputs;
+  std::unordered_set<string> control_deps;
+};
+
+class FakeCPUDevice : public Device {
+ public:
+  FakeCPUDevice(Env* env, const DeviceAttributes& attr) : Device(env, attr) {}
+  Status Sync() override { return Status::OK(); }
+};
+
 class FunctionOptimizerContext {
  public:
   explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level,
                                     const GrapplerItem& item)
-      : function_library_(OpRegistry::Global(), item.graph.library()) {
+      : graph_version_(item.graph.versions().producer()),
+        function_library_(OpRegistry::Global(), item.graph.library()) {
     InitializeTrulyConstNodes(item);
     InitializeInlinedFunctions(opt_level, item);
   }
@@ -91,6 +168,11 @@ class FunctionOptimizerContext {
     return &function_library_;
   }
 
+  FunctionLibraryRuntime* mutable_function_library_runtime() {
+    InitializeFunctionLibraryRuntime();
+    return flr_;
+  }
+
   bool IsInlinedFunction(const string& name) const {
     return inlined_functions_.count(name) > 0;
   }
@@ -108,6 +190,16 @@ class FunctionOptimizerContext {
     return gtl::FindWithDefault(inlined_functions_, name, nullptr);
   }
 
+  const FunctionSpecialization* FindFunctionSpecialization(
+      const FunctionSpecializationSignature& sig) const {
+    return gtl::FindOrNull(specialized_functions_, sig);
+  }
+
+  void AddSpecializedFunction(const FunctionSpecializationSignature& sig,
+                              const FunctionSpecialization& specialized_func) {
+    specialized_functions_.emplace(sig, specialized_func);
+  }
+
  private:
   void InitializeTrulyConstNodes(const GrapplerItem& item) {
     std::unordered_set<string> feed_nodes;
@@ -142,11 +234,40 @@ class FunctionOptimizerContext {
     }
   }
 
+  void InitializeFunctionLibraryRuntime() {
+    if (!flr_) {
+      Env* env = Env::Default();
+      DeviceAttributes attr;
+      attr.set_name("/device:CPU:0");
+      attr.set_device_type("CPU");
+      Device* device = new FakeCPUDevice(env, attr);
+      device_mgr_.reset(new DeviceMgr({device}));
+      OptimizerOptions optimizer_opts;
+      optimizer_opts.set_do_function_inlining(true);
+      process_flr_.reset(new ProcessFunctionLibraryRuntime(
+          device_mgr_.get(), env, graph_version_, &function_library_,
+          optimizer_opts));
+      flr_ = process_flr_->GetFLR(device->name());
+    }
+  }
+
+  const int graph_version_;
   FunctionLibraryDefinition function_library_;
+
+  // These fields initialized lazily only if needed.
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> process_flr_;
+  FunctionLibraryRuntime* flr_ = nullptr;
+
   // Functions that can be inlined into optimized graph.
   std::unordered_map<string, const FunctionDef*> inlined_functions_;
   // Nodes that are Const and not in feed.
   std::unordered_map<string, const NodeDef*> truly_const_nodes_;
+  // Specialized functions.
+  std::unordered_map<FunctionSpecializationSignature,
+                     const FunctionSpecialization,
+                     FunctionSpecializationSignature::Hash>
+      specialized_functions_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext);
 };
@@ -303,14 +424,34 @@ void RemovePushedDownConstInputs(const std::unordered_set<string>& const_inputs,
 
     for (const string& ctrl : control_deps) {
       if (existing_control_deps.find(ctrl) == existing_control_deps.end()) {
-        VLOG(3) << "Forward control dependency to function caller node: input="
-                << ctrl;
+        VLOG(3) << "Forward control dependency: input=" << ctrl;
         specialized_func_node->add_input(ctrl);
       }
     }
   }
 }
 
+Status InitializeFunctionSpecializationSignature(
+    const NodeDef& func_node, const FunctionDef& func,
+    const AttrValueMap& func_attr, const FunctionOptimizerContext& ctx,
+    FunctionSpecializationSignature* sig) {
+  sig->func_name = func.signature().name();
+
+  TF_RETURN_IF_ERROR(
+      InstantiationTypeParameters(func, func_attr, &sig->type_parameters));
+  TF_RETURN_IF_ERROR(
+      InstantiationBodyParameters(func, func_attr, &sig->body_parameters));
+
+  for (int i = 0; i < func_node.input_size(); ++i) {
+    const string& input = func_node.input(i);
+    if (ctx.IsTrulyConst(input)) {
+      sig->const_inputs.emplace(i, input);
+    }
+  }
+
+  return Status::OK();
+}
+
 Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
                           FunctionOptimizerContext* ctx,
                           GraphDef* optimized_graph) {
@@ -320,6 +461,32 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
+  FunctionSpecializationSignature signature;
+  TF_RETURN_IF_ERROR(InitializeFunctionSpecializationSignature(
+      func_node, func, func_attr, *ctx, &signature));
+
+  // Check if function was already specialized for identical context.
+  const FunctionSpecialization* already_specialized =
+      ctx->FindFunctionSpecialization(signature);
+
+  if (already_specialized) {
+    VLOG(2) << "Function was already specialized in identical context: "
+               "specialized_name="
+            << already_specialized->specialized_func_name;
+
+    // Add a function call node for the specialized function.
+    NodeDef* specialized_func_node = optimized_graph->add_node();
+    *specialized_func_node = func_node;
+    specialized_func_node->set_op(already_specialized->specialized_func_name);
+
+    RemovePushedDownConstInputs(already_specialized->const_inputs,
+                                already_specialized->control_deps,
+                                specialized_func_node);
+
+    return Status::OK();
+  }
+
+  // Add a new specialized function definition to the library.
   const auto& flib = ctx->function_library();
 
   // Make a GrapplerFunctionItem and convert it back to FunctionDef after
@@ -358,66 +525,53 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   // Update specialized node to remove inputs for pushed down consts.
   RemovePushedDownConstInputs(const_inputs, control_deps,
                               specialized_func_node);
-  return Status::OK();
-}
 
-// Copy input/output argument type to the type_list. Return error if argument
-// type is not explicitly defined, and not specified in function attributes.
-Status CopyArgType(const NodeDef& func_node,
-                   const std::unordered_map<string, AttrValue>& func_attr,
-                   const string& arg_kind, const OpDef::ArgDef& arg,
-                   AttrValue::ListValue* type_list) {
-  if (arg.type() != DT_INVALID) {
-    type_list->add_type(arg.type());
-  } else {
-    auto it = func_attr.find(arg.type_attr());
-    if (it == func_attr.end() || it->second.type() == DT_INVALID) {
-      return errors::InvalidArgument(
-          "Invalid ", arg_kind, " argument ", arg.name(), " for function ",
-          func_node.op(), " instantiated by ", func_node.name());
-    }
-    type_list->add_type(it->second.type());
-  }
+  ctx->AddSpecializedFunction(
+      signature, {specialized_func_name, const_inputs, control_deps});
+
   return Status::OK();
 }
 
-// Add an IdentityN op to hook the function inputs to: this ensures that
+// Create an IdentityN node to hook the function inputs to: this ensures that
 // they're all evaluated before the evaluation of the function body starts.
-Status HookInlinedFunctionInputs(
-    const NodeDef& func_node, const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_attr, NodeDef* inputs) {
-  inputs->set_name(strings::StrCat(func_node.name(), "/", "inlined_inputs"));
-  inputs->set_op("IdentityN");
-  inputs->set_device(func_node.device());
-  *inputs->mutable_input() = func_node.input();
+NodeDef InlinedFunctionInputsNode(const NodeDef& func_node,
+                                  const GrapplerFunctionItem& item) {
+  NodeDef inputs;
+  inputs.set_name(strings::StrCat(func_node.name(), "/", "inlined_inputs"));
+  inputs.set_op("IdentityN");
+  inputs.set_device(func_node.device());
+  *inputs.mutable_input() = func_node.input();
   AttrValue::ListValue* type_list =
-      (*inputs->mutable_attr())["T"].mutable_list();
-  for (const OpDef::ArgDef& arg : func.signature().input_arg()) {
-    TF_RETURN_IF_ERROR(
-        CopyArgType(func_node, func_attr, "input", arg, type_list));
+      (*inputs.mutable_attr())["T"].mutable_list();
+
+  for (const InputArgExpansion& input_arg : item.inputs()) {
+    for (int i = 0; i < input_arg.placeholders.size(); ++i) {
+      type_list->add_type(input_arg.data_type);
+    }
   }
-  return Status::OK();
+
+  return inputs;
 }
 
-// Add an IdentityN op to hook the function outputs to: this ensures that the
-// function body is fully evaluated before its fanout gets scheduled.
-Status HookInlinedFunctionOutputs(
-    const NodeDef& func_node, const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_attr,
-    const gtl::ArraySlice<string> fetch, NodeDef* outputs) {
-  outputs->set_name(func_node.name());
-  outputs->set_op("IdentityN");
-  outputs->set_device(func_node.device());
+// Create an IdentityN node to hook the function outputs to: this ensures that
+// the function body is fully evaluated before its fanout gets scheduled.
+NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
+                                   const GrapplerFunctionItem& item) {
+  NodeDef outputs;
+  outputs.set_name(func_node.name());
+  outputs.set_op("IdentityN");
+  outputs.set_device(func_node.device());
   AttrValue::ListValue* type_list =
-      (*outputs->mutable_attr())["T"].mutable_list();
-  for (int i = 0; i < func.signature().output_arg_size(); ++i) {
-    const OpDef::ArgDef& arg = func.signature().output_arg(i);
-    TF_RETURN_IF_ERROR(
-        CopyArgType(func_node, func_attr, "output", arg, type_list));
-    // Use the fetch names since they take into account the output mapping.
-    outputs->add_input(strings::StrCat(func_node.name(), "/", fetch[i]));
+      (*outputs.mutable_attr())["T"].mutable_list();
+
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    for (const string& output_tensor : output_arg.output_tensors) {
+      type_list->add_type(output_arg.data_type);
+      outputs.add_input(strings::StrCat(func_node.name(), "/", output_tensor));
+    }
   }
-  return Status::OK();
+
+  return outputs;
 }
 
 Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
@@ -438,27 +592,27 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
                                    ". Error: ", item_status.error_message());
   }
 
-  std::unordered_map<string, int> input_nodes;
-  for (int i = 0; i < func.signature().input_arg_size(); ++i) {
-    const OpDef::ArgDef& arg = func.signature().input_arg(i);
-    input_nodes[arg.name()] = i;
+  // Mapping from input placeholder name to function input position.
+  int idx = 0;
+  std::unordered_map<string, int> input_placeholders_idx;
+  for (const InputArgExpansion& input_arg : item.inputs()) {
+    for (const string& placeholder : input_arg.placeholders) {
+      input_placeholders_idx[placeholder] = idx++;
+    }
   }
 
-  // Hook inlined function inputs to IdentityN node
+  // Hook inlined function inputs to IdentityN node.
   NodeDef* func_inputs = optimized_graph->add_node();
-  TF_RETURN_IF_ERROR(
-      HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs));
+  *func_inputs = InlinedFunctionInputsNode(func_node, item);
 
   for (NodeDef& func_body_node : *item.mutable_function_body().mutable_node()) {
-    if (input_nodes.find(func_body_node.name()) != input_nodes.end()) {
+    if (item.IsInputPlaceholder(func_body_node.name())) {
+      // Turn input placeholders into identity nodes.
       CHECK_EQ(0, func_body_node.input_size());
-      // Turn input placeholders into identity nodes
-      if (IsPlaceholder(func_body_node)) {
-        func_body_node.set_op("Identity");
-      }
-      int input_id = input_nodes[func_body_node.name()];
+      func_body_node.set_op("Identity");
+      int input_idx = input_placeholders_idx[func_body_node.name()];
       func_body_node.add_input(
-          strings::StrCat(func_inputs->name(), ":", input_id));
+          strings::StrCat(func_inputs->name(), ":", input_idx));
     } else {
       // Update the input names if any.
       for (string& input : *func_body_node.mutable_input()) {
@@ -472,18 +626,18 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
       }
     }
 
-    // Add the node name as a prefix to avoid collisions after inlining
+    // Add the node name as a prefix to avoid collisions after inlining.
     func_body_node.set_name(
         strings::StrCat(func_node.name(), "/", func_body_node.name()));
 
-    // Make sure the node is placed
+    // Make sure the node is placed.
     func_body_node.set_device(func_node.device());
 
-    // Check if a body node is itself a function
+    // Check if a body node is itself a function.
     const FunctionDef* func_body_node_func =
         ctx.FindInlinedFunction(func_body_node.op());
     if (func_body_node_func != nullptr) {
-      // Recursively inline function calls
+      // Recursively inline function calls.
       TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
                                         ctx, optimized_graph));
     } else {
@@ -491,72 +645,20 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
       for (const auto& attr : func.attr()) {
         func_body_node.mutable_attr()->insert(attr);
       }
-      // Move the node to the main graph
+      // Move the node to the main graph.
       optimized_graph->add_node()->Swap(&func_body_node);
     }
   }
 
-  // Hook inlined function outputs to IdentityN node
+  // Hook inlined function outputs to IdentityN node.
   NodeDef* func_outputs = optimized_graph->add_node();
-  std::vector<string> fetch = OutputTensors(item);
-  TF_RETURN_IF_ERROR(HookInlinedFunctionOutputs(func_node, func, func_attr,
-                                                fetch, func_outputs));
+  *func_outputs = InlinedFunctionOutputsNode(func_node, item);
 
   return Status::OK();
 }
 
-class FakeCPUDevice : public Device {
- public:
-  FakeCPUDevice(Env* env, const DeviceAttributes& attr) : Device(env, attr) {}
-  Status Sync() override { return Status::OK(); }
-};
-
-class SymbolicGradientEnv {
- public:
-  SymbolicGradientEnv(int graph_version, const FunctionDefLibrary& library)
-      : graph_version_(graph_version), library_(library) {}
-
-  FunctionLibraryDefinition* function_library() {
-    InitializeIfNeeded();
-    return fld_.get();
-  }
-  FunctionLibraryRuntime* function_library_runtime() {
-    InitializeIfNeeded();
-    return flr_;
-  }
-
- private:
-  // This initialization is expensive. Do it lazily to avoid paying for it
-  // unless it's needed.
-  void InitializeIfNeeded() {
-    if (flr_) {
-      return;
-    }
-    Env* env = Env::Default();
-    DeviceAttributes attr;
-    attr.set_name("/device:CPU:0");
-    attr.set_device_type("CPU");
-    FakeCPUDevice* dev = new FakeCPUDevice(env, attr);
-    std::vector<Device*> devices;
-    devices.push_back(dev);
-    dvc_mgr_.reset(new DeviceMgr(devices));
-    fld_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), library_));
-    OptimizerOptions optimizer_opts;
-    optimizer_opts.set_do_function_inlining(true);
-    pflr_.reset(new ProcessFunctionLibraryRuntime(
-        dvc_mgr_.get(), env, graph_version_, fld_.get(), optimizer_opts));
-    flr_ = pflr_->GetFLR(dev->name());
-  }
-
-  const int graph_version_;
-  const FunctionDefLibrary& library_;
-  std::unique_ptr<DeviceMgr> dvc_mgr_;
-  std::unique_ptr<FunctionLibraryDefinition> fld_;
-  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
-  FunctionLibraryRuntime* flr_ = nullptr;
-};
-
-Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
+Status InlineSymbolicGradient(const NodeDef& node,
+                              FunctionOptimizerContext* ctx,
                               GraphDef* inlined_graph) {
   VLOG(2) << "Inline symbolic gradient: " << SummarizeNodeDef(node);
 
@@ -596,15 +698,15 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
   GraphConstructorOptions graph_ctor_opts;
   graph_ctor_opts.allow_internal_ops = true;
   graph_ctor_opts.expect_device_spec = false;
-  Graph graph(env->function_library());
+  Graph graph(ctx->function_library());
   TF_RETURN_IF_ERROR(
       ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &graph));
 
   // Recursively inline the functions until there is nothing more to inline. We
   // should at least expand one function.
   int counter = 0;
-  while (counter < 50 &&
-         ExpandInlineFunctions(env->function_library_runtime(), &graph)) {
+  while (counter < 50 && ExpandInlineFunctions(
+                             ctx->mutable_function_library_runtime(), &graph)) {
     ++counter;
   }
 
@@ -665,8 +767,6 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   }
 
   FunctionOptimizerContext ctx(opt_level_, item);
-  SymbolicGradientEnv env(item.graph.versions().producer(),
-                          item.graph.library());
 
   bool inline_gradients = options_.enable_symbolic_gradient_inlining;
   bool inline_func = options_.enable_function_inlining;
@@ -680,7 +780,7 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       const auto* f_attr = gtl::FindOrNull(node.attr(), "f");
       string f_name = f_attr != nullptr ? f_attr->func().name() : "";
       if (ctx.IsInlinedFunction(f_name)) {
-        TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph));
+        TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &ctx, optimized_graph));
         continue;
       }
     }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index 147a2644212e237de4115de1391ae48293295201..0aaf57e947f2c27fbfbedf55866c887e5ca1377b 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -718,5 +718,147 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_PushDownConstInput) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
+TEST_F(FunctionOptimizerTest, SpecializeFunction_OncePerUniqueContext) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  // Mark MyMul as noinline.
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, int32}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+  (*mul_func.mutable_attr())["_noinline"].set_b(true);
+  std::vector<FunctionDef> function_library = {mul_func};
+
+  const Tensor kTwo = test::AsScalar<float>(2.0);
+  const Tensor kThree = test::AsScalar<float>(3.0);
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("init", "NoOp", {}, {}, kDevice),
+
+       // Float placeholders.
+       NDef("xf", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("yf", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Int32 placeholders.
+       NDef("xi", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice),
+       NDef("yi", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice),
+
+       // Consts. Control inputs has to be attached to specialized func calls.
+       NDef("two", "Const", {"^init", "^xf"},
+            {{"dtype", DT_FLOAT}, {"value", kTwo}}, kDevice),
+       NDef("three", "Const", {"^init", "^xf"},
+            {{"dtype", DT_FLOAT}, {"value", kThree}}, kDevice),
+
+       // Specialization #1: DT_FLOAT type parameter.
+       NDef("mul_1", "MyMul", {"xf", "yf"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("mul_2", "MyMul", {"yf", "xf"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #2: DT_INT32 type parameter.
+       NDef("mul_3", "MyMul", {"xi", "yi"}, {{"T", DT_INT32}}, kDevice),
+
+       // Specialization #3: DT_FLOAT type parameter + const input kTwo.
+       NDef("mul_4", "MyMul", {"xf", "two"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("mul_5", "MyMul", {"yf", "two"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #4: DT_FLOAT type parameter + const input kThree.
+       NDef("mul_6", "MyMul", {"three", "xf"}, {{"T", DT_FLOAT}}, kDevice)},
+      function_library);
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  // Make sure that MyMul was specialized once per unique context.
+  EXPECT_EQ(4, output.library().function_size());
+
+  // And graph nodes calling specialized functions.
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "mul_1" && count++) {
+      EXPECT_EQ("MyMul_specialized_for_mul_1", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("xf", node.input(0));
+      EXPECT_EQ("yf", node.input(1));
+
+    } else if (node.name() == "mul_2" && count++) {
+      EXPECT_EQ("MyMul_specialized_for_mul_1", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("yf", node.input(0));
+      EXPECT_EQ("xf", node.input(1));
+
+    } else if (node.name() == "mul_3" && count++) {
+      EXPECT_EQ("MyMul_specialized_for_mul_3", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("xi", node.input(0));
+      EXPECT_EQ("yi", node.input(1));
+
+    } else if (node.name() == "mul_4" && count++) {
+      EXPECT_EQ("MyMul_specialized_for_mul_4", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("xf", node.input(0));
+      EXPECT_EQ("^init", node.input(1));
+
+    } else if (node.name() == "mul_5" && count++) {
+      EXPECT_EQ("MyMul_specialized_for_mul_4", node.op());
+      ASSERT_EQ(3, node.input_size());
+      EXPECT_EQ("yf", node.input(0));
+      EXPECT_EQ("^init", node.input(1));
+      EXPECT_EQ("^xf", node.input(2));
+
+    } else if (node.name() == "mul_6" && count++) {
+      EXPECT_EQ("MyMul_specialized_for_mul_6", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("xf", node.input(0));
+      EXPECT_EQ("^init", node.input(1));
+    }
+  }
+  EXPECT_EQ(6, count);
+
+  // And that graph evaluation yields the same result.
+  Tensor pi = test::AsScalar<float>(3.14f);
+  Tensor four = test::AsScalar<int32>(4);
+  item.fetch = {"mul_1", "mul_2", "mul_3", "mul_4", "mul_5", "mul_6"};
+  item.feed = {{"xf", pi}, {"yf", pi}, {"xi", four}, {"yi", four}};
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorEqual<float>(tensors_expected[1], tensors[1]);
+  test::ExpectTensorEqual<int32>(tensors_expected[2], tensors[2]);
+  test::ExpectTensorEqual<float>(tensors_expected[3], tensors[3]);
+  test::ExpectTensorEqual<float>(tensors_expected[4], tensors[4]);
+  test::ExpectTensorEqual<float>(tensors_expected[5], tensors[5]);
+}
+
+TEST_F(FunctionOptimizerTest, PruningUselessLibraryFunctions) {
+  using test::function::NDef;
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+  DisableFunctionSpecialization(&optimizer);
+  auto func = test::function::XTimesTwo();
+  (*func.mutable_attr())["_noinline"].set_b(true);
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, "/device:CPU:0"),
+       NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, "/device:CPU:0"),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, "/device:CPU:0")},
+      // FunctionLib
+      {
+          func,
+          test::function::XTimesTwoInt32(),
+          test::function::XTimes16(),
+      });
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(output.library().function().size(), 1);
+  EXPECT_EQ(output.library().function(0).signature().name(), "XTimesTwo");
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 5adc5b9227ff94089b11ec8de3c69d8c57df3ff2..9627ed732346bcc0ce6a98c2a9e66129fcc6f7b8 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -25,8 +25,8 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
@@ -389,7 +389,7 @@ Status LoopInvariantNodeMotionOptimizer::Optimize() {
       frame_children_[frame_ids[0]].insert(frame_ids[1]);
       frame_parent_[frame_ids.back()] = frame_ids[frame_ids.size() - 2];
     }
-    if (frame_ids.size() >= 1) {
+    if (!frame_ids.empty()) {
       frame_children_.insert(std::make_pair(frame_ids.back(), empty_set_));
       if (node->op() == "LoopCond") {
         if (loop_cond_.count(frame_ids.back())) {
@@ -408,7 +408,7 @@ Status LoopInvariantNodeMotionOptimizer::Optimize() {
   }
 
   for (auto it = frame_children_.begin(); it != frame_children_.end(); ++it) {
-    if (it->second.size() == 0) {
+    if (it->second.empty()) {
       worklist.push_back(it->first);
     }
   }
@@ -421,7 +421,7 @@ Status LoopInvariantNodeMotionOptimizer::Optimize() {
     if (parent_it != frame_parent_.end()) {
       int parent_id = parent_it->second;
       frame_children_[parent_id].erase(frame_id);
-      if (frame_children_[parent_id].size() == 0) {
+      if (frame_children_[parent_id].empty()) {
         worklist.push_back(parent_id);
       }
     }
@@ -504,6 +504,144 @@ Status RemoveStackOps(const std::unordered_set<string>& nodes_to_preserve,
   return Status::OK();
 }
 
+Status RemoveDeadBranches(const std::unordered_set<string>& nodes_to_preserve,
+                          GraphDef* optimized_graph) {
+  std::unordered_set<const NodeDef*> dead_nodes;
+  std::unordered_map<NodeDef*, std::set<int>> dead_merge_inputs;
+  // TODO(bsteiner): also rewrite switches as identity. For now we just record
+  // them
+  std::unordered_set<GraphView::OutputPort, GraphView::HashPort>
+      identity_switches;
+
+  GraphView view(optimized_graph);
+  for (const NodeDef& node : optimized_graph->node()) {
+    if (!IsSwitch(node)) {
+      continue;
+    }
+    if (nodes_to_preserve.find(node.name()) != nodes_to_preserve.end()) {
+      continue;
+    }
+    GraphView::InputPort ctrl_port(&node, 1);
+    GraphView::OutputPort ctrl_node = view.GetRegularFanin(ctrl_port);
+    if (!IsConstant(*ctrl_node.node)) {
+      continue;
+    }
+    Tensor selector;
+    CHECK(selector.FromProto(ctrl_node.node->attr().at("value").tensor()));
+    const int dead_fanout = selector.scalar<bool>()() ? 0 : 1;
+    GraphView::OutputPort dead(const_cast<NodeDef*>(&node), dead_fanout);
+    identity_switches.insert(dead);
+
+    SetVector<GraphView::InputPort, GraphView::HashPort> zombie_inputs;
+    for (const GraphView::InputPort& port : view.GetFanout(dead)) {
+      if (dead_nodes.find(port.node) == dead_nodes.end()) {
+        zombie_inputs.PushBack(port);
+      }
+    }
+    // If we encounter a single node that must be preserved in the fanout of the
+    // switch node we need to preserve the entire switch fanout: we therefore
+    // work on a local copy that only gets committed to the master copy once the
+    // whole fanout has been explored.
+    std::unordered_set<const NodeDef*> local_dead_nodes = dead_nodes;
+    std::unordered_map<NodeDef*, std::set<int>> local_dead_merge_inputs =
+        dead_merge_inputs;
+    bool found_node_to_preserve = false;
+    while (!found_node_to_preserve && !zombie_inputs.Empty()) {
+      GraphView::InputPort dead = zombie_inputs.PopBack();
+      if (nodes_to_preserve.find(dead.node->name()) !=
+          nodes_to_preserve.end()) {
+        found_node_to_preserve = true;
+        break;
+      }
+
+      if (local_dead_nodes.find(dead.node) != local_dead_nodes.end()) {
+        continue;
+      }
+
+      if (IsMerge(*dead.node)) {
+        const int fanout = dead.node->attr().at("N").i();
+        if (fanout > 2) {
+          // This never happens in practice, so we'll just skip these to
+          // simplify the code for now.
+          found_node_to_preserve = true;
+          break;
+        }
+        GraphView::OutputPort value_index(dead.node, 1);
+        const std::unordered_set<GraphView::InputPort, GraphView::HashPort>&
+            index_fanout = view.GetFanout(value_index);
+        if (!index_fanout.empty()) {
+          // The 2nd output (that indicates which input is propagated) is
+          // connected. This never happens in practice, so we'll just skip this
+          // case to simplify the code for now.
+          found_node_to_preserve = true;
+          break;
+        }
+
+        bool fully_dead = false;
+        if (dead.port_id < 0) {
+          // If the control dependency never gets triggered the merge will also
+          // never get triggered.
+          local_dead_nodes.insert(dead.node);
+          fully_dead = true;
+        } else {
+          local_dead_merge_inputs[dead.node].insert(dead.port_id);
+          if (local_dead_merge_inputs[dead.node].size() ==
+              dead.node->attr().at("N").i()) {
+            fully_dead = true;
+          }
+          if (fully_dead) {
+            local_dead_nodes.insert(dead.node);
+            for (const GraphView::InputPort& port :
+                 view.GetFanouts(*dead.node, true)) {
+              zombie_inputs.PushBack(port);
+            }
+          }
+        }
+      } else if (dead.node->op() == "ControlTrigger") {
+        // Control trigger have different semantic, so don't touch them
+        found_node_to_preserve = true;
+        break;
+      } else {
+        if (local_dead_nodes.insert(dead.node).second) {
+          for (const GraphView::InputPort& dead_fanout :
+               view.GetFanouts(*dead.node, true)) {
+            zombie_inputs.PushBack(dead_fanout);
+          }
+        }
+      }
+    }
+    if (!found_node_to_preserve) {
+      std::swap(dead_nodes, local_dead_nodes);
+      std::swap(dead_merge_inputs, local_dead_merge_inputs);
+    }
+  }
+
+  int last = optimized_graph->node_size() - 1;
+  for (int i = optimized_graph->node_size() - 1; i >= 0; --i) {
+    NodeDef* node = optimized_graph->mutable_node(i);
+    if (dead_nodes.find(node) != dead_nodes.end()) {
+      optimized_graph->mutable_node()->SwapElements(i, last);
+      last--;
+    }
+  }
+  optimized_graph->mutable_node()->DeleteSubrange(last + 1, dead_nodes.size());
+
+  for (const auto& itr : dead_merge_inputs) {
+    NodeDef* dead_node = itr.first;
+    if (dead_nodes.find(dead_node) != dead_nodes.end()) {
+      // The node has been pruned since all its inputs are dead.
+      continue;
+    }
+    const std::set<int>& dead_inputs = itr.second;
+    for (int index : dead_inputs) {
+      dead_node->mutable_input()->DeleteSubrange(index, 1);
+    }
+    dead_node->set_op("Identity");
+    dead_node->mutable_attr()->erase("N");
+  }
+  return Status::OK();
+}
+
 }  // namespace
 
 Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
@@ -517,6 +655,10 @@ Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   if (options_.enable_stack_push_removal) {
     TF_RETURN_IF_ERROR(RemoveStackOps(item.NodesToPreserve(), optimized_graph));
   }
+  if (options_.enable_dead_branch_removal) {
+    TF_RETURN_IF_ERROR(
+        RemoveDeadBranches(item.NodesToPreserve(), optimized_graph));
+  }
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index 764506f7c1a4f3510d3007e9dc8524ec5fba1b82..85b8e655439b28c88356cacbe52a80aabc88df7d 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -54,6 +54,7 @@ class LoopOptimizer : public GraphOptimizer {
   struct LoopOptimizerOptions {
     bool enable_loop_invariant_node_motion = false;
     bool enable_stack_push_removal = true;
+    bool enable_dead_branch_removal = true;
 
     static LoopOptimizerOptions Default(RewriterConfig::Toggle opt_level) {
       LoopOptimizerOptions options;
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index 10ec544424e651a1c0d39ef6af9a8f824de6c99e..6fd177b7103eac09795109e5393aa7e5680cb28c 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -589,5 +589,112 @@ TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
   }
 }
 
+TEST_F(LoopOptimizerTest, RemoveDeadBranches) {
+  Scope scope = Scope::NewRootScope();
+  Output v_in = ops::Variable(scope.WithOpName("v_in"), {3}, DT_FLOAT);
+
+  Output ctrl1 = ops::Const(scope.WithOpName("ctrl1"), false, TensorShape({}));
+  ops::Switch s1(scope.WithOpName("switch1"), v_in, ctrl1);
+  Output square1 = ops::Square(scope.WithOpName("square1"), s1.output_false);
+  Output sqrt1 = ops::Sqrt(scope.WithOpName("sqrt1"), s1.output_true);
+
+  Output ctrl2 = ops::Const(scope.WithOpName("ctrl2"), true, TensorShape({}));
+  ops::Switch s2(scope.WithOpName("switch2"), v_in, ctrl2);
+  Output square2 = ops::Square(scope.WithOpName("square2"), s2.output_false);
+  Output sqrt2 = ops::Sqrt(scope.WithOpName("sqrt2"), s2.output_true);
+
+  Output ctrl3 = ops::Const(scope.WithOpName("ctrl3"), false, TensorShape({}));
+  ops::Switch s3(scope.WithOpName("switch3"), v_in, ctrl3);
+  Output square3 = ops::Square(scope.WithOpName("square3"), s3.output_false);
+  Output sqrt3 = ops::Sqrt(scope.WithOpName("sqrt3"), s3.output_true);
+
+  Output ctrl4 = ops::Const(scope.WithOpName("ctrl4"), false, TensorShape({}));
+  ops::Switch s4(scope.WithOpName("switch4"), v_in, ctrl4);
+  Output square4 = ops::Square(scope.WithOpName("square4"), s4.output_false);
+  Output sqrt4 = ops::Sqrt(scope.WithOpName("sqrt4"), s4.output_true);
+
+  ops::Merge m1(scope.WithOpName("m1"), {square1, sqrt1});
+  ops::Merge m2(scope.WithOpName("m2"), {v_in, square1});
+  ops::Merge m3(scope.WithOpName("m3"), {v_in, sqrt1});
+  ops::Merge m4(scope.WithOpName("m4"), {square1, sqrt2});
+  ops::Merge m5(scope.WithOpName("m5"), {square2, sqrt1});
+  ops::Merge m6(scope.WithOpName("m6").WithControlDependencies(sqrt2),
+                {v_in, square1});
+  ops::Merge m7(scope.WithOpName("m7").WithControlDependencies(sqrt1),
+                {v_in, square1});
+
+  ops::Switch s5(scope.WithOpName("switch5"), v_in, ctrl1);
+  Output id1 = ops::Identity(scope.WithOpName("id1"), s5.output_false);
+  Output id2 = ops::Identity(scope.WithOpName("id2"), s5.output_true);
+  ops::Merge m8(scope.WithOpName("m8"), {id1, id2});
+
+  ops::Switch s6(scope.WithOpName("switch6"), v_in, ctrl1);
+  Output id3 = ops::Identity(scope.WithOpName("id3"), s6.output_false);
+  Output id4 = ops::Identity(scope.WithOpName("id4"), s6.output_true);
+  ops::Merge m9(scope.WithOpName("m9"), {id3, id4});
+
+  GrapplerItem item;
+  item.fetch.push_back("m8");
+  item.fetch.push_back("id4");
+
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_CHECK_OK(status);
+
+  for (const NodeDef& node : output.node()) {
+    // These nodes should have been pruned
+    EXPECT_NE("Square1", node.name());
+    EXPECT_NE("Sqrt2", node.name());
+    EXPECT_NE("m5", node.name());
+    EXPECT_NE("m7", node.name());
+
+    if (node.name() == "m1") {
+      // sqrt1 is dead
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square1", node.input(0));
+    } else if (node.name() == "m2") {
+      // both inputs are alive
+      EXPECT_EQ("Merge", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("v_in", node.input(0));
+      EXPECT_EQ("square1", node.input(1));
+    } else if (node.name() == "m3") {
+      // sqrt1 is dead
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("v_in", node.input(0));
+    } else if (node.name() == "m4") {
+      // both inputs are alive
+      EXPECT_EQ("Merge", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("square1", node.input(0));
+      EXPECT_EQ("sqrt2", node.input(1));
+    } else if (node.name() == "m6") {
+      // both inputs are alive and the control dependency can get triggered
+      EXPECT_EQ("Merge", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("v_in", node.input(0));
+      EXPECT_EQ("square1", node.input(1));
+      EXPECT_EQ("^sqrt2", node.input(2));
+    } else if (node.name() == "m8") {
+      // The node is to be preserved because of a fetch
+      EXPECT_EQ("Merge", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("id1", node.input(0));
+      EXPECT_EQ("id2", node.input(1));
+    } else if (node.name() == "m9") {
+      // The node is to be preserved because of a fetch
+      EXPECT_EQ("Merge", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("id3", node.input(0));
+      EXPECT_EQ("id4", node.input(1));
+    }
+  }
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 7c6468bfcbca51f03738daaf33fbc007e5bdeebb..1be5f8dcc2ca8a1690f655ae7731bcc2c5ff2d45 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/grappler/utils/traversal.h"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -1069,9 +1070,11 @@ static bool IdentifySwappingCandidates(
         // ensure that swapping the tensor back in won't recreate the memory
         // bottleneck. Last but not least, we want the tensor to have as few
         // remaining uses as possible.
-        mem_info.fitness = std::pow((earliest_use - peak_time).count(), 2);
-        mem_info.fitness /= std::pow(mem_info.uses_left.size(), 2);
-        mem_info.fitness += std::pow((allocation_time - peak_time).count(), 2);
+        mem_info.fitness =
+            MathUtil::IPow((earliest_use - peak_time).count(), 2);
+        mem_info.fitness /= MathUtil::IPow(mem_info.uses_left.size(), 2);
+        mem_info.fitness +=
+            MathUtil::IPow((allocation_time - peak_time).count(), 2);
         mem_info.fitness = -mem_info.fitness;
         mem_state.push_back(mem_info);
       }
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 0c8e18d7ab18b60bb01d9008efb07cc72c60ea05..a92727535d7f20de9dd3adbefac6c047b10b6359 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -24,11 +24,12 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/debug_stripper.h"
 #include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
-#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/optimizers/remapper.h"
+#include "tensorflow/core/grappler/optimizers/shape_optimizer.h"
 #include "tensorflow/core/grappler/utils/colocation.h"
 #include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
@@ -78,6 +79,8 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
   MK_OPT("pruning", new ModelPruner());
   MK_OPT("function", new FunctionOptimizer(cfg_.function_optimization()));
   MK_OPT("constfold", new ConstantFolding(cpu_device_));
+  MK_OPT("shape", new ShapeOptimizer());
+  MK_OPT("remap", new Remapper(cfg_.remapping()));
   MK_OPT("layout", new LayoutOptimizer());
   MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL));
   MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
@@ -107,6 +110,12 @@ Status MetaOptimizer::InitializeOptimizers(
     optimizers->emplace_back(
         new ConstantFolding(cfg_.constant_folding(), cpu_device_));
   }
+  if (cfg_.shape_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new ShapeOptimizer());
+  }
+  if (cfg_.remapping() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new Remapper(cfg_.remapping()));
+  }
   if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
     optimizers->emplace_back(
         new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
@@ -344,6 +353,8 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
          cfg.layout_optimizer() != RewriterConfig::OFF ||
          cfg.function_optimization() != RewriterConfig::OFF ||
          cfg.constant_folding() != RewriterConfig::OFF ||
+         cfg.shape_optimization() != RewriterConfig::OFF ||
+         cfg.remapping() != RewriterConfig::OFF ||
          cfg.arithmetic_optimization() != RewriterConfig::OFF ||
          cfg.loop_optimization() != RewriterConfig::OFF ||
          cfg.dependency_optimization() != RewriterConfig::OFF ||
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 887a988af9afed58c903f312a8c83f769474a634..8247cce33922e6576e35ce5faa566af91f4e4939 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -163,30 +163,28 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
                                            output.library());
 
   // Specialized and optimized functions should be added to the graph.
-  EXPECT_EQ(6, optimized_flib.num_functions());
+  EXPECT_EQ(5, optimized_flib.num_functions());
 
   // MyQuadratic should be specialized once:
   //   0. 'quadratic' node in the main graph
   const string optimized_0 = "MyQuadratic_specialized_for_quadratic";
 
   // MySquare should be specialized and optimized for 3 instantiations:
-  //   1. 'square' node in the main graph
-  //   2. 'square' node in the MyQuadratic specialization
-  //   3. 'quadratic' node in the MyQuadratic specialization
+  //   1.  'square' node in the main graph
+  //   2.  'square' node in the MyQuadratic specialization
+  //   3*. 'quadratic' node in the MyQuadratic specialization
+  //        has identical instantiation context to #2
 
   const string optimized_1 = "MySquare_specialized_for_square";
   const string optimized_2 = "MySquare_specialized_for_square_1";
-  const string optimized_3 = "MySquare_specialized_for_quadratic";
 
   const FunctionDef* optimized_func_0 = optimized_flib.Find(optimized_0);
   const FunctionDef* optimized_func_1 = optimized_flib.Find(optimized_1);
   const FunctionDef* optimized_func_2 = optimized_flib.Find(optimized_2);
-  const FunctionDef* optimized_func_3 = optimized_flib.Find(optimized_3);
 
   ASSERT_NE(optimized_func_0, nullptr);
   ASSERT_NE(optimized_func_1, nullptr);
   ASSERT_NE(optimized_func_2, nullptr);
-  ASSERT_NE(optimized_func_3, nullptr);
 
   // Graph should call optimized function.
   int count = 0;
@@ -205,13 +203,14 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
     if (node.name() == "square" && count++) {
       EXPECT_EQ(optimized_2, node.op());
     } else if (node.name() == "quadratic" && count++) {
-      EXPECT_EQ(optimized_3, node.op());
+      // Share specialized function with the 'square' node.
+      EXPECT_EQ(optimized_2, node.op());
     }
   }
   EXPECT_EQ(2, count);
 
-  const std::vector<const FunctionDef*> optimized_funcs = {
-      optimized_func_1, optimized_func_1, optimized_func_3};
+  const std::vector<const FunctionDef*> optimized_funcs = {optimized_func_1,
+                                                           optimized_func_2};
 
   // MyMul should be inlined into all optimized versions of MySquare.
   for (const FunctionDef* optimized_func : optimized_funcs) {
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2a628712930aad52bab821f26f227287591c9c36
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -0,0 +1,166 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/remapper.h"
+
+#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+void AddBatchNormNodes(GraphDef* optimized_graph, const NodeDef& fused_node) {
+  const string& x = fused_node.input(0);
+  const string& scale = fused_node.input(1);
+  const string& offset = fused_node.input(2);
+  const string& mean = fused_node.input(3);
+  const string& variance = fused_node.input(4);
+
+  float epsilon = 0.0f;
+  if (fused_node.attr().count("epsilon")) {
+    epsilon = fused_node.attr().at("epsilon").f();
+  }
+  DataType dtype = fused_node.attr().at("T").type();
+  Tensor value(dtype, TensorShape());
+  value.scalar<float>()() = epsilon;
+  NodeDef* variance_epsilon = optimized_graph->add_node();
+  TF_CHECK_OK(ConstantFolding::CreateNodeDef(
+      AddPrefixToNodeName("Const", fused_node.name()), &value,
+      variance_epsilon));
+  variance_epsilon->set_device(fused_node.device());
+
+  NodeDef* variance_plus_epsilon = optimized_graph->add_node();
+  variance_plus_epsilon->set_name(
+      AddPrefixToNodeName("VarPlusEpsilon", fused_node.name()));
+  variance_plus_epsilon->set_op("Add");
+  (*variance_plus_epsilon->mutable_attr())["T"].set_type(dtype);
+  variance_plus_epsilon->set_device(fused_node.device());
+  *variance_plus_epsilon->add_input() = variance;
+  *variance_plus_epsilon->add_input() = variance_epsilon->name();
+
+  NodeDef* inv = optimized_graph->add_node();
+  inv->set_name(AddPrefixToNodeName("Inv", fused_node.name()));
+  inv->set_op("Rsqrt");
+  inv->set_device(fused_node.device());
+  (*inv->mutable_attr())["T"].set_type(dtype);
+  *inv->add_input() = variance_plus_epsilon->name();
+
+  NodeDef* scaled = optimized_graph->add_node();
+  scaled->set_name(AddPrefixToNodeName("Scaled", fused_node.name()));
+  scaled->set_op("Mul");
+  scaled->set_device(fused_node.device());
+  (*scaled->mutable_attr())["T"].set_type(dtype);
+  *scaled->add_input() = inv->name();
+  *scaled->add_input() = scale;
+
+  NodeDef* a = optimized_graph->add_node();
+  a->set_name(AddPrefixToNodeName("Mul", fused_node.name()));
+  a->set_op("Mul");
+  a->set_device(fused_node.device());
+  (*a->mutable_attr())["T"].set_type(dtype);
+  *a->add_input() = x;
+  *a->add_input() = scaled->name();
+
+  NodeDef* b = optimized_graph->add_node();
+  b->set_name(AddPrefixToNodeName("Mul2", fused_node.name()));
+  b->set_op("Mul");
+  b->set_device(fused_node.device());
+  (*b->mutable_attr())["T"].set_type(dtype);
+  *b->add_input() = mean;
+  *b->add_input() = scaled->name();
+
+  NodeDef* c = optimized_graph->add_node();
+  c->set_name(AddPrefixToNodeName("Offset", fused_node.name()));
+  c->set_op("Sub");
+  c->set_device(fused_node.device());
+  (*c->mutable_attr())["T"].set_type(dtype);
+  *c->add_input() = offset;
+  *c->add_input() = b->name();
+
+  NodeDef* r = optimized_graph->add_node();
+  r->set_name(fused_node.name());
+  r->set_op("Add");
+  r->set_device(fused_node.device());
+  (*r->mutable_attr())["T"].set_type(dtype);
+  *r->add_input() = a->name();
+  *r->add_input() = c->name();
+}
+
+Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
+                          GraphDef* optimized_graph) {
+  GraphProperties properties(item);
+  TF_RETURN_IF_ERROR(properties.InferStatically(false));
+  GraphView graph(const_cast<GraphDef*>(&item.graph));
+
+  // During inference, most of the inputs to FusedBatchNorm are constant, and we
+  // can therefore replace the op with a much cheaper set of primitives.
+  for (const NodeDef& node : item.graph.node()) {
+    if (node.op() == "FusedBatchNorm" || node.op() == "FusedBatchNormV2") {
+      bool optimizable = (node.attr().count("T") == 0 ||
+                          node.attr().at("T").type() == DT_FLOAT);
+      optimizable &= (node.attr().count("is_training") == 0 ||
+                      !node.attr().at("is_training").b());
+      if (optimizable) {
+        std::unordered_set<int> const_inputs;
+        for (const string& input : node.input()) {
+          int pos;
+          const string input_node = ParseNodeName(input, &pos);
+          if (properties.HasInputProperties(input_node)) {
+            const auto& props = properties.GetInputProperties(input_node);
+            if (props.size() > pos && props[pos].has_value()) {
+              const_inputs.insert(pos);
+            }
+          }
+        }
+        // TODO(bsteiner): use the cost model to compare the cost of fused batch
+        // norm against that of the optimized form.
+        optimizable = (const_inputs.size() >= 4);
+      }
+      if (optimizable) {
+        for (GraphView::Edge edge : graph.GetFanoutEdges(node, false)) {
+          if (edge.src.port_id != 0) {
+            // The optimized version only generates the first output.
+            optimizable = false;
+            break;
+          }
+        }
+      }
+      if (optimizable) {
+        AddBatchNormNodes(optimized_graph, node);
+        continue;
+      }
+    }
+    *optimized_graph->add_node() = node;
+  }
+
+  *optimized_graph->mutable_library() = item.graph.library();
+  *optimized_graph->mutable_versions() = item.graph.versions();
+
+  return Status::OK();
+}
+
+void Remapper::Feedback(Cluster* /*cluster*/, const GrapplerItem& /*item*/,
+                        const GraphDef& /*optimized_graph*/,
+                        double /*result*/) {
+  // Nothing to do for ArithmeticOptimizer.
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/remapper.h b/tensorflow/core/grappler/optimizers/remapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..c18413e4e72bb970e1e15bca25fcc6316c5ac327
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/remapper.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_REMAPPER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_REMAPPER_H_
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Optimize TF computations by remapping subgraphs/nodes onto other subgraphs or
+// nodes to decrease the amount of operations needed to perform a computation.
+class Remapper : public GraphOptimizer {
+ public:
+  explicit Remapper(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {}
+
+  ~Remapper() override {}
+
+  string name() const override { return "remapper"; };
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* optimized_graph) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override;
+
+ private:
+  RewriterConfig::Toggle opt_level_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_REMAPPER_H_
diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..291585c5382173f706a421183b18aea70a30c771
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/remapper_test.cc
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/remapper.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class RemapperTest : public GrapplerTest {};
+
+TEST_F(RemapperTest, FusedBatchNorm) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output dflt = ops::Const(s.WithOpName("dflt"), {3.14f, 2.7f}, {2, 1, 1, 1});
+  Output x = ops::PlaceholderWithDefault(s.WithOpName("x"), dflt, {2, 1, 1, 1});
+  Output scale = ops::Const(s.WithOpName("scale"), {0.3f}, {1});
+  Output offset = ops::Const(s.WithOpName("offset"), {0.123f}, {1});
+  Output mean = ops::Const(s.WithOpName("mean"), {7.3f}, {1});
+  Output variance = ops::Const(s.WithOpName("variance"), {0.57f}, {1});
+  ops::FusedBatchNorm::Attrs attr;
+  attr = attr.IsTraining(false);
+  ops::FusedBatchNorm bn(s.WithOpName("batch_norm"), x, scale, offset, mean,
+                         variance, attr);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"batch_norm"};
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer.cc b/tensorflow/core/grappler/optimizers/shape_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26c54df56b9e250d0de3dbd9a0cce4dbb369f0e2
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/shape_optimizer.cc
@@ -0,0 +1,133 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/shape_optimizer.h"
+
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/symbolic_shapes.h"
+
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status ShapeOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                                GraphDef* optimized_graph) {
+  *optimized_graph = item.graph;
+
+  GraphProperties properties(item);
+  TF_RETURN_IF_ERROR(properties.InferStatically(false));
+  GraphView graph(optimized_graph);
+
+  // The product of all the dimensions in a tensor shape can be expressed more
+  // simply as the size of the tensor.
+  for (auto& node : *optimized_graph->mutable_node()) {
+    if (!IsShape(node)) {
+      continue;
+    }
+    for (GraphView::InputPort fanout :
+         graph.GetFanout(GraphView::OutputPort(&node, 0))) {
+      if (fanout.node->op() != "Prod") {
+        continue;
+      }
+      if (fanout.node->attr().count("keep_dims") != 0 &&
+          fanout.node->attr().at("keep_dims").b()) {
+        // Keeping the reduced dimensions won't result in a scalar, so we can't
+        // rewrite the whole expression directly as a Size operation.
+        continue;
+      }
+      const GraphView::OutputPort reduce_indices =
+          graph.GetRegularFanin(GraphView::InputPort(fanout.node, 1));
+      const auto& prop =
+          properties.GetOutputProperties(reduce_indices.node->name());
+      if (prop.size() < reduce_indices.port_id) {
+        continue;
+      }
+      const TensorShapeProto& reduction_indices_shape =
+          prop[reduce_indices.port_id].shape();
+      if (NumCoefficients(reduction_indices_shape) == 1) {
+        const auto& input_props = properties.GetInputProperties(node.name());
+        if (input_props.size() != 1) {
+          continue;
+        }
+        // Rewrite the reduction of the shape dimensions as a Size operation.
+        const DataType type = input_props[0].dtype();
+        fanout.node->set_op("Size");
+        fanout.node->set_input(0, node.input(0));
+        fanout.node->set_input(1, AsControlDependency(node));
+        fanout.node->mutable_attr()->erase("Tidx");
+        fanout.node->mutable_attr()->erase("keep_dims");
+        (*fanout.node->mutable_attr())["out_type"] =
+            fanout.node->attr().at("T");
+        (*fanout.node->mutable_attr())["T"].set_type(type);
+      }
+    }
+  }
+  for (auto& node : *optimized_graph->mutable_node()) {
+    // Try to convert the ratio of 2 symbolic tensor sizes into a constant. This
+    // is possible whenever the symbolic dimensions in the numerator and
+    // denominator cancel each other.
+    if (node.op() == "Div") {
+      const GraphView::OutputPort input1 =
+          graph.GetRegularFanin(GraphView::InputPort(&node, 0));
+      const GraphView::OutputPort input2 =
+          graph.GetRegularFanin(GraphView::InputPort(&node, 1));
+      if (!IsSize(*input1.node) || !IsSize(*input2.node)) {
+        continue;
+      }
+      const auto& prop1 = properties.GetInputProperties(input1.node->name());
+      const auto& prop2 = properties.GetInputProperties(input2.node->name());
+      if (prop1.size() != 1 || prop2.size() != 1) {
+        continue;
+      }
+      const TensorShapeProto& shape1 = prop1[0].shape();
+      const TensorShapeProto& shape2 = prop2[0].shape();
+      int64 result = ComputeSizeRatio(shape1, shape2);
+      if (result >= 0) {
+        // Replace div with constant.
+        node.set_op("Const");
+        DataType dtype = node.attr().at("T").type();
+        node.mutable_attr()->erase("T");
+        (*node.mutable_attr())["dtype"].set_type(dtype);
+        TensorProto* t = (*node.mutable_attr())["value"].mutable_tensor();
+        t->set_dtype(dtype);
+        *t->mutable_tensor_shape() = TensorShapeProto();
+        if (dtype == DT_INT32) {
+          t->add_int_val(result);
+        } else {
+          t->add_int64_val(result);
+        }
+        node.set_input(0, AsControlDependency(node.input(0)));
+        node.set_input(1, AsControlDependency(node.input(1)));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void ShapeOptimizer::Feedback(Cluster* /*cluster*/,
+                              const GrapplerItem& /*item*/,
+                              const GraphDef& /*optimized_graph*/,
+                              double /*result*/) {
+  // Nothing to do for LoopOptimizer.
+}
+
+}  // end namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer.h b/tensorflow/core/grappler/optimizers/shape_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7f84a1e5dbe7dd1e2d21e3752522b3f237e2d7c
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/shape_optimizer.h
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SHAPE_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SHAPE_OPTIMIZER_H_
+
+#include <unordered_set>
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/frame.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Optimize TensorFlow subgraphs that operate on shape and shape related
+// information.
+class ShapeOptimizer : public GraphOptimizer {
+ public:
+  ShapeOptimizer() : opt_level_(RewriterConfig::ON) {}
+  explicit ShapeOptimizer(RewriterConfig::Toggle opt_level)
+      : opt_level_(opt_level) {}
+
+  ~ShapeOptimizer() override {}
+
+  string name() const override { return "shape_optimizer"; };
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* optimized_graph) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override;
+
+ private:
+  RewriterConfig::Toggle opt_level_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SHAPE_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer_test.cc b/tensorflow/core/grappler/optimizers/shape_optimizer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..95a5eccd4f0ee84ae1fafb68c8a4fc6ee3bb9519
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/shape_optimizer_test.cc
@@ -0,0 +1,105 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/shape_optimizer.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class ShapeOptimizerTest : public GrapplerTest {};
+
+TEST_F(ShapeOptimizerTest, OptimizeShapeProduct) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 3.14f, {32, 16});
+  Output c = ops::Shape(s.WithOpName("c"), a);
+  Output d = ops::Const(s.WithOpName("d"), 0, {1});
+  ops::ReduceProd::Attrs attrs;
+  Output e = ops::ReduceProd(s.WithOpName("e"), c, d, attrs.KeepDims(false));
+  Output f = ops::ReduceProd(s.WithOpName("f"), c, d, attrs.KeepDims(true));
+
+  GrapplerItem item;
+  item.fetch = {"e", "f"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+
+  GraphDef output;
+  ShapeOptimizer optimizer;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "e") {
+      found++;
+      EXPECT_EQ("Size", node.op());
+      EXPECT_EQ("a", node.input(0));
+    } else if (node.name() == "f") {
+      found++;
+      EXPECT_EQ("Prod", node.op());
+      EXPECT_EQ("c", node.input(0));
+    }
+  }
+  EXPECT_EQ(2, found);
+
+  auto tensors_actual = EvaluateNodes(output, item.fetch);
+  EXPECT_NEAR(tensors_expected[0].scalar<int>()(),
+              tensors_actual[0].scalar<int>()(), 0);
+  EXPECT_NEAR(tensors_expected[1].scalar<int>()(),
+              tensors_actual[1].scalar<int>()(), 0);
+}
+
+TEST_F(ShapeOptimizerTest, OptimizeShapeRatio) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 3.14f, {32, 32});
+  Output b = ops::Const(s.WithOpName("b"), 3.14f, {32, 16});
+  Output c = ops::Size(s.WithOpName("c"), a);
+  Output d = ops::Size(s.WithOpName("d"), b);
+  Output e = ops::Div(s.WithOpName("e"), c, d);
+
+  GrapplerItem item;
+  item.fetch = {"e"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+
+  GraphDef output;
+  ShapeOptimizer optimizer;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "e") {
+      found++;
+      EXPECT_EQ("Const", node.op());
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_actual = EvaluateNodes(output, item.fetch);
+  EXPECT_NEAR(tensors_expected[0].scalar<int>()(),
+              tensors_actual[0].scalar<int>()(), 0);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/symbolic_shapes.cc b/tensorflow/core/grappler/optimizers/symbolic_shapes.cc
index cfca2dc0d38480240c9b158ecbb3cc718bfa1ad2..155843a74403b080e90796d11185315873e91eff 100644
--- a/tensorflow/core/grappler/optimizers/symbolic_shapes.cc
+++ b/tensorflow/core/grappler/optimizers/symbolic_shapes.cc
@@ -49,6 +49,27 @@ bool ShapeIsSymbolicallyDefined(const OpInfo::TensorProperties& properties) {
   return ShapeIsSymbolicallyDefined(properties.shape());
 }
 
+int Rank(const TensorShapeProto& shape) {
+  if (shape.unknown_rank()) {
+    return -1;
+  }
+  return shape.dim_size();
+}
+
+int64 NumCoefficients(const TensorShapeProto& shape) {
+  if (shape.unknown_rank()) {
+    return -1;
+  }
+  int64 num_coefficients = 1;
+  for (const auto& dim : shape.dim()) {
+    if (dim.size() < 0) {
+      return -1;
+    }
+    num_coefficients *= dim.size();
+  }
+  return num_coefficients;
+}
+
 bool ShapesSymbolicallyEqual(const TensorShapeProto& left,
                              const TensorShapeProto& right) {
   if (left.unknown_rank() || right.unknown_rank() ||
@@ -85,6 +106,25 @@ bool ShapesBroadcastable(const OpInfo::TensorProperties& left,
   return ShapesBroadcastable(left.shape(), right.shape());
 }
 
+bool ShapeAfterBroadcast(const TensorShapeProto& left,
+                         const TensorShapeProto& right,
+                         TensorShapeProto* output_shape) {
+  if (!ShapeIsSymbolicallyDefined(left) || !ShapeIsSymbolicallyDefined(right)) {
+    return false;
+  }
+  BCast bcast(ShapeDims(left), ShapeDims(right),
+              /*fewer_dims_optimization*/ false);
+  if (!bcast.IsValid()) {
+    return false;
+  }
+  output_shape->set_unknown_rank(false);
+  output_shape->clear_dim();
+  for (const auto& dim : bcast.output_shape()) {
+    output_shape->add_dim()->set_size(dim);
+  }
+  return true;
+}
+
 bool CompareSymbolicallyShapedTensorSizes(const TensorShapeProto& left,
                                           const TensorShapeProto& right) {
   // if one of the ranks is unknown, it's impossible to compare tensor sizes
@@ -173,5 +213,44 @@ bool CompareSymbolicallyShapedTensorSizes(
   return CompareSymbolicallyShapedTensorSizes(left.shape(), right.shape());
 }
 
+int64 ComputeSizeRatio(const TensorShapeProto& numerator,
+                       const TensorShapeProto& denominator) {
+  if (numerator.unknown_rank() || denominator.unknown_rank()) {
+    return -1;
+  }
+  std::multiset<int> symbolic_dims;
+  int64 num = 1;
+  for (const auto& dim : numerator.dim()) {
+    if (dim.size() == -1) {
+      return -1;
+    } else if (dim.size() < -1) {
+      symbolic_dims.insert(dim.size());
+    } else {
+      num *= dim.size();
+    }
+  }
+  int64 denom = 1;
+  for (const auto& dim : denominator.dim()) {
+    if (dim.size() == -1) {
+      return -1;
+    } else if (dim.size() < -1) {
+      auto it = symbolic_dims.find(dim.size());
+      if (it == symbolic_dims.end()) {
+        return -1;
+      }
+      symbolic_dims.erase(it);
+    } else {
+      denom *= dim.size();
+    }
+  }
+  if (denom == 0) {
+    return -1;
+  }
+  if (!symbolic_dims.empty()) {
+    return -1;
+  }
+  return num / denom;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/symbolic_shapes.h b/tensorflow/core/grappler/optimizers/symbolic_shapes.h
index eb79bab3141579132ea2e2d2afc5733f0013a0d5..ace7bd1fe7ecf96c60f140d1aa416d9cbcb0d631 100644
--- a/tensorflow/core/grappler/optimizers/symbolic_shapes.h
+++ b/tensorflow/core/grappler/optimizers/symbolic_shapes.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -31,6 +32,14 @@ bool IsUnknown(const TensorShapeProto::Dim& dim);
 bool ShapeIsSymbolicallyDefined(const TensorShapeProto& shape);
 bool ShapeIsSymbolicallyDefined(const OpInfo::TensorProperties& properties);
 
+// Returns the rank of the shape ir -1 if unknown
+int Rank(const TensorShapeProto& shape);
+
+// Returns the number of coefficients in the shape or -1 if unknown.
+// TODO(bsteiner) Add a function that computes the minimum size of the tensor,
+// ie the size assuming all the symbolic dimensions take the value 1.
+int64 NumCoefficients(const TensorShapeProto& shape);
+
 // Shapes are symbolically equal, if they have the same rank, they are known or
 // symbolically defined, and have matching dimensions.
 bool ShapesSymbolicallyEqual(const TensorShapeProto& left,
@@ -44,6 +53,9 @@ bool ShapesBroadcastable(const TensorShapeProto& left,
                          const TensorShapeProto& right);
 bool ShapesBroadcastable(const OpInfo::TensorProperties& left,
                          const OpInfo::TensorProperties& right);
+bool ShapeAfterBroadcast(const TensorShapeProto& left,
+                         const TensorShapeProto& right,
+                         TensorShapeProto* output_shape);
 
 // Return true if can prove, that tensor of size 'left' is smaller than tensor
 // of size 'right'. Return false if it's larger or equal, or it's impossible to
@@ -54,6 +66,11 @@ bool CompareSymbolicallyShapedTensorSizes(
     const OpInfo::TensorProperties& left,
     const OpInfo::TensorProperties& right);
 
+// Returns the ratio of the sizes of the 2 shapes if known statically, or -1
+// otherwise.
+int64 ComputeSizeRatio(const TensorShapeProto& numerator,
+                       const TensorShapeProto& denominator);
+
 }  // namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/symbolic_shapes_test.cc b/tensorflow/core/grappler/optimizers/symbolic_shapes_test.cc
index 5ef9f6592571062564d16e5fb282b1dd85d074ef..7ce995d1c5fcea5095ee0f49eb3e8955580eb6b4 100644
--- a/tensorflow/core/grappler/optimizers/symbolic_shapes_test.cc
+++ b/tensorflow/core/grappler/optimizers/symbolic_shapes_test.cc
@@ -74,6 +74,26 @@ TEST_F(SymbolicShapesTest, ShapesBroadcastable) {
   EXPECT_TRUE(ShapesBroadcastable(MakeShape({-2, 1}), MakeShape({1, -2})));
   EXPECT_TRUE(ShapesBroadcastable(MakeShape({-2, 1}), MakeShape({1, -3})));
   EXPECT_TRUE(ShapesBroadcastable(MakeShape({-3}), MakeShape({-2, -3})));
+
+  TensorShapeProto output_shape;
+  EXPECT_TRUE(
+      ShapeAfterBroadcast(MakeShape({1, 2}), MakeShape({1, 2}), &output_shape));
+  EXPECT_TRUE(ShapesSymbolicallyEqual(MakeShape({1, 2}), output_shape));
+  EXPECT_TRUE(ShapeAfterBroadcast(MakeShape({-2, 2}), MakeShape({-2, 2}),
+                                  &output_shape));
+  EXPECT_TRUE(ShapesSymbolicallyEqual(MakeShape({-2, 2}), output_shape));
+  EXPECT_TRUE(ShapeAfterBroadcast(MakeShape({-2, 32}), MakeShape({-2, 1}),
+                                  &output_shape));
+  EXPECT_TRUE(ShapesSymbolicallyEqual(MakeShape({-2, 32}), output_shape));
+  EXPECT_TRUE(ShapeAfterBroadcast(MakeShape({-2, 1}), MakeShape({1, -2}),
+                                  &output_shape));
+  EXPECT_TRUE(ShapesSymbolicallyEqual(MakeShape({-2, -2}), output_shape));
+  EXPECT_TRUE(ShapeAfterBroadcast(MakeShape({-2, 1}), MakeShape({1, -3}),
+                                  &output_shape));
+  EXPECT_TRUE(ShapesSymbolicallyEqual(MakeShape({-2, -3}), output_shape));
+  EXPECT_TRUE(
+      ShapeAfterBroadcast(MakeShape({-3}), MakeShape({-2, -3}), &output_shape));
+  EXPECT_TRUE(ShapesSymbolicallyEqual(MakeShape({-2, -3}), output_shape));
 }
 
 TEST_F(SymbolicShapesTest, CompareSymbolicallyShapedTensorSizes) {
@@ -90,6 +110,33 @@ TEST_F(SymbolicShapesTest, CompareSymbolicallyShapedTensorSizes) {
   EXPECT_FALSE(MakeShape({-1, -1, 32}) < MakeShape({1, -1, 32}));
 }
 
+TEST_F(SymbolicShapesTest, RankAndNumCoeff) {
+  EXPECT_EQ(2, Rank(MakeShape({32, 32})));
+  EXPECT_EQ(32 * 32, NumCoefficients(MakeShape({32, 32})));
+  EXPECT_EQ(2, Rank(MakeShape({-2, 32})));
+  EXPECT_EQ(-1, NumCoefficients(MakeShape({-2, 32})));
+  TensorShapeProto shape;
+  shape.set_unknown_rank(true);
+  EXPECT_EQ(-1, Rank(shape));
+  EXPECT_EQ(-1, NumCoefficients(shape));
+}
+
+TEST_F(SymbolicShapesTest, SizeRatio) {
+  EXPECT_EQ(16, ComputeSizeRatio(MakeShape({32, 32}), MakeShape({32, 2})));
+  EXPECT_EQ(16, ComputeSizeRatio(MakeShape({-2, 32}), MakeShape({-2, 2})));
+  EXPECT_EQ(16,
+            ComputeSizeRatio(MakeShape({-2, -2, 32}), MakeShape({-2, 2, -2})));
+  EXPECT_EQ(-1,
+            ComputeSizeRatio(MakeShape({-2, -2, 32}), MakeShape({-2, 2, 2})));
+  EXPECT_EQ(-1,
+            ComputeSizeRatio(MakeShape({-2, 2, 32}), MakeShape({-2, 2, -2})));
+  EXPECT_EQ(-1, ComputeSizeRatio(MakeShape({-2, -2}), MakeShape({-2, 2})));
+  EXPECT_EQ(-1, ComputeSizeRatio(MakeShape({-2, 32}), MakeShape({-2, -2})));
+  EXPECT_EQ(1, ComputeSizeRatio(MakeShape({-2, -3}), MakeShape({-3, -2})));
+  EXPECT_EQ(-1, ComputeSizeRatio(MakeShape({-1, 32}), MakeShape({-2, 2})));
+  EXPECT_EQ(-1, ComputeSizeRatio(MakeShape({-1, 32}), MakeShape({-2, 0})));
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index b87ae055469b673ea3be8c3b9d5c505201b00f16..1c6fef59eaec8e9b9ae867d01e52382fd758e15e 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -65,7 +65,7 @@ class NodeMap {
 // A vector with a set. The set stores the same elements as the vector, and
 // quickly answers whether a value is in the vector. Duplicated elements are not
 // allowed for now.
-template <class T>
+template <class T, class Hash = std::hash<T>>
 class SetVector {
  public:
   // Returns false if value already existed in the set, true otherwise.
@@ -91,7 +91,7 @@ class SetVector {
   void Reserve(int64 size) { vector_.reserve(size); }
 
  private:
-  std::unordered_set<T> set_;
+  std::unordered_set<T, Hash> set_;
   std::vector<T> vector_;
 };
 
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 44ef4a965b59a5198ca1ea38f94e5adf09821606..e540cc047679c982418c47125b17b608224751ef 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -97,8 +97,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:graph_view",
-        "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
     ],
 )
 
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 79b823fa2da1297c1b53efe78e544d855fbfa67b..5a5dc47fa06626a5363cf9106f41728f98389a54 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -380,16 +380,6 @@ GrapplerFunctionItem& GrapplerFunctionItem::SwapFunctionBody(GraphDef&& other) {
   return *this;
 }
 
-std::vector<string> OutputTensors(const GrapplerFunctionItem& item) {
-  std::vector<string> output_tensors;
-  for (const OutputArgExpansion& output : item.outputs()) {
-    for (const string& tensor : output.output_tensors) {
-      output_tensors.push_back(tensor);
-    }
-  }
-  return output_tensors;
-}
-
 bool HasParametrizedType(const FunctionDef& func) {
   const auto is_type_parametrized = [](const OpDef::ArgDef& arg) {
     return !arg.type_attr().empty() || !arg.number_attr().empty() ||
@@ -417,6 +407,63 @@ bool IsParametrized(const FunctionDef& func) {
   return HasParametrizedType(func) || HasParametrizedBody(func);
 }
 
+Status InstantiationTypeParameters(
+    const FunctionDef& func, const AttrValueMap& func_instantiation_attr,
+    std::unordered_map<string, DataType>* type_parameters) {
+  if (!type_parameters->empty()) {
+    return errors::InvalidArgument("Type parameters output map must be empty");
+  }
+
+  GrapplerFunctionItemInstantiation instantiation(&func_instantiation_attr);
+
+  const auto resolve_type_attr = [&](const OpDef::ArgDef& arg) {
+    // Check if it's unknown and unresolved type.
+    if (arg.type() == DT_INVALID &&
+        type_parameters->find(arg.type_attr()) == type_parameters->end()) {
+      DataType data_type;
+      TF_RETURN_IF_ERROR(instantiation.GetArgType(arg, &data_type));
+      type_parameters->insert({arg.type_attr(), data_type});
+    }
+    return Status::OK();
+  };
+
+  for (const auto& input : func.signature().input_arg())
+    TF_RETURN_IF_ERROR(resolve_type_attr(input));
+  for (const auto& output : func.signature().output_arg())
+    TF_RETURN_IF_ERROR(resolve_type_attr(output));
+
+  return Status::OK();
+}
+
+Status InstantiationBodyParameters(
+    const FunctionDef& func, const AttrValueMap& func_instantiation_attr,
+    std::unordered_map<string, AttrValue>* body_parameters) {
+  if (!body_parameters->empty()) {
+    return errors::InvalidArgument("Body parameters output map must be empty");
+  }
+
+  for (const NodeDef& func_body_node : func.node_def()) {
+    for (auto& attr : func_body_node.attr()) {
+      const string& placeholder = attr.second.placeholder();
+
+      if (placeholder.empty() ||
+          body_parameters->find(placeholder) != body_parameters->end()) {
+        continue;
+      }
+
+      auto it = func_instantiation_attr.find(placeholder);
+      if (it != func_instantiation_attr.end()) {
+        body_parameters->emplace(placeholder, it->second);
+      } else {
+        return errors::InvalidArgument("Can't resolve placeholder: ",
+                                       placeholder);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const AttrValueMap& func_instantiation_attr,
                                 const FunctionLibraryDefinition& flib,
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index d9d71b80ebcf4d0f65622e1a8364d9718275922d..6227daa71b57f5534bb1afb3aac33711693b9e01 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -176,9 +176,6 @@ class GrapplerFunctionItem : public GrapplerItem {
   bool is_stateful_;
 };
 
-// Return all output tensors referenced by item output args.
-std::vector<string> OutputTensors(const GrapplerFunctionItem& item);
-
 // Check if function input/output types are fully defined only at instantiation
 // time (parametrized by it's instantiation node).
 bool HasParametrizedType(const FunctionDef& func);
@@ -191,6 +188,19 @@ bool HasParametrizedBody(const FunctionDef& func);
 // Check if function has parametrized type or body.
 bool IsParametrized(const FunctionDef& func);
 
+// Resolve function instantiation type parameters from the attributes of the
+// caller node. Return error if type can't be resolved.
+Status InstantiationTypeParameters(
+    const FunctionDef& func, const AttrValueMap& func_instantiation_attr,
+    std::unordered_map<string, DataType>* type_parameters);
+
+// Resolve function instantiation body parameters (values for the function body
+// attr placeholders) from the attributes of the caller node. Return error if
+// type can't be resolved.
+Status InstantiationBodyParameters(
+    const FunctionDef& func, const AttrValueMap& func_instantiation_attr,
+    std::unordered_map<string, AttrValue>* body_parameters);
+
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity. Use function library definition to
 // lookup function body nodes output names and ranges.
@@ -205,10 +215,10 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
 // Make a GrapplerFunctionItem from the function definition and function
 // instantiation attributes (caller node attributes). Returns error if the given
 // function def cannot be converted (e.g. not all attributes are defined).
-Status MakeGrapplerFunctionItem(
-    const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
-    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const AttrValueMap& func_instantiation_attr,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item);
 
 // Make a GrapplerFunction item from the function definition. Function must be
 // fully defined (no type or body parametrization).
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index fa6fec70ff974410fa68f6713b1adb39fd68c281..302f02dd392b704ea676f86fafa8b65c1f211c69 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/protobuf/meta_graph.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -54,6 +53,44 @@ TEST_F(FunctionsTest, IsParametrized) {
   EXPECT_FALSE(IsParametrized(non_parametrized_func));
 }
 
+TEST_F(FunctionsTest, InstantiationParameters) {
+  // Function definition is invalid, only type/body parameters are important.
+  FunctionDef func = FunctionDefHelper::Create(
+      "ParametrizedFunc",
+      /* inputs */
+      {"input1:A", "input2:B", "input3:float"},
+      /* outputs */
+      {"output1: A", "output2:C"},
+      /* type parameters */
+      {"A: {float, double}", "B: {float, int32}", "C: {float, double}"},
+      /* function body*/
+      {{{"output"}, "FakeOp", {"input1", "input2"}, {{"key", "$key"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"x", "cx:output:0"}, {"y", "cy:output:0"}});
+
+  std::unordered_map<string, AttrValue> func_instantiation_attr;
+  func_instantiation_attr["key"].set_s("key-value");
+  func_instantiation_attr["A"].set_type(DT_FLOAT);
+  func_instantiation_attr["B"].set_type(DT_INT32);
+  func_instantiation_attr["C"].set_type(DT_DOUBLE);
+
+  std::unordered_map<string, DataType> type_parameters;
+  TF_EXPECT_OK(InstantiationTypeParameters(func, func_instantiation_attr,
+                                           &type_parameters));
+
+  ASSERT_EQ(3, type_parameters.size());
+  EXPECT_EQ(DT_FLOAT, type_parameters["A"]);
+  EXPECT_EQ(DT_INT32, type_parameters["B"]);
+  EXPECT_EQ(DT_DOUBLE, type_parameters["C"]);
+
+  std::unordered_map<string, AttrValue> body_parameters;
+  TF_EXPECT_OK(InstantiationBodyParameters(func, func_instantiation_attr,
+                                           &body_parameters));
+
+  ASSERT_EQ(1, body_parameters.size());
+  EXPECT_EQ("key-value", body_parameters["key"].s());
+}
+
 TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandFunctionDefInput) {
   GrapplerFunctionConnectivity connectivity;
 
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 3fb03cd5bd3a8d4a2fac904dbc1f04a9bec0a294..1bf6eafb583df2d0e5655c754282f709fa213822 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -360,7 +360,6 @@ cc_library(
     srcs = ["queue_base.cc"],
     hdrs = ["queue_base.h"],
     deps = [
-        ":batch_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -382,7 +381,6 @@ cc_library(
     srcs = ["priority_queue.cc"],
     hdrs = ["priority_queue.h"],
     deps = [
-        ":batch_util",
         ":queue_base",
         ":typed_queue",
         "//tensorflow/core:framework",
@@ -1697,7 +1695,6 @@ tf_kernel_library(
     name = "random_shuffle_queue_op",
     prefix = "random_shuffle_queue_op",
     deps = DATA_FLOW_DEPS + [
-        ":batch_util",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -1890,7 +1887,6 @@ cc_library(
     hdrs = ["fifo_queue.h"],
     visibility = ["//visibility:private"],
     deps = [
-        ":batch_util",
         ":queue_base",
         ":typed_queue",
         "//tensorflow/core:framework",
@@ -1905,7 +1901,6 @@ cc_library(
     hdrs = ["padding_fifo_queue.h"],
     visibility = ["//visibility:private"],
     deps = [
-        ":batch_util",
         ":fifo_queue",
         ":queue_base",
         ":typed_queue",
@@ -2032,9 +2027,12 @@ tf_kernel_library(
     name = "functional_ops",
     prefix = "functional_ops",
     deps = [
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//third_party/eigen3",
     ],
 )
@@ -4249,6 +4247,7 @@ cc_library(
         ":as_string_op",
         ":base64_ops",
         ":reduce_join_op",
+        ":regex_full_match_op",
         ":regex_replace_op",
         ":string_join_op",
         ":string_split_op",
@@ -4285,6 +4284,12 @@ tf_kernel_library(
     deps = STRING_DEPS,
 )
 
+tf_kernel_library(
+    name = "regex_full_match_op",
+    prefix = "regex_full_match_op",
+    deps = STRING_DEPS + ["@com_googlesource_code_re2//:re2"],
+)
+
 tf_kernel_library(
     name = "regex_replace_op",
     prefix = "regex_replace_op",
@@ -5001,7 +5006,6 @@ filegroup(
 filegroup(
     name = "android_extended_ops_group2",
     srcs = [
-        "batch_util.cc",
         "batchtospace_op.cc",
         "ctc_decoder_ops.cc",
         "decode_bmp_op.cc",
@@ -6128,9 +6132,10 @@ tf_mkl_kernel_library(
     ],
 )
 
+# NOTE(lespeholt): This rule is deprecated, please use:
+# tensorflow/core/util/batch_util.h
 cc_library(
     name = "batch_util",
-    srcs = ["batch_util.cc"],
     hdrs = ["batch_util.h"],
     deps = [
         "//tensorflow/core:framework",
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index a1c03f99181a6cbb86295ddc948d41d9e72c0a4b..475bda848db4a716a6a10715c5c050395bf23d45 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -329,6 +329,8 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
       c_ptrs.push_back(&c_device_memory.back());
     }
 
+    typedef Scalar Coefficient;
+
     // Cublas does
     // C = A x B
     // where A, B and C are assumed to be in column major.
@@ -352,9 +354,9 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
         bool blas_launch_status =
             stream
                 ->ThenBlasGemv(gemv_trans_a, adj_x ? m : k, adj_x ? k : m,
-                               static_cast<Scalar>(1.0), *(a_ptrs[0]),
+                               static_cast<Coefficient>(1.0), *(a_ptrs[0]),
                                adj_x ? m : k, *(b_ptrs[0]), 1,
-                               static_cast<Scalar>(0.0), c_ptrs[0], 1)
+                               static_cast<Coefficient>(0.0), c_ptrs[0], 1)
                 .ok();
         if (!blas_launch_status) {
           context->SetStatus(errors::Internal(
@@ -366,9 +368,9 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
         bool blas_launch_status =
             stream
                 ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k,
-                               static_cast<Scalar>(1.0), *(b_ptrs[0]),
+                               static_cast<Coefficient>(1.0), *(b_ptrs[0]),
                                adj_y ? k : n, *(a_ptrs[0]), adj_x ? m : k,
-                               static_cast<Scalar>(0.0), c_ptrs[0], n)
+                               static_cast<Coefficient>(0.0), c_ptrs[0], n)
                 .ok();
         if (!blas_launch_status) {
           context->SetStatus(errors::Internal(
@@ -383,8 +385,8 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
           stream
               ->ThenBlasGemmBatchedWithScratch(
                   blas_transpose_b, blas_transpose_a, n, m, k,
-                  static_cast<Scalar>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
-                  adj_x ? m : k, static_cast<Scalar>(0.0), c_ptrs, n,
+                  static_cast<Coefficient>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
+                  adj_x ? m : k, static_cast<Coefficient>(0.0), c_ptrs, n,
                   batch_size, &scratch_allocator)
               .ok();
       if (!blas_launch_status) {
@@ -398,6 +400,98 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
   }
 };
 
+template <>
+struct LaunchBatchMatMul<GPUDevice, Eigen::half> {
+  static void Launch(OpKernelContext* context, const Tensor& in_x,
+                     const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
+    typedef Eigen::half Scalar;
+    constexpr perftools::gputools::blas::Transpose kTranspose =
+        is_complex<Scalar>::value
+            ? perftools::gputools::blas::Transpose::kConjugateTranspose
+            : perftools::gputools::blas::Transpose::kTranspose;
+    perftools::gputools::blas::Transpose trans[] = {
+        perftools::gputools::blas::Transpose::kNoTranspose, kTranspose};
+    const uint64 m = in_x.dim_size(adj_x ? 2 : 1);
+    const uint64 k = in_x.dim_size(adj_x ? 1 : 2);
+    const uint64 n = in_y.dim_size(adj_y ? 1 : 2);
+    const uint64 batch_size = in_x.dim_size(0);
+    auto blas_transpose_a = trans[adj_x];
+    auto blas_transpose_b = trans[adj_y];
+
+    auto* stream = context->op_device_context()->stream();
+    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+    typedef perftools::gputools::DeviceMemory<Scalar> DeviceMemoryType;
+    std::vector<DeviceMemoryType> a_device_memory;
+    std::vector<DeviceMemoryType> b_device_memory;
+    std::vector<DeviceMemoryType> c_device_memory;
+    std::vector<DeviceMemoryType*> a_ptrs;
+    std::vector<DeviceMemoryType*> b_ptrs;
+    std::vector<DeviceMemoryType*> c_ptrs;
+    a_device_memory.reserve(batch_size);
+    b_device_memory.reserve(batch_size);
+    c_device_memory.reserve(batch_size);
+    a_ptrs.reserve(batch_size);
+    b_ptrs.reserve(batch_size);
+    c_ptrs.reserve(batch_size);
+    auto* a_base_ptr = in_x.template flat<Scalar>().data();
+    auto* b_base_ptr = in_y.template flat<Scalar>().data();
+    auto* c_base_ptr = out->template flat<Scalar>().data();
+    for (int64 i = 0; i < batch_size; ++i) {
+      a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k));
+      b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n));
+      c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n));
+      a_ptrs.push_back(&a_device_memory.back());
+      b_ptrs.push_back(&b_device_memory.back());
+      c_ptrs.push_back(&c_device_memory.back());
+    }
+
+    typedef float Coefficient;
+
+    // Cublas does
+    // C = A x B
+    // where A, B and C are assumed to be in column major.
+    // We want the output to be in row-major, so we can compute
+    // C' = B' x A', where ' stands for transpose (not adjoint).
+    // TODO(yangzihao): Choose the best of the three strategies using autotune.
+    if (batch_size == 1) {
+      // This is a regular matrix*matrix or matrix*vector multiply. Avoid the
+      // overhead of the scratch allocator and the batch interface.
+      // TODO(benbarsdell): Use fp16 Gemv if it becomes supported by CUBLAS
+      bool blas_launch_status =
+          stream
+              ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k,
+                             static_cast<Coefficient>(1.0), *(b_ptrs[0]),
+                             adj_y ? k : n, *(a_ptrs[0]), adj_x ? m : k,
+                             static_cast<Coefficient>(0.0), c_ptrs[0], n)
+              .ok();
+      if (!blas_launch_status) {
+        context->SetStatus(errors::Internal(
+            "Blas xGEMM launch failed : a.shape=", in_x.shape().DebugString(),
+            ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n,
+            ", k=", k));
+      }
+    } else {
+      CublasScratchAllocator scratch_allocator(context);
+      bool blas_launch_status =
+          stream
+              ->ThenBlasGemmBatchedWithScratch(
+                  blas_transpose_b, blas_transpose_a, n, m, k,
+                  static_cast<Coefficient>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
+                  adj_x ? m : k, static_cast<Coefficient>(0.0), c_ptrs, n,
+                  batch_size, &scratch_allocator)
+              .ok();
+      if (!blas_launch_status) {
+        context->SetStatus(
+            errors::Internal("Blas xGEMMBatched launch failed : a.shape=",
+                             in_x.shape().DebugString(), ", b.shape=",
+                             in_y.shape().DebugString(), ", m=", m, ", n=", n,
+                             ", k=", k, ", batch_size=", batch_size));
+      }
+    }
+  }
+};
+
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc
index 7e1e2aa4ec135872993f2e7738c7e863416eee87..87a0795f2fd6b401f1d0151ab05c551b96f3e509 100644
--- a/tensorflow/core/kernels/batch_matmul_op_real.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_real.cc
@@ -15,6 +15,10 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/batch_matmul_op_impl.h"
 
+#if GOOGLE_CUDA
+#include "cuda/include/cuda.h"
+#endif  // GOOGLE_CUDA
+
 namespace tensorflow {
 
 #if !defined(INTEL_MKL)
@@ -27,9 +31,8 @@ TF_CALL_int32(REGISTER_BATCH_MATMUL_CPU);
 #if GOOGLE_CUDA
 TF_CALL_float(REGISTER_BATCH_MATMUL_GPU);
 TF_CALL_double(REGISTER_BATCH_MATMUL_GPU);
-#if CUDA_VERSION >= 7050
-TF_CALL_half(REGISTER_BATCH_MATMUL_GPU);
-#endif
+// TODO(csigg): Implement Stream::ThenBlasGemv for Eigen::half and uncomment.
+// TF_CALL_half(REGISTER_BATCH_MATMUL_GPU);
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h
index 69098fbd1d8dccc2446edf502abd0d57c228640e..dad2ec4e103e19c37cb3893c05d7c05e6e80ab4c 100644
--- a/tensorflow/core/kernels/batch_util.h
+++ b/tensorflow/core/kernels/batch_util.h
@@ -12,43 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+// NOTE(lespeholt): This file is deprecated. Use
+// "tensorflow/core/util/batch_util.h" instead.
+
 #ifndef TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
 #define TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
 
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/lib/core/status.h"
-
-namespace tensorflow {
-namespace batch_util {
-
-// Copies element into the index^th slice of parent (in the 0th dimension).
-//
-// NOTE(mrry): The `element` argument is taken by value. Use `std::move()`
-// to move the `element` argument into this function, and the implementation
-// may be able to optimize the copy to a move. This is particularly important
-// for DT_STRING tensors.
-Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
-
-// Copies the index^th slice of parent (in the 0th dimension) into element.
-Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index);
-
-// Copies the index^th slice of parent (in the 0th dimension) into element.
-//
-// NOTE(mrry): The implementation may be able to optimize the copy to a move.
-// This is particularly important for DT_STRING tensors.
-Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index);
-
-// Zero-initializes the tensor `element` using the scalar stored in `padding`.
-// Both `element` and `padding` must have matching `dtype`.
-Status SetElementZero(Tensor* element, const Tensor& padding);
-
-// Copies `element` into a (0th dimension) slice of `parent`, assuming
-// the shape of `element` is strictly not larger along any axis than a
-// slice.
-Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
-                                int index);
-
-}  // namespace batch_util
-}  // namespace tensorflow
+#include "tensorflow/core/util/batch_util.h"
 
 #endif  // TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index ae652961db3e81f41f38c995ff9bf23f2287bb6e..b77c14d01284319e8d05d67ce1b6d1b94f4c394b 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -92,6 +92,9 @@ class AdaptiveSharedBatchScheduler
     // for num_batch_threads allows for large in_flight_batches_limit_, which
     // will harm latency for some time once load increases again.
     int64 num_batch_threads = port::NumSchedulableCPUs();
+    // Lower bound for in_flight_batches_limit_. As discussed above, can be used
+    // to minimize the damage caused by the random walk under low load.
+    int64 min_in_flight_batches_limit = 1;
     // Although batch selection is primarily based on age, this parameter
     // specifies a preference for larger batches.  A full batch will be
     // scheduled before an older, nearly empty batch as long as the age gap is
@@ -286,6 +289,16 @@ Status AdaptiveSharedBatchScheduler<TaskType>::Create(
     return errors::InvalidArgument("num_batch_threads must be positive; was ",
                                    options.num_batch_threads);
   }
+  if (options.min_in_flight_batches_limit < 1) {
+    return errors::InvalidArgument(
+        "min_in_flight_batches_limit must be >= 1; was ",
+        options.min_in_flight_batches_limit);
+  }
+  if (options.min_in_flight_batches_limit > options.num_batch_threads) {
+    return errors::InvalidArgument(
+        "min_in_flight_batches_limit (", options.min_in_flight_batches_limit,
+        ") must be <= num_batch_threads (", options.num_batch_threads, ")");
+  }
   if (options.full_batch_scheduling_boost_micros < 0) {
     return errors::InvalidArgument(
         "full_batch_scheduling_boost_micros can't be negative; was ",
@@ -298,11 +311,12 @@ Status AdaptiveSharedBatchScheduler<TaskType>::Create(
         ") should not be larger than num_batch_threads (",
         options.num_batch_threads, ")");
   }
-  if (options.initial_in_flight_batches_limit < 1) {
-    return errors::InvalidArgument(
-        "initial_in_flight_batches_limit should be "
-        "greater than or equal to 1; was ",
-        options.initial_in_flight_batches_limit);
+  if (options.initial_in_flight_batches_limit <
+      options.min_in_flight_batches_limit) {
+    return errors::InvalidArgument("initial_in_flight_batches_limit (",
+                                   options.initial_in_flight_batches_limit,
+                                   "must be >= min_in_flight_batches_limit (",
+                                   options.min_in_flight_batches_limit, ")");
   }
   if (options.batches_to_average_over < 1) {
     return errors::InvalidArgument(
@@ -437,7 +451,9 @@ void AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper(
     in_flight_batches_limit_ =
         std::min(in_flight_batches_limit_,
                  static_cast<double>(options_.num_batch_threads));
-    in_flight_batches_limit_ = std::max(in_flight_batches_limit_, 1.0);
+    in_flight_batches_limit_ =
+        std::max(in_flight_batches_limit_,
+                 static_cast<double>(options_.min_in_flight_batches_limit));
     last_avg_latency_ms_ = current_avg_latency_ms;
     last_latency_decreased_ = current_latency_decreased;
     batch_count_ = 0;
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
index 1be0c1f5c65cad3245a9392e2a2db61cfb2dd904..af356cf24db9a9ec716e661ed8ece3a3bf1cbcdd 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
@@ -80,6 +80,18 @@ TEST(AdaptiveSharedBatchSchedulerTest, BadOptions) {
   options = Scheduler::Options();
   options.batches_to_average_over = -5;
   EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = Scheduler::Options();
+  options.min_in_flight_batches_limit = 0;
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = Scheduler::Options();
+  options.min_in_flight_batches_limit = 5;
+  options.num_batch_threads = 3;
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = Scheduler::Options();
+  options.initial_in_flight_batches_limit = 1;
+  options.min_in_flight_batches_limit = 2;
+  options.num_batch_threads = 3;
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
 }
 
 TEST(AdaptiveSharedBatchSchedulerTest, InFlightBatchesLimit) {
diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
index 1b5ce32b7bed896ef8c990c79db402ff171eca94..20359f28d331ea35ff9629210a41c87edefa84c8 100644
--- a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
@@ -213,6 +213,12 @@ class BoostedTreesPredictOp : public OpKernel {
                                 &output_logits_t));
     auto output_logits = output_logits_t->matrix<float>();
 
+    // Return zero logits if it's an empty ensemble.
+    if (resource->num_trees() <= 0) {
+      output_logits.setZero();
+      return;
+    }
+
     const int32 latest_tree = resource->num_trees() - 1;
 
     auto do_work = [&resource, &batch_bucketized_features, &output_logits,
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index aca75176a565dd5b76793b143eab19f5ffc5aad6..bdd08222d40a7630384f04208b16080db77a7dd1 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -404,10 +404,9 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
     // image ('work_unit_size').
 
     // TODO(andydavis)
-    // *) Get L3 cache size from device at runtime (30MB is from ivybridge).
     // *) Consider reducing 'target_working_set_size' if L3 is shared by
     //    other concurrently running tensorflow ops.
-    const size_t target_working_set_size = (30LL << 20) / sizeof(T);
+    const size_t target_working_set_size = Eigen::l3CacheSize() / sizeof(T);
 
     const size_t size_A = output_image_size * filter_total_size;
 
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 63a775afa8bd69d5c9b97b5545086dface4312ac..95301b170fb6f2e2ae76e5f9e23fe32fb63760f0 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -420,9 +420,8 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
     const int output_image_size =
         dims.spatial_dims[0].output_size * dims.spatial_dims[1].output_size;
 
-    // TODO(andydavis) Get L2/L3 cache sizes from device.
-    const size_t l2_cache_size = 256LL << 10;
-    const size_t l3_cache_size = 30LL << 20;
+    const size_t l2_cache_size = Eigen::l2CacheSize();
+    const size_t l3_cache_size = Eigen::l3CacheSize();
 
     // Use L3 cache size as target working set size.
     const size_t target_working_set_size = l3_cache_size / sizeof(T);
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 9edc6d416e33d66514f60ec6dac0619776a7ea15..980b1063de9997a05304c719857a3ea82f40e650 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -195,8 +195,8 @@ class Conv3DBackpropInputOp : public OpKernel {
     TensorShape input_shape;
     if (takes_shape_) {
       const Tensor& input_sizes = context->input(0);
-      OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                  input_sizes.vec<int32>(), &input_shape));
+      // MakeShape is able to handle both DT_INT32 and DT_INT64 for input_sizes.
+      OP_REQUIRES_OK(context, MakeShape(input_sizes, &input_shape));
     } else {
       input_shape = context->input(0).shape();
     }
diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index 54ef9c6fb48cb9f50b99a4d8065b909f35cf4805..99d01b4db6bac68d890d93ac55bea576f43a5994 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -110,10 +110,10 @@ class CropAndResizeOp : public AsyncOpKernel {
  public:
   explicit CropAndResizeOp(OpKernelConstruction* context)
       : AsyncOpKernel(context) {
-    string method;
-    OP_REQUIRES_OK(context, context->GetAttr("method", &method));
-    OP_REQUIRES(context, method == "bilinear",
-                errors::InvalidArgument("method must be 'bilinear'", method));
+    OP_REQUIRES_OK(context, context->GetAttr("method", &method_));
+    OP_REQUIRES(context, method_ == "bilinear" || method_ == "nearest",
+                errors::InvalidArgument(
+                    "method must be 'bilinear' or 'nearest'", method_));
     OP_REQUIRES_OK(context, context->GetAttr("extrapolation_value",
                                              &extrapolation_value_));
   }
@@ -178,7 +178,7 @@ class CropAndResizeOp : public AsyncOpKernel {
       const Tensor& box_index = context->input(2);
       const bool status = functor::CropAndResize<Device, T>()(
           context, image.tensor<T, 4>(), boxes.tensor<float, 2>(),
-          box_index.tensor<int32, 1>(), extrapolation_value_,
+          box_index.tensor<int32, 1>(), method_, extrapolation_value_,
           output->tensor<float, 4>());
       if (!status) {
         context->SetStatus(
@@ -193,6 +193,7 @@ class CropAndResizeOp : public AsyncOpKernel {
 
  private:
   float extrapolation_value_;
+  string method_;
 };
 
 // Partial specialization of CropAndResize functor for a CPUDevice.
@@ -203,7 +204,7 @@ struct CropAndResize<CPUDevice, T> {
                   typename TTypes<T, 4>::ConstTensor image,
                   typename TTypes<float, 2>::ConstTensor boxes,
                   typename TTypes<int32, 1>::ConstTensor box_index,
-                  float extrapolation_value,
+                  const string& method_name, float extrapolation_value,
                   typename TTypes<float, 4>::Tensor crops) {
     const int batch_size = image.dimension(0);
     const int image_height = image.dimension(1);
@@ -247,37 +248,57 @@ struct CropAndResize<CPUDevice, T> {
             }
             continue;
           }
-          const int top_y_index = floorf(in_y);
-          const int bottom_y_index = ceilf(in_y);
-          const float y_lerp = in_y - top_y_index;
-
-          for (int x = 0; x < crop_width; ++x) {
-            const float in_x = (crop_width > 1)
-                                   ? x1 * (image_width - 1) + x * width_scale
-                                   : 0.5 * (x1 + x2) * (image_width - 1);
-            if (in_x < 0 || in_x > image_width - 1) {
+          if (method_name == "bilinear") {
+            const int top_y_index = floorf(in_y);
+            const int bottom_y_index = ceilf(in_y);
+            const float y_lerp = in_y - top_y_index;
+
+            for (int x = 0; x < crop_width; ++x) {
+              const float in_x = (crop_width > 1)
+                                     ? x1 * (image_width - 1) + x * width_scale
+                                     : 0.5 * (x1 + x2) * (image_width - 1);
+              if (in_x < 0 || in_x > image_width - 1) {
+                for (int d = 0; d < depth; ++d) {
+                  crops(b, y, x, d) = extrapolation_value;
+                }
+                continue;
+              }
+              const int left_x_index = floorf(in_x);
+              const int right_x_index = ceilf(in_x);
+              const float x_lerp = in_x - left_x_index;
+
               for (int d = 0; d < depth; ++d) {
-                crops(b, y, x, d) = extrapolation_value;
+                const float top_left(static_cast<float>(
+                    image(b_in, top_y_index, left_x_index, d)));
+                const float top_right(static_cast<float>(
+                    image(b_in, top_y_index, right_x_index, d)));
+                const float bottom_left(static_cast<float>(
+                    image(b_in, bottom_y_index, left_x_index, d)));
+                const float bottom_right(static_cast<float>(
+                    image(b_in, bottom_y_index, right_x_index, d)));
+                const float top = top_left + (top_right - top_left) * x_lerp;
+                const float bottom =
+                    bottom_left + (bottom_right - bottom_left) * x_lerp;
+                crops(b, y, x, d) = top + (bottom - top) * y_lerp;
               }
-              continue;
             }
-            const int left_x_index = floorf(in_x);
-            const int right_x_index = ceilf(in_x);
-            const float x_lerp = in_x - left_x_index;
-
-            for (int d = 0; d < depth; ++d) {
-              const float top_left(static_cast<float>(
-                  image(b_in, top_y_index, left_x_index, d)));
-              const float top_right(static_cast<float>(
-                  image(b_in, top_y_index, right_x_index, d)));
-              const float bottom_left(static_cast<float>(
-                  image(b_in, bottom_y_index, left_x_index, d)));
-              const float bottom_right(static_cast<float>(
-                  image(b_in, bottom_y_index, right_x_index, d)));
-              const float top = top_left + (top_right - top_left) * x_lerp;
-              const float bottom =
-                  bottom_left + (bottom_right - bottom_left) * x_lerp;
-              crops(b, y, x, d) = top + (bottom - top) * y_lerp;
+          } else {  // method == "nearest"
+            for (int x = 0; x < crop_width; ++x) {
+              const float in_x = (crop_width > 1)
+                                     ? x1 * (image_width - 1) + x * width_scale
+                                     : 0.5 * (x1 + x2) * (image_width - 1);
+              if (in_x < 0 || in_x > image_width - 1) {
+                for (int d = 0; d < depth; ++d) {
+                  crops(b, y, x, d) = extrapolation_value;
+                }
+                continue;
+              }
+              const int closest_x_index = roundf(in_x);
+              const int closest_y_index = roundf(in_y);
+              for (int d = 0; d < depth; ++d) {
+                crops(b, y, x, d) = static_cast<float>(
+                    image(b_in, closest_y_index, closest_x_index, d));
+              }
             }
           }
         }
@@ -285,12 +306,17 @@ struct CropAndResize<CPUDevice, T> {
     };
 
     // A rough estimation of the cost for each cropped box.
-    const double cost_per_pixel =
+    double cost_per_pixel =
         depth * (Eigen::TensorOpCost::AddCost<float>() * 6 +
                  Eigen::TensorOpCost::MulCost<float>() * 3 +
                  Eigen::TensorOpCost::CastCost<T, float>() * 4) +
         (Eigen::TensorOpCost::AddCost<float>() * 2 +
          Eigen::TensorOpCost::AddCost<float>() * 3);
+    if (method_name == "nearest") {
+      cost_per_pixel = depth * Eigen::TensorOpCost::CastCost<T, float>() +
+                       Eigen::TensorOpCost::AddCost<float>() * 4 +
+                       Eigen::TensorOpCost::MulCost<float>() * 4;
+    }
     const double cost_per_box = crop_height * crop_width * cost_per_pixel;
 
     const DeviceBase::CpuWorkerThreads& worker_threads =
@@ -309,10 +335,10 @@ class CropAndResizeGradImageOp : public AsyncOpKernel {
  public:
   explicit CropAndResizeGradImageOp(OpKernelConstruction* context)
       : AsyncOpKernel(context) {
-    string method;
-    OP_REQUIRES_OK(context, context->GetAttr("method", &method));
-    OP_REQUIRES(context, method == "bilinear",
-                errors::InvalidArgument("method must be 'bilinear'", method));
+    OP_REQUIRES_OK(context, context->GetAttr("method", &method_));
+    OP_REQUIRES(context, method_ == "bilinear" || method_ == "nearest",
+                errors::InvalidArgument(
+                    "method must be 'bilinear' or 'nearest'", method_));
   }
 
   void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
@@ -372,14 +398,14 @@ class CropAndResizeGradImageOp : public AsyncOpKernel {
             &output),
         done);
 
-    auto compute_callback = [context, output]() {
+    auto compute_callback = [this, context, output]() {
       const Tensor& grads = context->input(0);
       const Tensor& boxes = context->input(1);
       const Tensor& box_index = context->input(2);
       const bool status = functor::CropAndResizeBackpropImage<Device, T>()(
           context->eigen_device<Device>(), grads.tensor<float, 4>(),
           boxes.tensor<float, 2>(), box_index.tensor<int32, 1>(),
-          output->tensor<T, 4>());
+          output->tensor<T, 4>(), method_);
       if (!status) {
         context->SetStatus(errors::Internal(
             "Failed launch CropAndResizeBackpropImage kernel."));
@@ -390,6 +416,9 @@ class CropAndResizeGradImageOp : public AsyncOpKernel {
                                  batch_size, std::move(compute_callback),
                                  std::move(done));
   }
+
+ private:
+  string method_;
 };
 
 // Partial specialization of CropAndResizeBackpropImage functor for a CPUDevice.
@@ -400,7 +429,8 @@ struct CropAndResizeBackpropImage<CPUDevice, T> {
                   typename TTypes<float, 4>::ConstTensor grads,
                   typename TTypes<float, 2>::ConstTensor boxes,
                   typename TTypes<int32, 1>::ConstTensor box_index,
-                  typename TTypes<T, 4>::Tensor grads_image) {
+                  typename TTypes<T, 4>::Tensor grads_image,
+                  const string& method_name) {
     const int batch_size = grads_image.dimension(0);
     const int image_height = grads_image.dimension(1);
     const int image_width = grads_image.dimension(2);
@@ -448,21 +478,30 @@ struct CropAndResizeBackpropImage<CPUDevice, T> {
           if (in_x < 0 || in_x > image_width - 1) {
             continue;
           }
-          const int left_x_index = floorf(in_x);
-          const int right_x_index = ceilf(in_x);
-          const float x_lerp = in_x - left_x_index;
+          if (method_name == "bilinear") {
+            const int left_x_index = floorf(in_x);
+            const int right_x_index = ceilf(in_x);
+            const float x_lerp = in_x - left_x_index;
 
-          for (int d = 0; d < depth; ++d) {
-            const float dtop = (1 - y_lerp) * grads(b, y, x, d);
-            grads_image(b_in, top_y_index, left_x_index, d) +=
-                static_cast<T>((1 - x_lerp) * dtop);
-            grads_image(b_in, top_y_index, right_x_index, d) +=
-                static_cast<T>(x_lerp * dtop);
-            const float dbottom = y_lerp * grads(b, y, x, d);
-            grads_image(b_in, bottom_y_index, left_x_index, d) +=
-                static_cast<T>((1 - x_lerp) * dbottom);
-            grads_image(b_in, bottom_y_index, right_x_index, d) +=
-                static_cast<T>(x_lerp * dbottom);
+            for (int d = 0; d < depth; ++d) {
+              const float dtop = (1 - y_lerp) * grads(b, y, x, d);
+              grads_image(b_in, top_y_index, left_x_index, d) +=
+                  static_cast<T>((1 - x_lerp) * dtop);
+              grads_image(b_in, top_y_index, right_x_index, d) +=
+                  static_cast<T>(x_lerp * dtop);
+              const float dbottom = y_lerp * grads(b, y, x, d);
+              grads_image(b_in, bottom_y_index, left_x_index, d) +=
+                  static_cast<T>((1 - x_lerp) * dbottom);
+              grads_image(b_in, bottom_y_index, right_x_index, d) +=
+                  static_cast<T>(x_lerp * dbottom);
+            }
+          } else {  // method_name == "nearest"
+            for (int d = 0; d < depth; ++d) {
+              int closest_x_index = roundf(in_x);
+              int closest_y_index = roundf(in_y);
+              grads_image(b_in, closest_y_index, closest_x_index, d) +=
+                  static_cast<T>(grads(b, y, x, d));
+            }
           }
         }
       }
diff --git a/tensorflow/core/kernels/crop_and_resize_op.h b/tensorflow/core/kernels/crop_and_resize_op.h
index b6b1dbd7b0c3d44729f1a8d88f1d562062f61410..61dc3f941f6c8c62a61982e792371925357cc549 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.h
+++ b/tensorflow/core/kernels/crop_and_resize_op.h
@@ -31,7 +31,7 @@ struct CropAndResize {
                   typename TTypes<T, 4>::ConstTensor image,
                   typename TTypes<float, 2>::ConstTensor boxes,
                   typename TTypes<int32, 1>::ConstTensor box_ind,
-                  float extrapolation_value,
+                  string method_name, float extrapolation_value,
                   typename TTypes<float, 4>::Tensor crops);
 };
 
@@ -41,7 +41,8 @@ struct CropAndResizeBackpropImage {
   bool operator()(const Device& d, typename TTypes<float, 4>::ConstTensor grads,
                   typename TTypes<float, 2>::ConstTensor boxes,
                   typename TTypes<int32, 1>::ConstTensor box_ind,
-                  typename TTypes<T, 4>::Tensor grads_image);
+                  typename TTypes<T, 4>::Tensor grads_image,
+                  const string& method_name);
 };
 
 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
index d12787d5244d12f27dd03a32fecb6f2713af2c29..8ab08fb93aeef2651f2911047d91216c85392705 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
@@ -32,11 +32,16 @@ typedef Eigen::GpuDevice GPUDevice;
 
 namespace {
 
+enum InterpolationMethod {
+  BILINEAR = 0,
+  NEAREST = 1,
+};
+
 template <typename T>
 __global__ void CropAndResizeKernel(
     const int32 nthreads, const T* image_ptr, const float* boxes_ptr,
     const int32* box_ind_ptr, int num_boxes, int batch, int image_height,
-    int image_width, int crop_height, int crop_width, int depth,
+    int image_width, int crop_height, int crop_width, int depth, int method_id,
     float extrapolation_value, float* crops_ptr) {
   CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
     // out_idx = d + depth * (w + crop_width * (h + crop_height * b))
@@ -80,37 +85,47 @@ __global__ void CropAndResizeKernel(
       continue;
     }
 
-    const int top_y_index = floorf(in_y);
-    const int bottom_y_index = ceilf(in_y);
-    const float y_lerp = in_y - top_y_index;
-
-    const int left_x_index = floorf(in_x);
-    const int right_x_index = ceilf(in_x);
-    const float x_lerp = in_x - left_x_index;
-
-    const float top_left(static_cast<float>(
-        image_ptr[((b_in * image_height + top_y_index) * image_width +
-                   left_x_index) *
-                      depth +
-                  d]));
-    const float top_right(static_cast<float>(
-        image_ptr[((b_in * image_height + top_y_index) * image_width +
-                   right_x_index) *
-                      depth +
-                  d]));
-    const float bottom_left(static_cast<float>(
-        image_ptr[((b_in * image_height + bottom_y_index) * image_width +
-                   left_x_index) *
-                      depth +
-                  d]));
-    const float bottom_right(static_cast<float>(
-        image_ptr[((b_in * image_height + bottom_y_index) * image_width +
-                   right_x_index) *
-                      depth +
-                  d]));
-    const float top = top_left + (top_right - top_left) * x_lerp;
-    const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
-    crops_ptr[out_idx] = top + (bottom - top) * y_lerp;
+    if (method_id == BILINEAR) {
+      const int top_y_index = floorf(in_y);
+      const int bottom_y_index = ceilf(in_y);
+      const float y_lerp = in_y - top_y_index;
+
+      const int left_x_index = floorf(in_x);
+      const int right_x_index = ceilf(in_x);
+      const float x_lerp = in_x - left_x_index;
+
+      const float top_left(static_cast<float>(
+          image_ptr[((b_in * image_height + top_y_index) * image_width +
+                     left_x_index) *
+                        depth +
+                    d]));
+      const float top_right(static_cast<float>(
+          image_ptr[((b_in * image_height + top_y_index) * image_width +
+                     right_x_index) *
+                        depth +
+                    d]));
+      const float bottom_left(static_cast<float>(
+          image_ptr[((b_in * image_height + bottom_y_index) * image_width +
+                     left_x_index) *
+                        depth +
+                    d]));
+      const float bottom_right(static_cast<float>(
+          image_ptr[((b_in * image_height + bottom_y_index) * image_width +
+                     right_x_index) *
+                        depth +
+                    d]));
+      const float top = top_left + (top_right - top_left) * x_lerp;
+      const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
+      crops_ptr[out_idx] = top + (bottom - top) * y_lerp;
+    } else {  // method_id == kMethodNearestId
+      const int closest_x_index = roundf(in_x);
+      const int closest_y_index = roundf(in_y);
+      crops_ptr[out_idx] = static_cast<float>(
+          image_ptr[((b_in * image_height + closest_y_index) * image_width +
+                     closest_x_index) *
+                        depth +
+                    d]);
+    }
   }
 }
 
@@ -119,7 +134,7 @@ __global__ void CropAndResizeBackpropImageKernel(
     const int32 nthreads, const float* grads_ptr, const float* boxes_ptr,
     const int32* box_ind_ptr, int num_boxes, int batch, int image_height,
     int image_width, int crop_height, int crop_width, int depth,
-    T* grads_image_ptr) {
+    T* grads_image_ptr, int method_id) {
   CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
     // out_idx = d + depth * (w + crop_width * (h + crop_height * b))
     int idx = out_idx;
@@ -160,41 +175,52 @@ __global__ void CropAndResizeBackpropImageKernel(
       continue;
     }
 
-    const int top_y_index = floorf(in_y);
-    const int bottom_y_index = ceilf(in_y);
-    const float y_lerp = in_y - top_y_index;
-
-    const int left_x_index = floorf(in_x);
-    const int right_x_index = ceilf(in_x);
-    const float x_lerp = in_x - left_x_index;
-
-    const float dtop = (1 - y_lerp) * grads_ptr[out_idx];
-    CudaAtomicAdd(
-        grads_image_ptr +
-            ((b_in * image_height + top_y_index) * image_width + left_x_index) *
-                depth +
-            d,
-        static_cast<T>((1 - x_lerp) * dtop));
-    CudaAtomicAdd(grads_image_ptr +
-                      ((b_in * image_height + top_y_index) * image_width +
-                       right_x_index) *
-                          depth +
-                      d,
-                  static_cast<T>(x_lerp * dtop));
-
-    const float dbottom = y_lerp * grads_ptr[out_idx];
-    CudaAtomicAdd(grads_image_ptr +
-                      ((b_in * image_height + bottom_y_index) * image_width +
-                       left_x_index) *
-                          depth +
-                      d,
-                  static_cast<T>((1 - x_lerp) * dbottom));
-    CudaAtomicAdd(grads_image_ptr +
-                      ((b_in * image_height + bottom_y_index) * image_width +
-                       right_x_index) *
-                          depth +
-                      d,
-                  static_cast<T>(x_lerp * dbottom));
+    if (method_id == BILINEAR) {
+      const int top_y_index = floorf(in_y);
+      const int bottom_y_index = ceilf(in_y);
+      const float y_lerp = in_y - top_y_index;
+
+      const int left_x_index = floorf(in_x);
+      const int right_x_index = ceilf(in_x);
+      const float x_lerp = in_x - left_x_index;
+
+      const float dtop = (1 - y_lerp) * grads_ptr[out_idx];
+      CudaAtomicAdd(grads_image_ptr +
+                        ((b_in * image_height + top_y_index) * image_width +
+                         left_x_index) *
+                            depth +
+                        d,
+                    static_cast<T>((1 - x_lerp) * dtop));
+      CudaAtomicAdd(grads_image_ptr +
+                        ((b_in * image_height + top_y_index) * image_width +
+                         right_x_index) *
+                            depth +
+                        d,
+                    static_cast<T>(x_lerp * dtop));
+
+      const float dbottom = y_lerp * grads_ptr[out_idx];
+      CudaAtomicAdd(grads_image_ptr +
+                        ((b_in * image_height + bottom_y_index) * image_width +
+                         left_x_index) *
+                            depth +
+                        d,
+                    static_cast<T>((1 - x_lerp) * dbottom));
+      CudaAtomicAdd(grads_image_ptr +
+                        ((b_in * image_height + bottom_y_index) * image_width +
+                         right_x_index) *
+                            depth +
+                        d,
+                    static_cast<T>(x_lerp * dbottom));
+    } else {  // method_id == NEAREST
+      const int closest_x_index = roundf(in_x);
+      const int closest_y_index = roundf(in_y);
+      CudaAtomicAdd(grads_image_ptr +
+                        ((b_in * image_height + closest_y_index) * image_width +
+                         closest_x_index) *
+                            depth +
+                        d,
+                    static_cast<T>(grads_ptr[out_idx]));
+    }
   }
 }
 
@@ -324,7 +350,7 @@ struct CropAndResize<GPUDevice, T> {
                   typename TTypes<T, 4>::ConstTensor image,
                   typename TTypes<float, 2>::ConstTensor boxes,
                   typename TTypes<int32, 1>::ConstTensor box_ind,
-                  float extrapolation_value,
+                  string method_name, float extrapolation_value,
                   typename TTypes<float, 4>::Tensor crops) {
     const int batch = image.dimension(0);
     const int image_height = image.dimension(1);
@@ -338,13 +364,19 @@ struct CropAndResize<GPUDevice, T> {
     const int total_count = num_boxes * crop_height * crop_width * depth;
     const GPUDevice& d = context->eigen_device<GPUDevice>();
 
+    InterpolationMethod method = BILINEAR;
+    if (method_name == "nearest") {
+      method = NEAREST;
+    }
+
     if (total_count > 0) {
       CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
       CropAndResizeKernel<<<config.block_count, config.thread_per_block, 0,
                             d.stream()>>>(
           config.virtual_thread_count, image.data(), boxes.data(),
           box_ind.data(), num_boxes, batch, image_height, image_width,
-          crop_height, crop_width, depth, extrapolation_value, crops.data());
+          crop_height, crop_width, depth, method, extrapolation_value,
+          crops.data());
     }
     return d.ok();
   }
@@ -356,7 +388,8 @@ struct CropAndResizeBackpropImage<GPUDevice, T> {
                   typename TTypes<float, 4>::ConstTensor grads,
                   typename TTypes<float, 2>::ConstTensor boxes,
                   typename TTypes<int32, 1>::ConstTensor box_ind,
-                  typename TTypes<T, 4>::Tensor grads_image) {
+                  typename TTypes<T, 4>::Tensor grads_image,
+                  const string& method_name) {
     const int batch = grads_image.dimension(0);
     const int image_height = grads_image.dimension(1);
     const int image_width = grads_image.dimension(2);
@@ -377,6 +410,12 @@ struct CropAndResizeBackpropImage<GPUDevice, T> {
           config.virtual_thread_count, grads_image.data());
     }
 
+    // Configurate interpolation method.
+    InterpolationMethod method = BILINEAR;
+    if (method_name == "nearest") {
+      method = NEAREST;
+    }
+
     // Accumulate.
     total_count = num_boxes * crop_height * crop_width * depth;
     if (total_count > 0) {
@@ -385,7 +424,7 @@ struct CropAndResizeBackpropImage<GPUDevice, T> {
           config.block_count, config.thread_per_block, 0, d.stream()>>>(
           config.virtual_thread_count, grads.data(), boxes.data(),
           box_ind.data(), num_boxes, batch, image_height, image_width,
-          crop_height, crop_width, depth, grads_image.data());
+          crop_height, crop_width, depth, grads_image.data(), method);
     }
     return d.ok();
   }
diff --git a/tensorflow/core/kernels/crop_and_resize_op_test.cc b/tensorflow/core/kernels/crop_and_resize_op_test.cc
index 709082e79903d041771cc19235d4dee76fce66b3..6921020d09e94fa7b99d7ca6cb95c82274b2e4c0 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_test.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_test.cc
@@ -34,13 +34,14 @@ namespace tensorflow {
 class CropAndResizeOpTest : public OpsTestBase {
  protected:
   template <typename T>
-  void MakeOp(float extrapolation_value) {
+  void MakeOp(float extrapolation_value, const string& method) {
     TF_EXPECT_OK(NodeDefBuilder("crop_and_resize_op", "CropAndResize")
                      .Input(FakeInput(DataTypeToEnum<T>::value))
                      .Input(FakeInput(DT_FLOAT))
                      .Input(FakeInput(DT_INT32))
                      .Input(FakeInput(DT_INT32))
                      .Attr("extrapolation_value", extrapolation_value)
+                     .Attr("method", method)
                      .Finalize(node_def()));
     TF_EXPECT_OK(InitOp());
   }
@@ -48,7 +49,7 @@ class CropAndResizeOpTest : public OpsTestBase {
 
 #define REGISTER_TEST(T)                                               \
   TEST_F(CropAndResizeOpTest, TestCropAndResize##T) {                  \
-    MakeOp<T>(0);                                                      \
+    MakeOp<T>(0, "bilinear");                                          \
     AddInputFromArray<T>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});     \
     AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});       \
     AddInputFromArray<int32>(TensorShape({1}), {0});                   \
@@ -58,6 +59,19 @@ class CropAndResizeOpTest : public OpsTestBase {
     Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1})); \
     test::FillValues<float>(&expected, {2.5});                         \
     test::ExpectTensorEqual<float>(expected, *GetOutput(0));           \
+  }                                                                    \
+                                                                       \
+  TEST_F(CropAndResizeOpTest, TestCropAndResize##T##nearest) {         \
+    MakeOp<T>(0, "nearest");                                           \
+    AddInputFromArray<T>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});     \
+    AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});       \
+    AddInputFromArray<int32>(TensorShape({1}), {0});                   \
+    AddInputFromArray<int32>(TensorShape({2}), {1, 1});                \
+    TF_ASSERT_OK(RunOpKernel());                                       \
+                                                                       \
+    Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1})); \
+    test::FillValues<float>(&expected, {4.0});                         \
+    test::ExpectTensorEqual<float>(expected, *GetOutput(0));           \
   }
 
 REGISTER_TEST(float)
@@ -72,7 +86,7 @@ REGISTER_TEST(int64)
 #undef REGISTER_TEST
 
 TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To1x1Uint8) {
-  MakeOp<uint8>(0);
+  MakeOp<uint8>(0, "bilinear");
   // Input:
   //  1, 2
   //  3, 4
@@ -87,8 +101,24 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To1x1Uint8) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To1x1Uint8NearestNeibor) {
+  MakeOp<uint8>(0, "nearest");
+  // Input:
+  //  1, 2
+  //  3, 4
+  AddInputFromArray<uint8>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+  AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
+  AddInputFromArray<int32>(TensorShape({1}), {0});
+  AddInputFromArray<int32>(TensorShape({2}), {1, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillValues<float>(&expected, {4.0});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
 TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To1x1Flipped) {
-  MakeOp<float>(0);
+  MakeOp<float>(0, "bilinear");
   // Input:
   //  1, 2
   //  3, 4
@@ -103,8 +133,24 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To1x1Flipped) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To1x1FlippedNearestNeighbor) {
+  MakeOp<float>(0, "nearest");
+  // Input:
+  //  1, 2
+  //  3, 4
+  AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+  AddInputFromArray<float>(TensorShape({1, 4}), {1, 1, 0, 0});
+  AddInputFromArray<int32>(TensorShape({1}), {0});
+  AddInputFromArray<int32>(TensorShape({2}), {1, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillValues<float>(&expected, {4.0});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
 TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3) {
-  MakeOp<float>(0);
+  MakeOp<float>(0, "bilinear");
   // Input:
   //  1, 2
   //  3, 4
@@ -124,8 +170,29 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3NearestNeighbor) {
+  MakeOp<float>(0, "nearest");
+  // Input:
+  //  1, 2
+  //  3, 4
+  AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+  AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
+  AddInputFromArray<int32>(TensorShape({1}), {0});
+  AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 1}));
+  // clang-format off
+  test::FillValues<float>(&expected,
+    {1,  2,  2,
+     3,  4,  4,
+     3,  4,  4});
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
 TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3Flipped) {
-  MakeOp<float>(0);
+  MakeOp<float>(0, "bilinear");
   // Input:
   //  1, 2
   //  3, 4
@@ -145,8 +212,54 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3Flipped) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3FlippedNearestNeighbor) {
+  MakeOp<float>(0, "nearest");
+  // Input:
+  //  1, 2
+  //  3, 4
+  AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+  AddInputFromArray<float>(TensorShape({1, 4}), {1, 1, 0, 0});
+  AddInputFromArray<int32>(TensorShape({1}), {0});
+  AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 1}));
+  // clang-format off
+  test::FillValues<float>(&expected,
+    {4,  4,  3,
+     4,  4,  3,
+     2,  2,  1});
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
 TEST_F(CropAndResizeOpTest, TestCropAndResize3x3To2x2) {
-  MakeOp<float>(0);
+  MakeOp<float>(0, "bilinear");
+  // Input:
+  //  1, 2, 3
+  //  4, 5, 6
+  //  7, 8, 9
+  AddInputFromArray<float>(TensorShape({1, 3, 3, 1}),
+                           {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  AddInputFromArray<float>(TensorShape({2, 4}), {0, 0, 1, 1, 0, 0, 0.5, 0.5});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 0});
+  AddInputFromArray<int32>(TensorShape({2}), {2, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 2, 1}));
+
+  // clang-format off
+  test::FillValues<float>(&expected,
+    {1,  3,
+     7,  9,
+     1,  2,
+     4,  5});
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(CropAndResizeOpTest, TestCropAndResize3x3To2x2NearestNeighbor) {
+  MakeOp<float>(0, "nearest");
   // Input:
   //  1, 2, 3
   //  4, 5, 6
@@ -171,7 +284,32 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize3x3To2x2) {
 }
 
 TEST_F(CropAndResizeOpTest, TestCropAndResize3x3To2x2Flipped) {
-  MakeOp<float>(0);
+  MakeOp<float>(0, "bilinear");
+  // Input:
+  //  1, 2, 3
+  //  4, 5, 6
+  //  7, 8, 9
+  AddInputFromArray<float>(TensorShape({1, 3, 3, 1}),
+                           {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  AddInputFromArray<float>(TensorShape({2, 4}), {1, 1, 0, 0, 0.5, 0.5, 0, 0});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 0});
+  AddInputFromArray<int32>(TensorShape({2}), {2, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 2, 1}));
+
+  // clang-format off
+  test::FillValues<float>(&expected,
+    {9,  7,
+     3,  1,
+     5,  4,
+     2,  1});
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(CropAndResizeOpTest, TestCropAndResize3x3To2x2FlippedNearestNeighbor) {
+  MakeOp<float>(0, "nearest");
   // Input:
   //  1, 2, 3
   //  4, 5, 6
@@ -197,7 +335,7 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize3x3To2x2Flipped) {
 
 TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3Extrapolated) {
   const float v = -1;
-  MakeOp<float>(v);
+  MakeOp<float>(v, "bilinear");
   // Input:
   //  1, 2
   //  3, 4
@@ -218,7 +356,7 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3Extrapolated) {
 }
 
 TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3NoCrop) {
-  MakeOp<float>(0);
+  MakeOp<float>(0, "bilinear");
   // Input:
   //  1, 2
   //  3, 4
@@ -236,7 +374,7 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3NoCrop) {
 }
 
 TEST_F(CropAndResizeOpTest, TestInvalidInputShape) {
-  MakeOp<float>(0);
+  MakeOp<float>(0, "bilinear");
   AddInputFromArray<float>(TensorShape({2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
   AddInputFromArray<int32>(TensorShape({1}), {0});
@@ -248,7 +386,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidInputShape) {
 }
 
 TEST_F(CropAndResizeOpTest, TestInvalidBoxIndexShape) {
-  MakeOp<float>(0);
+  MakeOp<float>(0, "bilinear");
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
@@ -261,7 +399,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndexShape) {
 }
 
 TEST_F(CropAndResizeOpTest, TestInvalidBoxIndex) {
-  MakeOp<float>(0);
+  MakeOp<float>(0, "bilinear");
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
   AddInputFromArray<int32>(TensorShape({1}), {1});
@@ -274,7 +412,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndex) {
 }
 
 TEST_F(CropAndResizeOpTest, TestWithSharding) {
-  MakeOp<float>(0);
+  MakeOp<float>(0, "bilinear");
   // Generate a relatively large input (999x999) so that sharding happens.
   const int kLength = 999;  // Length of the input. Must use an odd number.
   const int kHalf = (kLength + 1) / 2;  // Half size for the cropped result.
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index 25560b7c28273fafb5af1aa98062a4d0312cc574..04959df38d821aa4d321303c13f8547bcc34accd 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -352,7 +352,7 @@ struct ToTFDataType<uint8> : std::integral_constant<DataType, DT_UINT8> {};
 template <typename T>
 class CudnnRnnAllocatorInTemp : public ScratchAllocator {
  public:
-  ~CudnnRnnAllocatorInTemp() = default;
+  ~CudnnRnnAllocatorInTemp() override = default;
 
   explicit CudnnRnnAllocatorInTemp(OpKernelContext* context)
       : context_(context) {}
@@ -571,7 +571,7 @@ Status ExtractForwardInput(OpKernelContext* context,
           : 1;
 
   if ((*input_h)->dims() != 3) {
-    return errors::InvalidArgument("RNN input must be a 3-D vector.");
+    return errors::InvalidArgument("RNN input_h must be a 3-D vector.");
   }
   model_shapes->num_layers = (*input_h)->dim_size(0) / model_shapes->dir_count;
   model_shapes->num_units = (*input_h)->dim_size(2);
@@ -1411,7 +1411,7 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
       CudnnRnnAllocatorInTemp<T> reserve_space_allocator(context);
       CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
       status = DoForward<T>(
-          context, *rnn_desc.get(), model_types(), model_shapes, input, input_h,
+          context, *rnn_desc, model_types(), model_shapes, input, input_h,
           input_c, params, is_training(), output, output_h, output_c,
           &reserve_space_allocator, &workspace_allocator, &fwd_profile_result);
       if (!status.ok()) {
@@ -1422,12 +1422,11 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
         // Get reserve space from the forward pass.
         Tensor reserve_space = reserve_space_allocator.get_allocated_tensor(0);
         status = DoBackward<T>(
-            context, *rnn_desc.get(), model_types(), model_shapes, input,
-            input_h, input_c, params, output, output_h, output_c,
-            &output_backprop, &output_h_backprop, &output_c_backprop,
-            &reserve_space, &input_backprop, &input_h_backprop,
-            &input_c_backprop, &params_backprop, &workspace_allocator,
-            &bak_profile_result);
+            context, *rnn_desc, model_types(), model_shapes, input, input_h,
+            input_c, params, output, output_h, output_c, &output_backprop,
+            &output_h_backprop, &output_c_backprop, &reserve_space,
+            &input_backprop, &input_h_backprop, &input_c_backprop,
+            &params_backprop, &workspace_allocator, &bak_profile_result);
         if (!status.ok()) {
           continue;
         }
diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
index 14d889e8e3b7c03554388cd51054ee52966329cd..49b90e855be64995966b41e2de57781f3e26277a 100644
--- a/tensorflow/core/kernels/cwise_op_clip.cc
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -33,52 +33,41 @@ class ClipOp : public OpKernel {
     const Tensor& in0 = ctx->input(0);
     const Tensor& in1 = ctx->input(1);
     const Tensor& in2 = ctx->input(2);
+    OP_REQUIRES(ctx, (in0.shape() == in1.shape() ||
+                      TensorShapeUtils::IsScalar(in1.shape())) &&
+                     (in0.shape() == in2.shape() ||
+                      TensorShapeUtils::IsScalar(in2.shape())),
+                errors::InvalidArgument(
+                    "clip_value_min and clip_value_max must be either of "
+                    "the same shape as input, or a scalar. ",
+                    "input shape: ", in0.shape().DebugString(),
+                    "clip_value_min shape: ", in1.shape().DebugString(),
+                    "clip_value_max shape: ", in2.shape().DebugString()));
+
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out));
+    if (out->NumElements() == 0) return;  // Nothing to do for empty output
 
     auto in0_flat = in0.flat<T>();
     auto in1_flat = in1.flat<T>();
     auto in2_flat = in2.flat<T>();
+    auto out_flat = out->flat<T>();
     const Device& d = ctx->eigen_device<Device>();
 
-    Tensor* out = nullptr;
-    OP_REQUIRES_OK(
-        ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out));
-    auto out_flat = out->flat<T>();
     if (in1.shape() == in2.shape()) {
       if (in0.shape() == in1.shape()) {
         functor::TernaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                             out_flat);
       } else {
-        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()),
-                    errors::InvalidArgument(
-                        "clip_value_min and clip_value_max must be either of "
-                        "the same shape as input, or a scalar. ",
-                        "input shape: ", in0.shape().DebugString(),
-                        "clip_value_min shape: ", in1.shape().DebugString(),
-                        "clip_value_max shape: ", in2.shape().DebugString()));
         functor::UnaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                           out_flat);
       }
     } else {
       if (in0.shape() == in1.shape()) {
-        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()),
-                    errors::InvalidArgument(
-                        "clip_value_min and clip_value_max must be either of "
-                        "the same shape as input, or a scalar. ",
-                        "input shape: ", in0.shape().DebugString(),
-                        "clip_value_min shape: ", in1.shape().DebugString(),
-                        "clip_value_max shape: ", in2.shape().DebugString()));
         functor::BinaryLeftClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                                out_flat);
       } else {
-        OP_REQUIRES(ctx,
-                    (in0.shape() == in2.shape() &&
-                     TensorShapeUtils::IsScalar(in1.shape())),
-                    errors::InvalidArgument(
-                        "clip_value_min and clip_value_max must be either of "
-                        "the same shape as input, or a scalar. ",
-                        "input shape: ", in0.shape().DebugString(),
-                        "clip_value_min shape: ", in1.shape().DebugString(),
-                        "clip_value_max shape: ", in2.shape().DebugString()));
         functor::BinaryRightClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                                 out_flat);
       }
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 9ded2667eb0c7a84a0a642b00776eb66b3b4b003..d35aad980de6affa39b9fa482a1448f3d8c5249e 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -81,7 +81,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:batch_util",
     ],
 )
 
@@ -94,7 +93,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:batch_util",
     ],
 )
 
@@ -107,7 +105,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:batch_util",
     ],
 )
 
@@ -443,7 +440,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:batch_util",
     ],
 )
 
@@ -456,7 +452,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:batch_util",
     ],
 )
 
@@ -469,7 +464,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:batch_util",
     ],
 )
 
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 7fa67efb9e22e6877b97524150b9024521619dbc..3618c75827f7bff57183ef3ccbf24ac131547e55 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index 67ddb52d577edb6609764b55c66f1885e6e4d60b..c608f9e1c670971769a4d5c27ff061ddc459549b 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -46,18 +46,6 @@ Status MakeIteratorFromInputElement(
   return Status::OK();
 }
 
-IteratorContext MakeIteratorContext(OpKernelContext* ctx) {
-  IteratorContext::Params params;
-  params.env = ctx->env();
-  params.runner = *(ctx->runner());
-  params.lib = ctx->function_library();
-  DeviceBase* device = ctx->function_library()->device();
-  params.allocator_getter = [device](AllocatorAttributes attrs) {
-    return device->GetAllocator(attrs);
-  };
-  return IteratorContext(params);
-}
-
 }  // namespace dataset
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index e5ca71dd99d7f5cba095f9ce3daa016150c4a6c9..6c4191c2be6c55bfde7c5e8bd2e3b1e92edbaf27 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -28,8 +28,6 @@ Status MakeIteratorFromInputElement(
     int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
     std::unique_ptr<IteratorBase>* out_iterator);
 
-IteratorContext MakeIteratorContext(OpKernelContext* ctx);
-
 }  // namespace dataset
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index a2f6c5fe2c3a4b92e67ba2785b2509e4f5029213..b6bf0ecd096f157e1ecf0cb1a02156b6844c8730 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -1051,7 +1051,7 @@ class DeserializeIteratorOp : public OpKernel {
     IteratorResource* iterator_resource;
     OP_REQUIRES_OK(
         ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator_resource));
-
+    core::ScopedUnref unref_iterator(iterator_resource);
     Variant variant = ctx->input(1).scalar<Variant>()();
     auto* wrapper = variant.get<IteratorStateVariant>();
     OP_REQUIRES(ctx, wrapper != nullptr,
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index c9551fbf16a2be22bf48d5d4bef808c31eff6b90..879bb40331e139e240d24d1d537266e50a534423 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #define EIGEN_USE_THREADS
 
+#include <utility>
+
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -21,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/tracing.h"
@@ -36,7 +39,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
+        graph_def_version_(ctx->graph_def_version()),
+        op_version_(ctx->def().op() == "MapAndBatchDataset" ? 1 : 2) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
@@ -59,12 +63,29 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         ctx, batch_size > 0,
         errors::InvalidArgument("batch_size must be greater than zero."));
 
-    int64 num_parallel_batches;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_batches",
-                                            &num_parallel_batches));
-    OP_REQUIRES(ctx, num_parallel_batches > 0,
-                errors::InvalidArgument(
-                    "num_parallel_batches must be greater than zero."));
+    int64 num_parallel_calls;
+    switch (op_version_) {
+      case 1:
+        int64 num_parallel_batches;
+        OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_batches",
+                                                &num_parallel_batches));
+        num_parallel_calls = num_parallel_batches * batch_size;
+        OP_REQUIRES(ctx, num_parallel_batches > 0,
+                    errors::InvalidArgument(
+                        "num_parallel_batches must be greater than zero."));
+        break;
+      case 2:
+        OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
+                                                &num_parallel_calls));
+        OP_REQUIRES(ctx, num_parallel_calls > 0,
+                    errors::InvalidArgument(
+                        "num_parallel_calls must be greater than zero."));
+        break;
+      default:
+        OP_REQUIRES(ctx, false,
+                    errors::Unimplemented("Unsupported operation version %d.",
+                                          op_version_));
+    }
 
     bool drop_remainder;
     OP_REQUIRES_OK(ctx,
@@ -74,7 +95,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(
                             func_, std::move(other_arguments), &captured_func));
 
-    *output = new Dataset(ctx, input, batch_size, num_parallel_batches,
+    *output = new Dataset(ctx, input, batch_size, num_parallel_calls,
                           drop_remainder, output_types_, output_shapes_, func_,
                           std::move(captured_func), &ctx->eigen_cpu_device());
   }
@@ -83,7 +104,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
   class Dataset : public GraphDatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
-            int64 num_parallel_batches, bool drop_remainder,
+            int64 num_parallel_calls, bool drop_remainder,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
             const NameAttrList& func,
@@ -92,7 +113,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         : GraphDatasetBase(ctx),
           input_(input),
           batch_size_(batch_size),
-          num_parallel_batches_(num_parallel_batches),
+          num_parallel_calls_(num_parallel_calls),
           drop_remainder_(drop_remainder),
           output_types_(output_types),
           output_shapes_(output_shapes),
@@ -128,9 +149,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
       Node* batch_size_node;
       TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node));
-      Node* num_parallel_batches_node;
+      Node* num_parallel_calls_node;
       TF_RETURN_IF_ERROR(
-          b->AddScalar(num_parallel_batches_, &num_parallel_batches_node));
+          b->AddScalar(num_parallel_calls_, &num_parallel_calls_node));
       Node* drop_remainder_node;
       TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder_node));
 
@@ -153,7 +174,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           this,
           {std::make_pair(0, input_graph_node),
            std::make_pair(2, batch_size_node),
-           std::make_pair(3, num_parallel_batches_node),
+           std::make_pair(3, num_parallel_calls_node),
            std::make_pair(4, drop_remainder_node)},  // Single tensor inputs.
           {std::make_pair(1, other_arguments)},      // Tensor list inputs.
           {std::make_pair("f", f),
@@ -168,129 +189,54 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
             input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
-            invocation_results_(params.dataset->batch_size_ *
-                                params.dataset->num_parallel_batches_),
-            batch_results_(params.dataset->num_parallel_batches_) {}
+            batch_results_((params.dataset->num_parallel_calls_ +
+                            params.dataset->batch_size_ - 1) /
+                           params.dataset->batch_size_) {
+        for (int i = 0; i < batch_results_.size(); ++i) {
+          batch_results_[i].Initialize(params.dataset->batch_size_);
+        }
+      }
 
       ~Iterator() override {
-        // TODO(mrry): Replace this cancellation logic with a
-        // CancellationManager. The syntax would be more heavyweight,
-        // but it would be possible to thread a cancellation manager
-        // through the IteratorContext to upstream,
-        // potentially-blocking iterators, when we add these.
         mutex_lock l(mu_);
-        if (current_batch_index_ != -1) {
-          for (size_t batch_index = 0;
-               batch_index < dataset()->num_parallel_batches_; ++batch_index) {
-            int64 num_elements;
-            WaitForBatch(batch_index, &num_elements).IgnoreError();
-            // Deallocate tensors allocated for the output.
-            batch_results_[batch_index].output.clear();
-          }
+        // Cancel the runner thread.
+        cancelled_ = true;
+        cond_var_.notify_all();
+        // Wait for all in-flight calls to complete.
+        while (num_calls_ > 0) {
+          cond_var_.wait(l);
         }
       }
 
-      // TODO(jsimsa): Implement and profile the following alternative design:
-      //
-      // 0. Set the number of in-flight batches and invocations independently
-      // (though obviously the max number of in-flight invocations must be <
-      // batch_size * num_parallel_batches). Maintain a current producing batch
-      // index and offset.
-      // 1. Issue invocations in order of batch and offset, as you do currently.
-      // 2. When an invocation finishes, increment the current producing batch
-      // and offset. If that invocation would start a new batch and give more
-      // than num_parallel_batches in-flight, block; else start the new
-      // invocation into that location.
-      // 3. When a GetNext() call arrives, block until there's a full batch.
-      // Before returning the batch, if the number of pending invocations is
-      // less than the max, issue that number of invocations.
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
-
-        // One-time initialization.
-        if (current_batch_index_ == -1) {
-          current_batch_index_ = 0;
-          for (size_t i = 0; i < dataset()->num_parallel_batches_; ++i) {
-            StartInvocationBatch(ctx, i);
-          }
-        }
-
-        int64 num_elements = 0;
-        Status status = WaitForBatch(current_batch_index_, &num_elements);
-        if (num_elements == 0) {
-          *end_of_sequence = true;
-          return Status::OK();
-        }
-        if (!status.ok()) {
-          // Deallocate tensors allocated for the output.
-          batch_results_[current_batch_index_].output.clear();
-        } else {
-          if (num_elements < dataset()->batch_size_) {
-            if (dataset()->drop_remainder_) {
-              // Deallocate tensors allocated for the output.
-              batch_results_[current_batch_index_].output.clear();
-              *end_of_sequence = true;
-              return Status::OK();
-            }
-            const std::vector<Tensor>& output =
-                batch_results_[current_batch_index_].output;
-            for (size_t i = 0; i < output.size(); ++i) {
-              TensorShape component_shape(
-                  batch_results_[current_batch_index_].output[i].shape());
-              component_shape.set_dim(0, num_elements);
-              AllocatorAttributes attr;
-              attr.set_gpu_compatible(true);
-              Tensor component(ctx->allocator(attr), output[i].dtype(),
-                               component_shape);
-              TF_RETURN_IF_ERROR(
-                  CopyPartialBatch(&component, output[i], num_elements));
-              out_tensors->emplace_back(std::move(component));
-            }
-            // Deallocate tensors allocated for the output.
-            batch_results_[current_batch_index_].output.clear();
-          } else {
-            *out_tensors =
-                std::move(batch_results_[current_batch_index_].output);
-          }
-          *end_of_sequence = false;
-        }
-        StartInvocationBatch(ctx, current_batch_index_);
-        current_batch_index_ =
-            (current_batch_index_ + 1) % dataset()->num_parallel_batches_;
-        return status;
+        EnsureRunnerThreadStarted(ctx);
+        BatchResult* result = &batch_results_[ComputeIndex(input_batch_)];
+        WaitForBatch(result, &l);
+        return ProcessBatch(ctx, result, out_tensors, end_of_sequence);
       }
 
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        if (current_batch_index_ == -1) {
-          // Iterator has not been used. Nothing to save.
-          return Status::OK();
+        // Wait for all in-flight calls to complete.
+        while (num_calls_ > 0) {
+          cond_var_.wait(l);
         }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_batch_index"),
-                                               current_batch_index_));
+        CHECK_EQ(num_calls_, 0);
         TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
-        // Wait for the map_fn dispatches made in `InvokeFunctionLocked` to
-        // finish. This may delay saving a checkpoint by a bit but keeps the
-        // code clean and also saves us from checkpointing the state of the
-        // `BlockingCounter`.
-        std::vector<int64> num_elements(batch_results_.size());
-        for (size_t i = 0; i < batch_results_.size(); i++) {
-          WaitForBatch(i, &num_elements[i]).IgnoreError();
-        }
-
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name("invocation_results_size"), invocation_results_.size()));
-        for (size_t i = 0; i < invocation_results_.size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteInvocationResultLocked(writer, i));
-        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("call_counter"), call_counter_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("input_batch"), input_batch_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("output_batch"), output_batch_));
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("batch_results_size"),
                                                batch_results_.size()));
         for (size_t i = 0; i < batch_results_.size(); ++i) {
-          TF_RETURN_IF_ERROR(
-              WriteBatchResultLocked(writer, i, num_elements[i]));
+          TF_RETURN_IF_ERROR(WriteBatchResult(writer, i));
         }
         return Status::OK();
       }
@@ -298,70 +244,136 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        if (!reader->Contains(full_name("current_batch_index"))) {
-          // Iterator was never used so nothing to restore.
-          return Status::OK();
-        }
-        {
-          int64 temp;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(full_name("current_batch_index"), &temp));
-          current_batch_index_ = static_cast<int32>(temp);
-          if (current_batch_index_ != temp) {
-            return errors::Internal("Invalid value for current_batch_index ",
-                                    temp);
-          }
-        }
         TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
-        size_t invocation_results_size;
-        {
-          int64 temp;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(full_name("invocation_results_size"), &temp));
-          invocation_results_size = static_cast<size_t>(temp);
-          if (invocation_results_size != temp) {
-            return errors::Internal(
-                "Invalid value for invocation_results_size ", temp);
-          }
-        }
-        CHECK_EQ(invocation_results_.size(), invocation_results_size);
-        for (size_t i = 0; i < invocation_results_size; ++i) {
-          TF_RETURN_IF_ERROR(ReadInvocationResultLocked(reader, i));
-        }
-        size_t batch_results_size;
-        {
-          int64 temp;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(full_name("batch_results_size"), &temp));
-          batch_results_size = static_cast<size_t>(temp);
-          if (batch_results_size != temp) {
-            return errors::Internal("Invalid value for batch_results_size ",
-                                    temp);
-          }
-        }
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("call_counter"), &call_counter_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("input_batch"), &input_batch_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("output_batch"), &output_batch_));
+        int64 batch_results_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("batch_results_size"),
+                                              &batch_results_size));
         CHECK_EQ(batch_results_.size(), batch_results_size);
-        for (size_t i = 0; i < batch_results_size; ++i) {
-          TF_RETURN_IF_ERROR(ReadBatchResultLocked(ctx, reader, i));
+        for (int i = 0; i < batch_results_size; ++i) {
+          TF_RETURN_IF_ERROR(ReadBatchResult(ctx, reader, i));
         }
         return Status::OK();
       }
 
      private:
       struct BatchResult {
-        mutex mu ACQUIRED_AFTER(mu_);
-        bool output_allocated GUARDED_BY(mu);
+        mutex mu;
+        bool end_of_input GUARDED_BY(mu);
+        int64 num_elements GUARDED_BY(mu);
         std::vector<Tensor> output;
-        std::unique_ptr<BlockingCounter> counter;
+        bool output_allocated GUARDED_BY(mu);
+        Status status GUARDED_BY(mu);
+        // Used for coordination between the main thread and the callback
+        // threads. In particular, the main thread will wait for the value
+        // of `num_calls` to reach zero before processing the batch result.
+        condition_variable cond_var;  // access guarded by owner's mutex
+        // Counts the number of outstanding calls for this batch.
+        int64 num_calls;  // access guarded by owner's mutex
+
+        void Initialize(int64 batch_size) {
+          mutex_lock l(mu);
+          end_of_input = false;
+          num_calls = batch_size;
+          num_elements = 0;
+          output_allocated = false;
+          status = Status::OK();
+        }
+
+        void UpdateStatus(const Status& s) {
+          mutex_lock l(mu);
+          status.Update(s);
+        }
       };
 
-      struct InvocationResult {
-        Status status;
+      void Callback(const std::shared_ptr<IteratorContext>& ctx,
+                    BatchResult* result, std::vector<Tensor>* return_values,
+                    int64 offset, const Status& status) {
+        std::unique_ptr<std::vector<Tensor>> cleanup_retvals(return_values);
+        result->UpdateStatus(status);
+        if (status.ok()) {
+          EnsureOutputAllocated(ctx, result, return_values);
+          for (size_t i = 0; i < return_values->size(); ++i) {
+            const Tensor& tensor = return_values->at(i);
+            Tensor* batch = &(result->output)[i];
+            if (tensor.NumElements() !=
+                (batch->NumElements() / batch->dim_size(0))) {
+              TensorShape batch_shape = batch->shape();
+              batch_shape.RemoveDim(0);
+              result->UpdateStatus(errors::InvalidArgument(
+                  "Cannot add tensor to the batch: number of elements does not "
+                  "match. Shapes are: [tensor]: ",
+                  tensor.shape().DebugString(),
+                  ", [batch]: ", batch_shape.DebugString()));
+              break;
+            }
+            // TODO(mrry): Add a version of DoParallelConcat that allows us to
+            // move `tensor` where possible, to speed up string tensor batching.
+            Status copy_status = ::tensorflow::functor::DoParallelConcat(
+                *dataset()->device_, tensor, offset, batch);
+            if (!copy_status.ok()) {
+              result->UpdateStatus(copy_status);
+              break;
+            }
+          }
+        }
+        {
+          mutex_lock l(result->mu);
+          result->num_elements++;
+        }
+        {
+          mutex_lock l(mu_);
+          CallCompleted(result);
+        }
+      }
+
+      void CallCompleted(BatchResult* result) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        num_calls_--;
+        cond_var_.notify_all();
+        result->num_calls--;
+        result->cond_var.notify_all();
+      }
+
+      void CallFunction(std::shared_ptr<IteratorContext> ctx,
+                        BatchResult* result, int64 offset) {
+        // Get the next input element.
+        std::vector<Tensor> input_element;
         bool end_of_input;
-        std::vector<Tensor> return_values;
-      };
+        Status status =
+            input_impl_->GetNext(ctx.get(), &input_element, &end_of_input);
+        {
+          mutex_lock l(mu_);
+          mutex_lock l2(result->mu);
+          result->end_of_input = result->end_of_input || end_of_input;
+          result->status.Update(status);
+          if (result->end_of_input || !result->status.ok()) {
+            CallCompleted(result);
+            return;
+          }
+        }
 
-      int64 ComputeInvocationIndex(int64 batch_index, int64 offset) {
-        return batch_index * dataset()->batch_size_ + offset;
+        // Call `captured_func_(input_element)`, using `Callback` to store the
+        // result in `result`.
+        (*ctx->runner())(std::bind(
+            [this, result, offset](std::shared_ptr<IteratorContext> ctx,
+                                   std::vector<Tensor> input_element) {
+              std::vector<Tensor>* return_values = new std::vector<Tensor>();
+              dataset()->captured_func_->RunAsync(
+                  ctx.get(), std::move(input_element), return_values,
+                  [this, ctx, result, return_values, offset](Status status) {
+                    Callback(ctx, result, return_values, offset, status);
+                  });
+            },
+            ctx, std::move(input_element)));
+      }
+
+      int64 ComputeIndex(int64 n) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        return n % batch_results_.size();
       }
 
       Status CopyPartialBatch(Tensor* output, const Tensor& value,
@@ -387,253 +399,140 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      void EnsureOutputAllocated(IteratorContext* ctx,
-                                 BatchResult* batch_result,
-                                 const std::vector<Tensor>& return_values) {
-        mutex_lock l(batch_result->mu);
-        if (batch_result->output_allocated) {
+      void EnsureRunnerThreadStarted(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (!runner_thread_) {
+          std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
+          runner_thread_.reset(ctx->env()->StartThread(
+              {}, "runner_thread",
+              std::bind(&Iterator::RunnerThread, this, ctx_copy)));
+        }
+      }
+
+      void EnsureOutputAllocated(const std::shared_ptr<IteratorContext>& ctx,
+                                 BatchResult* result,
+                                 const std::vector<Tensor>* return_values) {
+        mutex_lock l(result->mu);
+        if (result->output_allocated) {
           return;
         }
-        const size_t num_components = return_values.size();
+        const size_t num_components = return_values->size();
         for (size_t i = 0; i < num_components; ++i) {
           TensorShape component_shape({dataset()->batch_size_});
-          component_shape.AppendShape(return_values[i].shape());
+          component_shape.AppendShape(return_values->at(i).shape());
           AllocatorAttributes attr;
           attr.set_gpu_compatible(true);
-          Tensor component(ctx->allocator(attr), return_values[i].dtype(),
+          Tensor component(ctx->allocator(attr), return_values->at(i).dtype(),
                            component_shape);
-          batch_result->output.emplace_back(std::move(component));
+          result->output.emplace_back(std::move(component));
         }
-        batch_result->output_allocated = true;
+        result->output_allocated = true;
       }
 
-      void InvokeFunctionLocked(IteratorContext* ctx, int64 batch_index,
-                                int64 offset) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        size_t index = ComputeInvocationIndex(batch_index, offset);
-        InvocationResult* result = &invocation_results_[index];
-        BatchResult* batch_result = &batch_results_[batch_index];
-
-        // Get the next input element.
-        std::vector<Tensor> input_element;
-        result->status =
-            input_impl_->GetNext(ctx, &input_element, &result->end_of_input);
-        if (result->end_of_input || !result->status.ok()) {
-          batch_result->counter->DecrementCount();
-          return;
+      Status ProcessBatch(IteratorContext* ctx, BatchResult* result,
+                          std::vector<Tensor>* out_tensors,
+                          bool* end_of_sequence) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        auto cleanup =
+            gtl::MakeCleanup([this, result]() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+              result->Initialize(dataset()->batch_size_);
+              input_batch_++;
+              cond_var_.notify_all();
+            });
+        mutex_lock l(result->mu);
+        if (result->num_elements == 0) {
+          *end_of_sequence = true;
+          return Status::OK();
         }
 
-        // Call `captured_func_(input_element)`, store the result in
-        // `result->return_values`, and notify `batch_result->counter`
-        // to unblock a consumer.
-        (*ctx->runner())(std::bind(
-            [this, result, batch_result, offset](
-                IteratorContext* ctx, std::vector<Tensor> input_element) {
-              dataset()->captured_func_->RunAsync(
-                  ctx, std::move(input_element), &result->return_values,
-                  [this, ctx, result, batch_result, offset](Status ret_status) {
-                    result->status.Update(ret_status);
-                    if (ret_status.ok()) {
-                      EnsureOutputAllocated(ctx, batch_result,
-                                            result->return_values);
-                      const size_t num_components =
-                          result->return_values.size();
-                      for (size_t i = 0; i < num_components; ++i) {
-                        const Tensor& tensor = result->return_values[i];
-                        Tensor* batch = &(batch_result->output)[i];
-                        if (tensor.NumElements() !=
-                            (batch->NumElements() / batch->dim_size(0))) {
-                          TensorShape batch_shape = batch->shape();
-                          batch_shape.RemoveDim(0);
-                          result->status.Update(errors::InvalidArgument(
-                              "Cannot add tensor to the batch: number of "
-                              "elements does not match. Shapes are: [tensor]: ",
-                              tensor.shape().DebugString(),
-                              ", [batch]: ", batch_shape.DebugString()));
-                          break;
-                        }
-                        // TODO(mrry): Add a version of DoParallelConcat that
-                        // allows us to move `tensor` where possible, to speed
-                        // up string tensor batching.
-                        Status copy_status =
-                            ::tensorflow::functor::DoParallelConcat(
-                                *dataset()->device_, tensor, offset, batch);
-                        if (!copy_status.ok()) {
-                          result->status.Update(copy_status);
-                          break;
-                        }
-                      }
-                    }
-                    delete ctx;
-                    // NOTE(mrry): We clear the return values here to release
-                    // any memory associated with them and to paralellize the
-                    // destruction of the tensors (which can be surprisingly
-                    // expensive for map functions with large numbers of return
-                    // values).
-                    result->return_values.clear();
-                    batch_result->counter->DecrementCount();
-                  });
-            },
-            new IteratorContext(*ctx), std::move(input_element)));
-      }
-
-      void StartInvocationBatch(IteratorContext* ctx, int64 batch_index)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        tracing::ScopedActivity activity(strings::StrCat(prefix(), "::Start"));
-        // Initialize batch result.
-        {
-          mutex_lock l(batch_results_[batch_index].mu);
-          batch_results_[batch_index].output_allocated = false;
-          batch_results_[batch_index].counter.reset(
-              new BlockingCounter(dataset()->batch_size_));
-        }
-        // Initialize invocation results.
-        for (size_t i = 0; i < dataset()->batch_size_; ++i) {
-          size_t index = ComputeInvocationIndex(batch_index, i);
-          InvocationResult* result = &invocation_results_[index];
-          // Reset the state of `result`; `result->return_values` was cleared
-          // when the previous invocation completed.
-          result->end_of_input = false;
-          result->status = Status::OK();
-        }
-        // Start individual invocations.
-        for (size_t i = 0; i < dataset()->batch_size_; ++i) {
-          InvokeFunctionLocked(ctx, batch_index, i);
+        if (!result->status.ok()) {
+          // Deallocate tensors allocated for the output.
+          result->output.clear();
+        } else {
+          if (result->num_elements < dataset()->batch_size_) {
+            if (dataset()->drop_remainder_) {
+              // Deallocate tensors allocated for the output.
+              result->output.clear();
+              *end_of_sequence = true;
+              return Status::OK();
+            }
+            const std::vector<Tensor>& output = result->output;
+            for (size_t i = 0; i < output.size(); ++i) {
+              TensorShape component_shape(result->output[i].shape());
+              component_shape.set_dim(0, result->num_elements);
+              AllocatorAttributes attr;
+              attr.set_gpu_compatible(true);
+              Tensor component(ctx->allocator(attr), output[i].dtype(),
+                               component_shape);
+              TF_RETURN_IF_ERROR(CopyPartialBatch(&component, output[i],
+                                                  result->num_elements));
+              out_tensors->emplace_back(std::move(component));
+            }
+            // Deallocate tensors allocated for the output.
+            result->output.clear();
+          } else {
+            *out_tensors = std::move(result->output);
+          }
+          *end_of_sequence = false;
         }
+        return result->status;
       }
 
-      Status WaitForBatch(int64 batch_index, int64* num_elements)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        tracing::ScopedActivity activity(strings::StrCat(prefix(), "::Wait"));
-        batch_results_[batch_index].counter->Wait();
-        Status status = Status::OK();
-        for (size_t i = 0; i < dataset()->batch_size_; ++i, ++*num_elements) {
-          size_t index = ComputeInvocationIndex(batch_index, i);
-          InvocationResult* result = &invocation_results_[index];
-          if (result->end_of_input) {
-            VLOG(3) << "end of input encountered at element[" << i << "]: ";
-            return Status::OK();
-          }
-          if (!result->status.ok()) {
-            VLOG(3) << "failed to process element[" << i
-                    << "]: " << result->status;
-            status.Update(result->status);
+      void RunnerThread(const std::shared_ptr<IteratorContext>& ctx) {
+        mutex_lock l(mu_);
+        while (true) {
+          while (!cancelled_ &&
+                 (num_calls_ == dataset()->num_parallel_calls_ ||
+                  (output_batch_ - input_batch_ == batch_results_.size()))) {
+            cond_var_.wait(l);
           }
-        }
-        return status;
-      }
 
-      Status WriteInvocationResultLocked(IteratorStateWriter* writer,
-                                         size_t index)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        const InvocationResult& result = invocation_results_[index];
-        string prefix = strings::StrCat("invocation_results_", index);
-        TF_RETURN_IF_ERROR(WriteStatusLocked(
-            writer, full_name(strings::StrCat(prefix, "_status")),
-            result.status));
-        if (result.end_of_input) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_end_of_input")), ""));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_return_values_size")),
-            result.return_values.size()));
-        for (size_t i = 0; i < result.return_values.size(); i++) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              full_name(strings::StrCat(prefix, "_return_values_", i)),
-              result.return_values[i]));
-        }
-        return Status::OK();
-      }
+          if (cancelled_) {
+            return;
+          }
 
-      Status ReadInvocationResultLocked(IteratorStateReader* reader,
-                                        size_t index)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        InvocationResult* result = &invocation_results_[index];
-        string prefix = strings::StrCat("invocation_results_", index);
-        TF_RETURN_IF_ERROR(ReadStatusLocked(
-            reader, full_name(strings::StrCat(prefix, "_status")),
-            &result->status));
-        result->end_of_input = reader->Contains(
-            full_name(strings::StrCat(prefix, "_end_of_input")));
-        size_t return_values_size;
-        {
-          int64 temp;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(
-              full_name(strings::StrCat(prefix, "_return_values_size")),
-              &temp));
-          return_values_size = static_cast<size_t>(temp);
-          if (temp != return_values_size) {
-            return errors::Internal("Invalid value for return_values_size ",
-                                    return_values_size);
+          while (num_calls_ < dataset()->num_parallel_calls_ &&
+                 (output_batch_ - input_batch_ < batch_results_.size())) {
+            BatchResult* result = &batch_results_[ComputeIndex(output_batch_)];
+            int64 offset = call_counter_++ % dataset()->batch_size_;
+            num_calls_++;
+            mu_.unlock();
+            CallFunction(ctx, result, offset);
+            mu_.lock();
+            if (offset + 1 == dataset()->batch_size_) {
+              // Done scheduling calls for the current batch.
+              output_batch_++;
+            }
           }
         }
-        result->return_values.reserve(return_values_size);
-        for (size_t i = 0; i < return_values_size; i++) {
-          result->return_values.emplace_back();
-          TF_RETURN_IF_ERROR(reader->ReadTensor(
-              full_name(strings::StrCat(prefix, "_return_values_", i)),
-              &result->return_values.back()));
-        }
-        return Status::OK();
       }
 
-      Status WriteBatchResultLocked(IteratorStateWriter* writer, size_t index,
-                                    int64 num_elements)
+      void WaitForBatch(BatchResult* result, mutex_lock* l)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        const BatchResult& result = batch_results_[index];
-        string prefix = strings::StrCat("batch_results_", index);
-        {
-          mutex_lock l(batch_results_[index].mu);
-          if (result.output_allocated) {
-            TF_RETURN_IF_ERROR(writer->WriteScalar(
-                full_name(strings::StrCat(prefix, "_output_allocated")), ""));
-          }
+        while (result->num_calls > 0) {
+          result->cond_var.wait(*l);
         }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_output_size")),
-            result.output.size()));
-        for (size_t i = 0; i < result.output.size(); i++) {
-          // If the batch is not full, we only store the first
-          // `num_elements` values. The rest of the batch tensor is
-          // *uninitialized* and accessing that will raise msan errors.
-          if (num_elements < dataset()->batch_size_) {
-            TF_RETURN_IF_ERROR(writer->WriteTensor(
-                full_name(strings::StrCat(prefix, "_output_", i)),
-                result.output[i].Slice(0, num_elements)));
-          } else {
-            TF_RETURN_IF_ERROR(writer->WriteTensor(
-                full_name(strings::StrCat(prefix, "_output_", i)),
-                result.output[i]));
-          }
-        }
-        return Status::OK();
       }
 
-      Status ReadBatchResultLocked(IteratorContext* ctx,
-                                   IteratorStateReader* reader, size_t index)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      Status ReadBatchResult(IteratorContext* ctx, IteratorStateReader* reader,
+                             size_t index) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         BatchResult* result = &batch_results_[index];
         string prefix = strings::StrCat("batch_results_", index);
-        {
-          mutex_lock l(batch_results_[index].mu);
-          result->output_allocated = reader->Contains(
-              full_name(strings::StrCat(prefix, "_output_allocated")));
-          // Simulate that the batch was fully generated.
-          batch_results_[index].counter.reset(new BlockingCounter(0));
-        }
-        size_t output_size;
-        {
-          int64 temp;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(
-              full_name(strings::StrCat(prefix, "_output_size")), &temp));
-          output_size = static_cast<size_t>(temp);
-          if (temp != output_size) {
-            return errors::Internal("Invalid value for output_size ",
-                                    output_size);
-          }
-        }
+        mutex_lock l(result->mu);
+        result->end_of_input = reader->Contains(
+            full_name(strings::StrCat(prefix, "_end_of_input")));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name(strings::StrCat(prefix, "_num_calls")),
+                               &result->num_calls));
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_num_elements")),
+            &result->num_elements));
+        result->output_allocated = reader->Contains(
+            full_name(strings::StrCat(prefix, "_output_allocated")));
+        int64 output_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_output_size")), &output_size));
         result->output.reserve(output_size);
-        for (size_t i = 0; i < output_size; i++) {
+        for (int i = 0; i < output_size; i++) {
           Tensor t;
           TF_RETURN_IF_ERROR(reader->ReadTensor(
               full_name(strings::StrCat(prefix, "_output_", i)), &t));
@@ -653,25 +552,13 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
             result->output.emplace_back(std::move(t));
           }
         }
+        TF_RETURN_IF_ERROR(ReadStatus(
+            reader, strings::StrCat(prefix, "_status"), &result->status));
         return Status::OK();
       }
 
-      Status WriteStatusLocked(IteratorStateWriter* writer,
-                               const string& prefix, const Status& status)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
-                                static_cast<int64>(status.code())));
-        if (!status.ok()) {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
-                                  status.error_message()));
-        }
-        return Status::OK();
-      }
-
-      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
-                              Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      Status ReadStatus(IteratorStateReader* reader, const string& prefix,
+                        Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         int64 code_int;
         TF_RETURN_IF_ERROR(reader->ReadScalar(
             full_name(strings::StrCat(prefix, "_code")), &code_int));
@@ -687,17 +574,89 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
         return Status::OK();
       }
+
+      Status WriteBatchResult(IteratorStateWriter* writer, size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        BatchResult* result = &batch_results_[index];
+        string prefix = strings::StrCat("batch_results_", index);
+        mutex_lock l(result->mu);
+        if (result->end_of_input) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_end_of_input")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_num_calls")),
+            result->num_calls));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_num_elements")),
+            result->num_elements));
+        if (result->output_allocated) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_output_allocated")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_output_size")),
+            result->output.size()));
+        for (int i = 0; i < result->output.size(); i++) {
+          // If the batch is not full, we only store the first `num_elements`
+          // values. The rest of the batch tensor is *uninitialized* and
+          // accessing that will raise msan errors.
+          if (result->num_elements < dataset()->batch_size_) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(prefix, "_output_", i)),
+                result->output[i].Slice(0, result->num_elements)));
+          } else {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(prefix, "_output_", i)),
+                result->output[i]));
+          }
+        }
+        TF_RETURN_IF_ERROR(WriteStatus(
+            writer, strings::StrCat(prefix, "_status"), result->status));
+        return Status::OK();
+      }
+
+      Status WriteStatus(IteratorStateWriter* writer, const string& prefix,
+                         const Status& status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
+                                static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
+                                  status.error_message()));
+        }
+        return Status::OK();
+      }
+
       mutex mu_;
-      int32 current_batch_index_ GUARDED_BY(mu_) = -1;
-      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      std::vector<InvocationResult> invocation_results_ GUARDED_BY(mu_);
+      // Used for coordination between the main thread, the runner thread, and
+      // the callback threads. In particular, the runner thread should only
+      // schedule new calls when the number of in-flight calls is less than the
+      // user specified level of parallelism and there are slots available in
+      // the `batch_results_` buffer.
+      condition_variable cond_var_;
+      // Counts the number of outstanding calls for this batch.
+      int64 num_calls_ GUARDED_BY(mu_) = 0;
+      // Counts the total number of calls.
+      int64 call_counter_ GUARDED_BY(mu_) = 0;
+      const std::unique_ptr<IteratorBase> input_impl_;
+      // Identifies the next batch to be read by the caller.
+      int64 input_batch_ GUARDED_BY(mu_) = 0;
+      // Identifies the next batch to create.
+      int64 output_batch_ GUARDED_BY(mu_) = 0;
+      // Circular buffer for storing the (intermediate) batch results. When
+      // using `input_batch_` and `output_batch_` to index into the buffer,
+      // their value should be interpreted modulo the size of the buffer.
       std::vector<BatchResult> batch_results_ GUARDED_BY(mu_);
+      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(mu_);
+      bool cancelled_ GUARDED_BY(mu_) = false;
     };
 
     const DatasetBase* const input_;
     const NameAttrList func_;
     const int64 batch_size_;
-    const int64 num_parallel_batches_;
+    const int64 num_parallel_calls_;
     const bool drop_remainder_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
@@ -707,6 +666,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
   };
 
   const int graph_def_version_;
+  const int op_version_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
@@ -715,6 +675,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 REGISTER_KERNEL_BUILDER(Name("MapAndBatchDataset").Device(DEVICE_CPU),
                         MapAndBatchDatasetOp);
 
+REGISTER_KERNEL_BUILDER(Name("MapAndBatchDatasetV2").Device(DEVICE_CPU),
+                        MapAndBatchDatasetOp);
+
 }  // namespace
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index cfb4efda9a56fde04994201f509cf3d9fb45ea82..e41800a8069ed5a5432395184e0ab3eb713c0523 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
-#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/data/slide_dataset_op.cc b/tensorflow/core/kernels/data/slide_dataset_op.cc
index 4f3537b6912bffb1c76d8a315790b4e5585a02d7..78c8363f91a7efcf9ba3355aa4e6b21d1b5eeff7 100644
--- a/tensorflow/core/kernels/data/slide_dataset_op.cc
+++ b/tensorflow/core/kernels/data/slide_dataset_op.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/data/sql_dataset_ops.cc b/tensorflow/core/kernels/data/sql_dataset_ops.cc
index d50e9c9cf9739044379c7bbe753fc4acc2de311e..634b3c280fedabfca1b69c32a99034a8c198470c 100644
--- a/tensorflow/core/kernels/data/sql_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/sql_dataset_ops.cc
@@ -70,17 +70,19 @@ class SqlDatasetOp : public DatasetOpKernel {
                     "The set of supported databases is: {'sqlite'}.",
                     driver_name.c_str())));
 
-    *output = new Dataset(driver_name, data_source_name, query, output_types_,
-                          output_shapes_);
+    *output = new Dataset(ctx, driver_name, data_source_name, query,
+                          output_types_, output_shapes_);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const string& driver_name, const string& data_source_name,
-            const string& query, const DataTypeVector& output_types,
+    Dataset(OpKernelContext* ctx, const string& driver_name,
+            const string& data_source_name, const string& query,
+            const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : driver_name_(driver_name),
+        : GraphDatasetBase(ctx),
+          driver_name_(driver_name),
           data_source_name_(data_source_name),
           query_(query),
           output_types_(output_types),
@@ -102,6 +104,21 @@ class SqlDatasetOp : public DatasetOpKernel {
 
     string DebugString() override { return "SqlDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* driver_name_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(driver_name_, &driver_name_node));
+      Node* data_source_name_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(data_source_name_, &data_source_name_node));
+      Node* query_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(query_, &query_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {driver_name_node, data_source_name_node, query_node}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -121,22 +138,62 @@ class SqlDatasetOp : public DatasetOpKernel {
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
         if (!query_connection_initialized_) {
-          query_connection_initialized_ = true;
-          query_connection_ = sql::DriverManager::CreateQueryConnection(
-              dataset()->driver_name_);
-          Status s = query_connection_->Open(dataset()->data_source_name_,
-                                             dataset()->query_,
-                                             dataset()->output_types_);
-          if (!s.ok()) {
-            LOG(WARNING) << "Failed to connect to database: " << s;
-            return s;
-          }
+          TF_RETURN_IF_ERROR(InitializeQueryConnection());
         }
+        next_calls_++;
         return query_connection_->GetNext(ctx, out_tensors, end_of_sequence);
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (query_connection_initialized_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("next_calls"), next_calls_));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("next_calls"))) {
+          TF_RETURN_IF_ERROR(InitializeQueryConnection());
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("next_calls"), &next_calls_));
+          int64 rem_next_calls = next_calls_;
+          std::vector<Tensor> out_tensors;
+          bool end_of_sequence = false;
+          while (rem_next_calls--) {
+            TF_RETURN_IF_ERROR(query_connection_->GetNext(ctx, &out_tensors,
+                                                          &end_of_sequence));
+            out_tensors.clear();
+          }
+        } else {
+          query_connection_initialized_ = false;
+        }
+        return Status::OK();
+      }
+
      private:
+      Status InitializeQueryConnection() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        query_connection_initialized_ = true;
+        query_connection_ =
+            sql::DriverManager::CreateQueryConnection(dataset()->driver_name_);
+        Status s = query_connection_->Open(dataset()->data_source_name_,
+                                           dataset()->query_,
+                                           dataset()->output_types_);
+        next_calls_ = 0;
+        if (!s.ok()) {
+          LOG(WARNING) << "Failed to connect to database: " << s;
+          return s;
+        }
+        return Status::OK();
+      }
+
       mutex mu_;
+      // TODO(shivaniagrawal): explore ways to seek into a SQLite databases.
+      int64 next_calls_ GUARDED_BY(mu_) = 0;
       std::unique_ptr<sql::QueryConnection> query_connection_ GUARDED_BY(mu_);
       bool query_connection_initialized_ GUARDED_BY(mu_) = false;
     };
diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
index ff412a4671bd0307e4975027ebd1e098353de238..e271a42b2ac679e837d714567e9f26aa23b6c1a2 100644
--- a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index d5be4c778074e406122dc3a1a9c23681fca491d0..95708cc01ce6b63275f0c9c562d4f8c4782af5f4 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
index 241b615aca11ccdbf015bcab6634a243ba086fea..2b383e50977a306e4512b3693568b9d88e38f0d1 100644
--- a/tensorflow/core/kernels/data/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/decode_proto_op.cc b/tensorflow/core/kernels/decode_proto_op.cc
index 24f8a4f72fd55167a61de9b8e0f8808b93584cf9..6d3dcc1c59bbc06a62d14f8640903750586f0360 100644
--- a/tensorflow/core/kernels/decode_proto_op.cc
+++ b/tensorflow/core/kernels/decode_proto_op.cc
@@ -287,8 +287,7 @@ struct FieldInfo {
 // It is more complex and provides better motivation for the API here.
 class CountCollector {
  public:
-  // Default constructor allows the collector to be a vector element.
-  CountCollector() = default;
+  CountCollector() = delete;
 
   // The count may be stored inside an Eigen Tensor to eliminate copying.
   explicit CountCollector(int32* count) : count_ptr_(count) {}
@@ -517,8 +516,7 @@ class CountCollector {
 // the user requests it.
 class DenseCollector {
  public:
-  // Default constructor allows the collector to be a vector element.
-  DenseCollector() = default;
+  DenseCollector() = delete;
 
   // A DenseCollector applies to one field of a serialized message.
   // Note that default_value.dtype is the type of the output tensor.
diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc
index 829155fb313bd354d28432be6212af0760630c44..014684de642764c386d39b9af7e0289fef66e0b1 100644
--- a/tensorflow/core/kernels/deep_conv2d.cc
+++ b/tensorflow/core/kernels/deep_conv2d.cc
@@ -393,9 +393,8 @@ struct TransformFilters {
 
     // Calculate filter transform batch based on cache/filter sizes.
 
-    // Cache budget (based on L2 cache size = 256KB).
-    // TODO(andydavis) Read cache size from system.
-    const int64 cache_size = (256LL << 10) / sizeof(T);
+    // Cache budget (based on L2 cache size).
+    const int64 cache_size = Eigen::l2CacheSize() / sizeof(T);
 
     // Fixed cost.
     const int64 filter_transform_matrix_size =
@@ -1017,9 +1016,8 @@ struct DeepConv2D<CPUDevice, T> {
       const int64 filter_shard_size = filter_shards_row * filter_shards_col;
       const int64 out_tile_spatial_size = out_tile_rows * out_tile_cols;
 
-      // Cache budget (based on L2 cache size = 256KB).
-      // TODO(andydavis) Read cache size from the system.
-      const int64 cache_size = (256LL << 10) / sizeof(T);
+      // Cache budget (based on L2 cache size).
+      const int64 cache_size = Eigen::l2CacheSize() / sizeof(T);
 
       // Fixed costs.
       const int64 tile_transform_matrix_size =
diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
index 9a3b2303a3bf6718009b5055c4ef25464ec01136..17a85d97736c7e01f5cd19c67215567ba67ed1e4 100644
--- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
@@ -57,6 +57,7 @@ struct DenseUpdate<GPUDevice, T, SUB> {
   template struct functor::DenseUpdate<GPUDevice, T, ADD>; \
   template struct functor::DenseUpdate<GPUDevice, T, SUB>;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+TF_CALL_int32(DEFINE_GPU_KERNELS);
 TF_CALL_int64(DEFINE_GPU_KERNELS);
 #undef DEFINE_GPU_KERNELS
 
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index 7afa21acb919e148a14f9a83c002a862c228ba19..da3bdb475e274d73751e22334628e3431023b9e4 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -564,7 +564,7 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
 
     // For in_depth == 1 and grouped convolutions.
-    use_cudnn_ = CanUseCudnn();
+    use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
     cudnn_use_autotune_ = CudnnUseAutotune();
     use_cudnn_grouped_conv_ = false;
     dtype_ = DataTypeToEnum<T>::value;
@@ -1037,7 +1037,7 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
 
     // For in_depth == 1 and grouped convolutions.
-    use_cudnn_ = CanUseCudnn();
+    use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
     cudnn_use_autotune_ = CudnnUseAutotune();
     use_cudnn_grouped_conv_ = false;
 
@@ -1076,7 +1076,7 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
                                 {1}, 0, filter_shape, &filter_backprop));
 
     // If there is nothing to compute, return.
-    if (filter_shape.num_elements() == 0) {
+    if (out_backprop.shape().num_elements() == 0) {
       return;
     }
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index d5f4a68120aabb3bf95addaabf0dd05f5c059770..f0902fdba6921b46fd7a0d0adb16e470ed83f65c 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -290,7 +290,7 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
 
     // For in_depth == 1 and grouped convolutions.
-    use_cudnn_ = CanUseCudnn();
+    use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
     cudnn_use_autotune_ = CudnnUseAutotune();
     use_cudnn_grouped_conv_ = false;
     dtype_ = DataTypeToEnum<T>::value;
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 0abd64030fbaecb4f04810b0cfdbe1ddfe47f439..5390222b3abf4828646574bcfe78789d1d7c8e62 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -613,8 +613,8 @@ void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& device,
                                             kKnownFilterHeight, kBlockDepth,
                                             kKnownEvenHeight>;
       break;
-    case FORMAT_NCHW_VECT_C:
-      LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported";
+    default:
+      LOG(ERROR) << "FORMAT_" << ToString(data_format) << " is not supported";
       return;
   }
   const int tile_width = args.in_cols + args.filter_cols - 1;
@@ -690,8 +690,8 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& device,
           DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
                                        kKnownDepthMultiplier>;
       break;
-    case FORMAT_NCHW_VECT_C:
-      LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported";
+    default:
+      LOG(ERROR) << "FORMAT_" << ToString(data_format) << " is not supported";
       return;
   }
   const int num_outputs =
@@ -919,8 +919,8 @@ void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& device,
       kernel = DepthwiseConv2dBackpropInputGPUKernelNCHW<
           T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>;
       break;
-    case FORMAT_NCHW_VECT_C:
-      LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported";
+    default:
+      LOG(ERROR) << "FORMAT_" << ToString(data_format) << " is not supported";
       return;
   }
   const int num_in_backprop =
@@ -1559,8 +1559,8 @@ bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
       kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall<
           T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>;
       break;
-    case FORMAT_NCHW_VECT_C:
-      LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported";
+    default:
+      LOG(ERROR) << "FORMAT_" << ToString(data_format) << " is not supported";
       return false;
   }
   const int num_out_backprop = args.out_rows * args.out_cols * block_count;
@@ -1662,8 +1662,8 @@ void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& device,
       kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHW<
           T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>;
       break;
-    case FORMAT_NCHW_VECT_C:
-      LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported";
+    default:
+      LOG(ERROR) << "FORMAT_" << ToString(data_format) << " is not supported";
       return;
   }
   const int num_out_backprop =
diff --git a/tensorflow/core/kernels/dequantize_op.cc b/tensorflow/core/kernels/dequantize_op.cc
index 3f644a61bfc270d86d1b6613b24cd80cf726eff4..42fbf95cd3681ce1237ec289aabeb8242c55b938 100644
--- a/tensorflow/core/kernels/dequantize_op.cc
+++ b/tensorflow/core/kernels/dequantize_op.cc
@@ -96,27 +96,17 @@ class DequantizeOp : public OpKernel {
             output);
       }
     } else if (mode_ == QUANTIZE_MODE_SCALED) {
-      // The quantization logic for mode SCALED matches that of
-      // QuantizeAndDequantizeV2 and QuantizeAndDequantizeV3.
-      static constexpr int num_bits = sizeof(T) * 8;
-      const float max_abs = std::max(std::abs(min_range), std::abs(max_range));
-      bool is_signed = std::is_signed<T>::value;
-      // If it is signed, we try to keep 0.0 being 0 and drop one bucket. For
-      // example, if it is 8 bits, we have the range [-127, 127]. So for input
-      // range of [-x, x], the scale should be 254/(2*x).
-      //
-      // If it is unsigned and num_bits == 8, the range with 8 bits is [0, 255].
-      // If the input range is [0, x], then the scale is x/255 instead of 254 as
-      // in the case above.
-      const int target_bits = is_signed ? (num_bits - 1) : num_bits;
-      const float target_range =
-          static_cast<float>((uint64_t{1} << target_bits) - 1);
-      const float scale_factor = max_abs / target_range;
+      // TODO(pauldonnelly): Update QuantizeAndDequantizeV2 and
+      // QuantizeAndDequantizeV3 to match this SCALED mode again.
+      const float scale_factor =
+          std::numeric_limits<T>::min() == 0
+              ? (max_range / std::numeric_limits<T>::max())
+              : std::max(min_range / std::numeric_limits<T>::min(),
+                         max_range / std::numeric_limits<T>::max());
       float* out_ptr = output->flat<float>().data();
       const T* in_ptr = input.flat<T>().data();
-
       const int64 num_elements = input.NumElements();
-      for (int i = 0; i < num_elements; ++i) {
+      for (int64 i = 0; i < num_elements; ++i) {
         out_ptr[i] = static_cast<int>(in_ptr[i]) * scale_factor;
       }
     }
diff --git a/tensorflow/core/kernels/dequantize_op_test.cc b/tensorflow/core/kernels/dequantize_op_test.cc
index 9938eb61aa4d8196a2b19560addf80a0ef69c598..63b18d72631949c2e2625377b33e80bdfb608082 100644
--- a/tensorflow/core/kernels/dequantize_op_test.cc
+++ b/tensorflow/core/kernels/dequantize_op_test.cc
@@ -127,8 +127,8 @@ TEST_F(DequantizeOpTest, DequantizeMinCombinedQuint16) {
 TEST_F(DequantizeOpTest, DequantizeScaledQuint8Zero) {
   RunDequantizeScaledTest<quint8>(-255.0f, 127.0f, 0, 0.0);
 }
-TEST_F(DequantizeOpTest, DequantizeScaledQuint8ScaleIdentity) {
-  RunDequantizeScaledTest<quint8>(-255.0f, 127.0f, 127, 127.0);
+TEST_F(DequantizeOpTest, DequantizeScaledQuint8CheckIgnoresNegative) {
+  RunDequantizeScaledTest<quint8>(-512.0f, 255.0f, 255, 255.0);
 }
 TEST_F(DequantizeOpTest, DequantizeScaledQuint8ScaleDown) {
   RunDequantizeScaledTest<quint8>(-1.0f, 2.0f, 255, 2.0);
@@ -144,7 +144,7 @@ TEST_F(DequantizeOpTest, DequantizeScaledQint8ScaleIdentity) {
   RunDequantizeScaledTest<qint8>(-10.0f, 127.0f, -127, -127.0);
 }
 TEST_F(DequantizeOpTest, DequantizeScaledQint8ScaleDown) {
-  RunDequantizeScaledTest<qint8>(-2.0f, 1.0f, -127, -2.0);
+  RunDequantizeScaledTest<qint8>(-2.0f, 1.0f, -128, -2.0);
 }
 TEST_F(DequantizeOpTest, DequantizeScaledQint8ScaleUp) {
   RunDequantizeScaledTest<qint8>(-1.0f, 300.0f, 42, 99.212601);
diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc
index 479f7be4b506e4f8721216fb00ea0eff7e0394c2..a23478af5b5ca339878a44249f0732e4cb7fefc0 100644
--- a/tensorflow/core/kernels/fifo_queue.cc
+++ b/tensorflow/core/kernels/fifo_queue.cc
@@ -23,13 +23,13 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/fifo_queue.h"
 #include "tensorflow/core/kernels/queue_base.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 911aa3a78fff2f6f7272e7408388e6625df52037..9ae04a1062fe21eb619b2f967358adae53c1b409 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -15,16 +15,16 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/device_base.h"
+#endif
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
-#if GOOGLE_CUDA
-#include "tensorflow/stream_executor/stream.h"
-#endif  // GOOGLE_CUDA
-
 namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -39,6 +39,21 @@ Status Instantiate(FunctionLibraryRuntime* lib, const NameAttrList& func,
   return lib->Instantiate(func.name(), AttrSlice(&func.attr()), handle);
 }
 
+template <typename To, typename From>  // use like this: down_cast<T*>(foo);
+inline To down_cast(From* f) {         // so we only accept pointers
+  static_assert(
+      (std::is_base_of<From, typename std::remove_pointer<To>::type>::value),
+      "target type not derived from source type");
+
+  // We skip the assert and hence the dynamic_cast if RTTI is disabled.
+#if !defined(__GNUC__) || defined(__GXX_RTTI)
+  // Uses RTTI in dbg and fastbuild. asserts are disabled in opt builds.
+  assert(f == nullptr || dynamic_cast<To>(f) != nullptr);
+#endif  // !defined(__GNUC__) || defined(__GXX_RTTI)
+
+  return static_cast<To>(f);
+}
+
 // If "t" is a scalar of a supported type, returns t != 0 in "*v".
 Status ToBool(gtl::ArraySlice<Tensor> t, bool* v) {
   if (t.size() != 1) {
@@ -279,8 +294,46 @@ class WhileOp : public AsyncOpKernel {
     }
 
     void StartBody() {
+      Status s;
+      if (rets_.size() != 1) {
+        s = errors::InvalidArgument(
+            "Expected a single scalar return value from WhileOp cond, got ",
+            rets_.size(), " tensors.");
+        return Finish(s);
+      }
+      Tensor cond_t;
+#if GOOGLE_CUDA
+      const DeviceBase::GpuDeviceInfo* gpu_device_info =
+          ctx_->device()->tensorflow_gpu_device_info();
+      const bool is_hostmem_dtype =
+          rets_[0].dtype() == DT_INT32 || rets_[0].dtype() == DT_INT64;
+      if (!is_hostmem_dtype && gpu_device_info &&
+          (opts_.rets_alloc_attrs.empty() ||
+           !opts_.rets_alloc_attrs[0].on_host())) {
+        // Copy the ret value to host if it's allocated on device.
+        Device* device = down_cast<Device*>(ctx_->device());
+        DeviceContext* device_ctx = ctx_->op_device_context();
+        cond_t = Tensor(rets_[0].dtype(), rets_[0].shape());
+        Notification done_copy;
+        device_ctx->CopyDeviceTensorToCPU(
+            &rets_[0], /*tensor_name=*/"", device, &cond_t,
+            [&done_copy, &s](const Status& status) {
+              s = status;
+              done_copy.Notify();
+            });
+        done_copy.WaitForNotification();
+        if (!s.ok()) {
+          return Finish(s);
+        }
+      } else {
+        cond_t = rets_[0];
+      }
+#else
+      cond_t = rets_[0];
+#endif
       bool cond;
-      Status s = ToBool(rets_, &cond);
+      s = ToBool({cond_t}, &cond);
+
       if (!s.ok()) {
         return Finish(s);
       }
diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc
index e6fefe643b72bd5a169f0c152ac2fee2568462aa..5cd8e049272e61c84950226893c281785e56a51a 100644
--- a/tensorflow/core/kernels/gather_functor.cc
+++ b/tensorflow/core/kernels/gather_functor.cc
@@ -37,6 +37,7 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX(T, int32); \
   DECLARE_GPU_SPECS_INDEX(T, int64)
 
+TF_CALL_int64(DECLARE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_complex64(DECLARE_GPU_SPECS);
 TF_CALL_complex128(DECLARE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_gpu.cu.cc
index 39b6924d74a004b4daa35a8cdfe656963d564490..4563fc635353e93f523ac035ff7f74bcd0410c09 100644
--- a/tensorflow/core/kernels/gather_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/gather_functor_gpu.cu.cc
@@ -31,6 +31,7 @@ typedef Eigen::GpuDevice GPUDevice;
   DEFINE_GPU_SPECS_INDEX(T, int32); \
   DEFINE_GPU_SPECS_INDEX(T, int64);
 
+TF_CALL_int64(DEFINE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
 TF_CALL_complex64(DEFINE_GPU_SPECS);
 TF_CALL_complex128(DEFINE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index 7e5a9e1ec5aac26706d95646a29539dd0f4be2ed..4e53291b7fe715fb72e81fa275608cd0e501723a 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -228,6 +228,8 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX(T, int32); \
   DECLARE_GPU_SPECS_INDEX(T, int64)
 
+TF_CALL_int32(DECLARE_GPU_SPECS);
+TF_CALL_int64(DECLARE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_complex64(DECLARE_GPU_SPECS);
 TF_CALL_complex128(DECLARE_GPU_SPECS);
@@ -239,6 +241,8 @@ TF_CALL_complex128(DECLARE_GPU_SPECS);
 // Registration of the GPU implementations.
 #define REGISTER_GATHER_ND_GPU(type) REGISTER_GATHER_ND_ALL_INDICES(GPU, type)
 
+TF_CALL_int32(REGISTER_GATHER_ND_GPU);
+TF_CALL_int64(REGISTER_GATHER_ND_GPU);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_ND_GPU);
 TF_CALL_complex64(REGISTER_GATHER_ND_GPU);
 TF_CALL_complex128(REGISTER_GATHER_ND_GPU);
diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
index b03efc684ffca4abde99b31952983aad5f805ee3..da8d2e9e3cb35235dc524545aebac80065945a11 100644
--- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
@@ -119,6 +119,8 @@ struct GatherNdSlice<GPUDevice, T, Index, IXDIM> {
   DEFINE_GPU_SPECS_INDEX(T, int32); \
   DEFINE_GPU_SPECS_INDEX(T, int64);
 
+TF_CALL_int32(DEFINE_GPU_SPECS);
+TF_CALL_int64(DEFINE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
 TF_CALL_complex64(DEFINE_GPU_SPECS);
 TF_CALL_complex128(DEFINE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index ef332ebee39cc31f71d6a03ce61d7bded39292de..094504d6b95d4c0f9890e8eb455018477d4261a5 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -153,6 +153,7 @@ TF_CALL_uint64(REGISTER_GATHER_CPU);
 // Registration of the GPU implementations.
 #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type)
 
+TF_CALL_int64(REGISTER_GATHER_GPU);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU);
 TF_CALL_complex64(REGISTER_GATHER_GPU);
 TF_CALL_complex128(REGISTER_GATHER_GPU);
diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD
index 66aeec51050c9fe16f54f5bb5e29c93ae4b2b8df..4870d9ae200cd55adc4833c044e5588aa1d6aa89 100644
--- a/tensorflow/core/kernels/hexagon/BUILD
+++ b/tensorflow/core/kernels/hexagon/BUILD
@@ -70,7 +70,6 @@ tf_kernel_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
         "//third_party/eigen3",
-        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index 3664f95c3b1ed44fdc5d82390aa8d661ab2cfd30..f9c15ce6d745eb2571aedf12d04dfb271461f5a0 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -577,9 +577,7 @@ TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
-#if CUDA_VERSION >= 7050
 TF_CALL_half(REGISTER_GPU);
-#endif
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 9ab95d765c39d70448e5a99aeb3fad6101827daf..a9b952095d5798fd7f53907850a186951f88efc9 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -14,6 +14,7 @@ limitations under the License.
 
 #include <limits>
 #include <vector>
+#include <unordered_map>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -589,8 +590,8 @@ class MklConcatOp : public OpKernel {
       const int N = input_tensors.size();
 
       // Get Tensor shapes.
-      std::vector<MklDnnShape> input_shapes(N);
-      GetMklShapeList(context, "values", &input_shapes);
+      std::vector<MklDnnShape> mkl_input_shapes(N);
+      GetMklShapeList(context, "values", &mkl_input_shapes);
 
       const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM)
                                             ? MklGetInput(context, 0)
@@ -609,19 +610,14 @@ class MklConcatOp : public OpKernel {
       int i = 0;
       bool invoke_eigen = false;
       bool are_all_mkl_inputs = true, are_all_tf_inputs = true;
-      const TensorShape expected_shape = input_shapes[0].IsMklTensor()
-                                             ? input_shapes[0].GetTfShape()
-                                             : input_tensors[0].shape();
+      const TensorShape expected_shape = mkl_input_shapes[0].IsMklTensor()
+                                       ? mkl_input_shapes[0].GetTfShape()
+                                       : input_tensors[0].shape();
       size_t expected_dims = expected_shape.dims();
 
       if (concat_dim < 0) concat_dim = expected_dims + concat_dim;
 
-      for (auto& s : input_shapes) {
-        if (s == expected_shape) {
-          ++i;
-          continue;
-        }
-
+      for (auto& s : mkl_input_shapes) {
         TensorShape s_shape =
             s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
         size_t s_dims = s_shape.dims();
@@ -664,21 +660,14 @@ class MklConcatOp : public OpKernel {
 
       // Call Eigen library
       if (invoke_eigen) {
-        TensorShapeList tf_input_shapes;
-        i = 0;
-        for (auto& s : input_shapes) {
-          TensorShape s_shape =
-              s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
-          tf_input_shapes.push_back(s_shape);
-          ++i;
-        }
-        CallEigenVersion(context, input_tensors, tf_input_shapes);
+        CallEigenVersion(context, input_tensors, mkl_input_shapes);
         return;
       }
 
       memory::dims dst_dims;
+
       if (are_all_mkl_inputs)
-        dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape());
+        dst_dims = TFShapeToMklDnnDims(mkl_input_shapes[0].GetTfShape());
       else
         // When all the inputs are in Tensorflow format, we don't know
         // what is the input data format. In that case, we just use
@@ -688,26 +677,61 @@ class MklConcatOp : public OpKernel {
       std::vector<memory::primitive_desc> srcs_pd;
       std::vector<MklDnnData<T>> srcs(N, MklDnnData<T>(&cpu_engine));
       int64 dst_concat_dim_size = 0;
-      for (int k = 0; k < N; k++) {
-        bool is_mkl_tensor = input_shapes[k].IsMklTensor();
-        memory::dims src_dims;
-
-        // Same comment as dst_dims for src_dims.
-        src_dims = (is_mkl_tensor)
-                       ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape())
-                       : TFShapeToMklDnnDims(input_tensors[k].shape());
-
-        dst_concat_dim_size += src_dims[concat_dim];
-        auto src_md =
-            is_mkl_tensor ? input_shapes[k].GetMklLayout() :
-                          // It does not matter what data format we use here
-                          // (NHWC or NCHW). We just need to ensure that output
-                          // of Concat uses same data format as input.
-                memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
-
-        srcs[k].SetUsrMem(src_md, &input_tensors[k]);
-        auto src_mpd = srcs[k].GetUsrMemPrimDesc();
-        srcs_pd.push_back(src_mpd);
+
+      bool isMklReorderNeeded = false;
+      memory::format mkl_common_format = memory::format::any;
+      if (are_all_mkl_inputs) {
+        mkl_common_format =
+            FindMklCommonFormat(mkl_input_shapes, concat_dim,
+               &isMklReorderNeeded, &dst_concat_dim_size);
+
+        if (!isMklReorderNeeded) {
+          // All MKL tensors have a same format. Reorder is not needed.
+          for (int k = 0; k < N; k++) {
+            if (input_tensors[k].NumElements() == 0)
+              continue;
+
+            auto src_md = mkl_input_shapes[k].GetMklLayout();
+            srcs[k].SetUsrMem(src_md, &input_tensors[k]);
+            auto src_mpd = srcs[k].GetUsrMemPrimDesc();
+            srcs_pd.push_back(src_mpd);
+          }
+        } else {
+          // MKL tensors have different formats.
+          // Reorder them to most common format.
+          for (int k = 0; k < N; k++) {
+            if (input_tensors[k].NumElements() == 0)
+              continue;
+
+            auto src_dims = TFShapeToMklDnnDims(
+                mkl_input_shapes[k].GetTfShape());
+            auto src_md = mkl_input_shapes[k].GetMklLayout();
+            srcs[k].SetUsrMem(src_md, &input_tensors[k]);
+
+            if (src_md.data.format != mkl_common_format)
+              src_md = memory::desc(src_dims, MklDnnType<T>(),
+                           mkl_common_format);
+
+            srcs_pd.push_back(memory::primitive_desc(src_md, cpu_engine));
+          }
+        }
+      } else {  // All TF inputs
+        for (int k = 0; k < N; k++) {
+          if (input_tensors[k].NumElements() == 0)
+            continue;
+
+          memory::dims src_dims = TFShapeToMklDnnDims(input_tensors[k].shape());
+          dst_concat_dim_size += src_dims[concat_dim];
+
+          // It does not matter what data format to be used (NHWC versus NCHW).
+          // We just need to ensure that output uses same data format as inputs.
+          auto src_md =
+              memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
+
+          srcs[k].SetUsrMem(src_md, &input_tensors[k]);
+          auto src_mpd = srcs[k].GetUsrMemPrimDesc();
+          srcs_pd.push_back(src_mpd);
+        }
       }
       dst_dims[concat_dim] = dst_concat_dim_size;
 
@@ -717,25 +741,33 @@ class MklConcatOp : public OpKernel {
       if (are_all_mkl_inputs) {
         // Since we are passing a specific format for destination,
         // we need to have dst_dims in MklDnn order (NCHW).
-        auto orig_tf_format = input_shapes[0].GetTfDataFormat();
+        auto orig_tf_format = mkl_input_shapes[0].GetTfDataFormat();
         dst_dims_in_nchw = MklDnnDimsInNCHW(
             dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format));
-        // We will set the output in the same format as input to avoid layout
-        // conversions.
-        // Currently we are setting dst format same as input format.
-        // See if we can make this choice in a better way.
+        // Set the output format same as the most common format of inputs
+        // to avoid layout conversions.
         dst_md = memory::desc(
-            dst_dims_in_nchw, MklDnnType<T>(),
-            (memory::format)input_shapes[0].GetMklLayout().data.format);
+            dst_dims_in_nchw, MklDnnType<T>(), mkl_common_format);
       } else {
-        // Again, format does not matter here. We just need to make it same as
-        // input format.
+        // All inputs are TF tensors.
+        // Set the output format same as input format (nchw).
         dst_md = memory::desc(dst_dims, MklDnnType<T>(), memory::format::nchw);
       }
 
       std::vector<primitive::at> inputs;
-      for (int k = 0; k < input_tensors.size(); k++)
-        inputs.push_back(srcs[k].GetOpMem());
+      std::vector<primitive> net;
+      if (isMklReorderNeeded) {
+        for (int k = 0; k < input_tensors.size(); k++) {
+          if (input_tensors[k].NumElements() > 0) {
+            srcs[k].CheckReorderToOpMem(srcs_pd[k], &net);
+          }
+        }
+      }
+      for (int k = 0; k < input_tensors.size(); k++) {
+        if (input_tensors[k].NumElements() > 0) {
+          inputs.push_back(srcs[k].GetOpMem());
+        }
+      }
 
       // If all inputs are in MKL format, then meaning of concat_dim needs to
       // change. Value of concat_dim is tied to input Tensorflow data format
@@ -744,7 +776,8 @@ class MklConcatOp : public OpKernel {
       // But ifinput tensors are in NHWC order, then semantics need to change.
       // E.g., if we are concatinating over Channel (dimension 3 for NHWC),
       // then since MklDnn order is NCHW, concat_dim needs to be 1.
-      if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim);
+      if (are_all_mkl_inputs)
+         concat_dim = mkl_input_shapes[0].TfDimIdx(concat_dim);
 
       auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd);
 
@@ -757,7 +790,7 @@ class MklConcatOp : public OpKernel {
         dnn_shape_dst.SetMklLayout(&dst_pd);
         dnn_shape_dst.SetElemType(MklDnnType<T>());
         dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw,
-                                  input_shapes[0].GetTfDataFormat());
+                                  mkl_input_shapes[0].GetTfDataFormat());
         tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T)));
       } else {
         dnn_shape_dst.SetMklTensor(false);
@@ -772,7 +805,6 @@ class MklConcatOp : public OpKernel {
       dst.SetUsrMem(dst_md, dst_tensor);
 
       auto concat_op = concat(concat_pd, inputs, dst.GetOpMem());
-      std::vector<primitive> net;
       net.push_back(concat_op);
       stream(stream::kind::eager).submit(net).wait();
     } catch (mkldnn::error& e) {
@@ -786,15 +818,27 @@ class MklConcatOp : public OpKernel {
   }
 
   void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
-                        const TensorShapeList& input_shapes) {
-    CHECK_EQ(values.size(), input_shapes.size());
+                        const MklDnnShapeList& mkl_input_shapes) {
+    CHECK_EQ(values.size(), mkl_input_shapes.size());
 
     std::vector<Tensor> converted_values;
-    for (int i = 0; i < input_shapes.size(); i++)
-      converted_values.push_back(values[i]);
+    TensorShapeList tf_input_shapes;
+    for (int i = 0; i < mkl_input_shapes.size(); i++) {
+      if (mkl_input_shapes[i].IsMklTensor()) {
+        // do conversion from MKL to TF
+        Tensor tmp_tensor =
+            ConvertMklToTF<T>(context, values[i], mkl_input_shapes[i]);
+        converted_values.push_back(tmp_tensor);
+        tf_input_shapes.push_back(mkl_input_shapes[i].GetTfShape());
+      } else {
+        // no conversion since it is TF tensor already
+        converted_values.push_back(values[i]);
+        tf_input_shapes.push_back(values[i].shape());
+      }
+    }
 
     // Call Eigen concat.
-    eigen_concat_op_.Compute(context, converted_values, input_shapes);
+    eigen_concat_op_.Compute(context, converted_values, tf_input_shapes);
 
     // Set output Mkl tensor for this op.
     MklDnnShape dnn_shape_output;
@@ -811,6 +855,55 @@ class MklConcatOp : public OpKernel {
         output_tensor->flat<uint8>().data(),
         output_tensor->flat<uint8>().size() * sizeof(uint8));
   }
+
+  // This method finds the most commom format accross all MKL inputs
+  // Inputs:
+  //   1. input_shapes: shapes of input (MKL) tensors.
+  //   2. concat_dim: concat dimension.
+  // Outputs:
+  //   1. is_reorder_needed is set to true if inputs have difference formats
+  //      It is set to false otherwise.
+  //   2. concat_dim_size is the size of concat_dim.
+  // Return:
+  //   return the common MKL format.
+  memory::format FindMklCommonFormat(const MklDnnShapeList& input_shapes,
+      int concat_dim, bool* is_reorder_needed, int64* concat_dim_size) {
+    *is_reorder_needed = false;
+    *concat_dim_size = 0;
+    std::unordered_map<int, int> occurrence_map;
+    if (input_shapes.size() == 0)
+      return memory::format::any;
+
+    // Compute ocurrences of each format of all inputs.
+    for (int k=0; k <input_shapes.size(); k++) {
+      auto src_dims = TFShapeToMklDnnDims(input_shapes[k].GetTfShape());
+      *concat_dim_size += src_dims[concat_dim];
+      int fmt = static_cast<int>(
+          input_shapes[k].GetMklLayout().data.format);
+      occurrence_map[fmt] += 1;
+    }
+
+    if (occurrence_map.size() == 1) {
+       // this means that all inputs have a same format
+       // return it with is_reorder_needed set false.
+       return static_cast<memory::format>(
+           input_shapes[0].GetMklLayout().data.format);
+    }
+
+    // Input tensors have different formats. Thus, reorder is needed.
+    // We pick up the most common format to minimize the total
+    // number of input reorder.
+    memory::format commonest_format = memory::format::any;
+    int max_occurrence = 0;
+    *is_reorder_needed = true;
+    for (auto item : occurrence_map) {
+      if (item.second > max_occurrence) {
+        commonest_format = static_cast<memory::format>(item.first);
+        max_occurrence = item.second;
+      }
+    }
+    return commonest_format;
+  }
 };
 
 #endif
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index d23027a54d169b5e597bd26a63f26d38a23239ae..a6698a1a0780ad90dea8485c413786d982d2c952 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -18,6 +18,7 @@ limitations under the License.
 // bias.
 
 #ifdef INTEL_MKL
+#ifdef INTEL_MKL_ML
 
 #define USE_EIGEN_TENSOR
 #define EIGEN_USE_THREADS
@@ -262,4 +263,5 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel {
 TF_CALL_float(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 } /* namespace tensorflow */
+#endif /* INTEL_MKL_ML */
 #endif /* INTEL_MKL */
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 279167aba24863441774b0665e9793e52d84ccfa..c0dfed7d7d079c2b837afdb440c01687e6b6d4db 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -199,13 +199,15 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
     CHECK_NOTNULL(pool_params);
     CHECK_NOTNULL(dnn_data_input);
     TensorShape input_tensor_shape = input_tensor.shape();
-    memory::desc input_md =
+    if (input_tensor.NumElements() != 0) {
+      memory::desc input_md =
         input_mkl_shape.IsMklTensor()
             ? input_mkl_shape.GetMklLayout()
             : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
                                                      this->data_format_tf_),
                            MklDnnType<T>(), this->data_format_mkldnn_);
-    dnn_data_input->SetUsrMem(input_md, &input_tensor);
+      dnn_data_input->SetUsrMem(input_md, &input_tensor);
+    }
     this->InitMklPoolParameters(context, pool_params, input_mkl_shape,
                                 input_tensor_shape);
   }
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 903b898d0ac850e88c216cb1cc266cdb29fb4ca7..2b010f816d4e503b3b5a89a7b406273511cad954 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/non_max_suppression_op.h"
 
+#include <queue>
 #include <vector>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -56,20 +57,9 @@ static inline void ParseAndCheckBoxSizes(OpKernelContext* context,
               errors::InvalidArgument("scores has incompatible shape"));
 }
 
-static inline void DecreasingArgSort(const std::vector<float>& values,
-                                     std::vector<int>* indices) {
-  indices->resize(values.size());
-  for (int i = 0; i < values.size(); ++i) (*indices)[i] = i;
-  std::sort(
-      indices->begin(), indices->end(),
-      [&values](const int i, const int j) { return values[i] > values[j]; });
-}
-
-// Return true if intersection-over-union overlap between boxes i and j
-// is greater than iou_threshold.
-static inline bool IOUGreaterThanThreshold(
-    typename TTypes<float, 2>::ConstTensor boxes, int i, int j,
-    float iou_threshold) {
+// Return intersection-over-union overlap between boxes i and j
+static inline float IOU(typename TTypes<float, 2>::ConstTensor boxes, int i,
+                        int j) {
   const float ymin_i = std::min<float>(boxes(i, 0), boxes(i, 2));
   const float xmin_i = std::min<float>(boxes(i, 1), boxes(i, 3));
   const float ymax_i = std::max<float>(boxes(i, 0), boxes(i, 2));
@@ -88,13 +78,13 @@ static inline bool IOUGreaterThanThreshold(
   const float intersection_area =
       std::max<float>(intersection_ymax - intersection_ymin, 0.0) *
       std::max<float>(intersection_xmax - intersection_xmin, 0.0);
-  const float iou = intersection_area / (area_i + area_j - intersection_area);
-  return iou > iou_threshold;
+  return intersection_area / (area_i + area_j - intersection_area);
 }
 
 void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
                            const Tensor& scores, const Tensor& max_output_size,
-                           const float iou_threshold) {
+                           const float iou_threshold,
+                           const float score_threshold) {
   OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1,
               errors::InvalidArgument("iou_threshold must be in [0, 1]"));
 
@@ -109,37 +99,61 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
 
   std::vector<float> scores_data(num_boxes);
   std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
-  std::vector<int> sorted_indices;
-  DecreasingArgSort(scores_data, &sorted_indices);
+
+  // Data structure for selection candidate in NMS.
+  struct Candidate {
+    int box_index;
+    float score;
+  };
+
+  auto cmp = [](const Candidate bs_i, const Candidate bs_j) {
+    return bs_i.score < bs_j.score;
+  };
+  std::priority_queue<Candidate, std::deque<Candidate>, decltype(cmp)>
+      candidate_priority_queue(cmp);
+  for (int i = 0; i < scores_data.size(); ++i) {
+    if (scores_data[i] > score_threshold) {
+      candidate_priority_queue.emplace(Candidate({i, scores_data[i]}));
+    }
+  }
+
+  auto suppress_func = [iou_threshold](const float x) {
+    return x <= iou_threshold ? 1 : 0;
+  };
 
   std::vector<int> selected;
-  std::vector<int> selected_indices(output_size, 0);
-  int num_selected = 0;
-  for (int i = 0; i < num_boxes; ++i) {
-    if (selected.size() >= output_size) break;
-    bool should_select = true;
+  std::vector<float> selected_scores;
+  Candidate next_candidate;
+  float iou, original_score;
+
+  while (selected.size() < output_size && !candidate_priority_queue.empty()) {
+    next_candidate = candidate_priority_queue.top();
+    original_score = next_candidate.score;
+    candidate_priority_queue.pop();
+
     // Overlapping boxes are likely to have similar scores,
-    // therefore we iterate through the selected boxes backwards.
-    for (int j = num_selected - 1; j >= 0; --j) {
-      if (IOUGreaterThanThreshold(boxes_data, sorted_indices[i],
-                                  sorted_indices[selected_indices[j]],
-                                  iou_threshold)) {
-        should_select = false;
-        break;
-      }
+    // therefore we iterate through the previously selected boxes backwards
+    // in order to see if `next_candidate` should be suppressed.
+    for (int j = selected.size() - 1; j >= 0; --j) {
+      iou = IOU(boxes_data, next_candidate.box_index, selected[j]);
+      if (iou == 0.0) continue;
+      next_candidate.score *= suppress_func(iou);
+      if (next_candidate.score <= score_threshold) break;
     }
-    if (should_select) {
-      selected.push_back(sorted_indices[i]);
-      selected_indices[num_selected++] = i;
+
+    if (original_score == next_candidate.score) {
+      selected.push_back(next_candidate.box_index);
+      selected_scores.push_back(next_candidate.score);
     }
   }
 
-  // Allocate output tensor
-  Tensor* output = nullptr;
+  // Allocate output tensors
+  Tensor* output_indices = nullptr;
   TensorShape output_shape({static_cast<int>(selected.size())});
-  OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
-  TTypes<int, 1>::Tensor selected_indices_data = output->tensor<int, 1>();
-  std::copy_n(selected.begin(), selected.size(), selected_indices_data.data());
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(0, output_shape, &output_indices));
+  TTypes<int, 1>::Tensor output_indices_data = output_indices->tensor<int, 1>();
+  std::copy_n(selected.begin(), selected.size(), output_indices_data.data());
 }
 
 }  // namespace
@@ -164,8 +178,9 @@ class NonMaxSuppressionOp : public OpKernel {
         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
                                 max_output_size.shape().DebugString()));
 
+    const float score_threshold_val = 0.0;
     DoNonMaxSuppressionOp(context, boxes, scores, max_output_size,
-                          iou_threshold_);
+                          iou_threshold_, score_threshold_val);
   }
 
  private:
@@ -194,11 +209,48 @@ class NonMaxSuppressionV2Op : public OpKernel {
     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
                                         iou_threshold.shape().DebugString()));
+    const float iou_threshold_val = iou_threshold.scalar<float>()();
 
+    const float score_threshold_val = 0.0;
+    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size,
+                          iou_threshold_val, score_threshold_val);
+  }
+};
+
+template <typename Device>
+class NonMaxSuppressionV3Op : public OpKernel {
+ public:
+  explicit NonMaxSuppressionV3Op(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // boxes: [num_boxes, 4]
+    const Tensor& boxes = context->input(0);
+    // scores: [num_boxes]
+    const Tensor& scores = context->input(1);
+    // max_output_size: scalar
+    const Tensor& max_output_size = context->input(2);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(max_output_size.shape()),
+        errors::InvalidArgument("max_output_size must be 0-D, got shape ",
+                                max_output_size.shape().DebugString()));
+    // iou_threshold: scalar
+    const Tensor& iou_threshold = context->input(3);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                        iou_threshold.shape().DebugString()));
     const float iou_threshold_val = iou_threshold.scalar<float>()();
 
+    // score_threshold: scalar
+    const Tensor& score_threshold = context->input(4);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(score_threshold.shape()),
+        errors::InvalidArgument("score_threshold must be 0-D, got shape ",
+                                score_threshold.shape().DebugString()));
+    const float score_threshold_val = score_threshold.scalar<float>()();
+
     DoNonMaxSuppressionOp(context, boxes, scores, max_output_size,
-                          iou_threshold_val);
+                          iou_threshold_val, score_threshold_val);
   }
 };
 
@@ -208,4 +260,7 @@ REGISTER_KERNEL_BUILDER(Name("NonMaxSuppression").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV2").Device(DEVICE_CPU),
                         NonMaxSuppressionV2Op<CPUDevice>);
 
+REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV3").Device(DEVICE_CPU),
+                        NonMaxSuppressionV3Op<CPUDevice>);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/non_max_suppression_op.h b/tensorflow/core/kernels/non_max_suppression_op.h
index d4349edf176f705fcc07de3c6838366ed5c9c6bf..933b1af447d94b1d8c7a91ecb26aa84263fbe265 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.h
+++ b/tensorflow/core/kernels/non_max_suppression_op.h
@@ -27,7 +27,8 @@ template <typename Device, typename T>
 struct NonMaxSuppression {
   void operator()(const Device& d, typename TTypes<float, 2>::ConstTensor boxes,
                   typename TTypes<float, 1>::ConstTensor scores,
-                  float iou_threshold, int max_output_size,
+                  float iou_threshold, float score_threshold,
+                  int max_output_size,
                   typename TTypes<int, 1>::Tensor selected_indices);
 };
 
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index 9387fb13bc252418d247b25464c67b90a864901d..c71aa23e01d1a552ae0ded1e196a031ea018c68c 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -340,4 +340,195 @@ TEST_F(NonMaxSuppressionV2OpTest, TestEmptyInput) {
   test::ExpectTensorEqual<int>(expected, *GetOutput(0));
 }
 
+//
+// NonMaxSuppressionV3Op Tests
+//
+
+class NonMaxSuppressionV3OpTest : public OpsTestBase {
+ protected:
+  void MakeOp() {
+    TF_EXPECT_OK(NodeDefBuilder("non_max_suppression_op", "NonMaxSuppressionV3")
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_INT32))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+  }
+};
+
+TEST_F(NonMaxSuppressionV3OpTest, TestSelectFromThreeClusters) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({3}));
+  test::FillValues<int>(&expected, {3, 0, 5});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV3OpTest,
+       TestSelectFromThreeClustersWithScoreThreshold) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {0.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.4f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected, {3, 0});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV3OpTest,
+       TestSelectFromThreeClustersFlippedCoordinates) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({6, 4}),
+                           {1, 1,  0, 0,  0, 0.1f,  1, 1.1f,  0, .9f, 1, -0.1f,
+                            0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({3}));
+  test::FillValues<int>(&expected, {3, 0, 5});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV3OpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {2});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected, {3, 0});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV3OpTest,
+       TestSelectAtMostThirtyBoxesFromThreeClusters) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {30});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({3}));
+  test::FillValues<int>(&expected, {3, 0, 5});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV3OpTest, TestSelectSingleBox) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
+  AddInputFromArray<float>(TensorShape({1}), {.9f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected, {0});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV3OpTest, TestSelectFromTenIdenticalBoxes) {
+  MakeOp();
+
+  int num_boxes = 10;
+  std::vector<float> corners(num_boxes * 4);
+  std::vector<float> scores(num_boxes);
+  for (int i = 0; i < num_boxes; ++i) {
+    corners[i * 4 + 0] = 0;
+    corners[i * 4 + 1] = 0;
+    corners[i * 4 + 2] = 1;
+    corners[i * 4 + 3] = 1;
+    scores[i] = .9;
+  }
+  AddInputFromArray<float>(TensorShape({num_boxes, 4}), corners);
+  AddInputFromArray<float>(TensorShape({num_boxes}), scores);
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected, {0});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV3OpTest, TestInconsistentBoxAndScoreShapes) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
+  AddInputFromArray<float>(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f});
+  AddInputFromArray<int>(TensorShape({}), {30});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  Status s = RunOpKernel();
+
+  ASSERT_FALSE(s.ok());
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(), "scores has incompatible shape"))
+      << s;
+}
+
+TEST_F(NonMaxSuppressionV3OpTest, TestInvalidIOUThreshold) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
+  AddInputFromArray<float>(TensorShape({1}), {.9f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {1.2f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  Status s = RunOpKernel();
+
+  ASSERT_FALSE(s.ok());
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(), "iou_threshold must be in [0, 1]"))
+      << s;
+}
+
+TEST_F(NonMaxSuppressionV3OpTest, TestEmptyInput) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({0, 4}), {});
+  AddInputFromArray<float>(TensorShape({0}), {});
+  AddInputFromArray<int>(TensorShape({}), {30});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({0}));
+  test::FillValues<int>(&expected, {});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ops_testutil.cc b/tensorflow/core/kernels/ops_testutil.cc
index cd13d31bbc2d2d77ed3d1aabfa17af40bfff0272..7aa7d1a5861608658408189a8ce8d93ff1a6c9de 100644
--- a/tensorflow/core/kernels/ops_testutil.cc
+++ b/tensorflow/core/kernels/ops_testutil.cc
@@ -24,7 +24,7 @@ namespace tensorflow {
 
 void OpsTestBase::SetDevice(const DeviceType& device_type,
                             std::unique_ptr<Device> device) {
-  CHECK(device_.get()) << "No device provided";
+  CHECK(device_) << "No device provided";
   device_type_ = device_type;
   device_ = std::move(device);
 #ifdef GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/padding_fifo_queue.cc b/tensorflow/core/kernels/padding_fifo_queue.cc
index 9d35ecb66c00e0cf7a2298a9d324c910ed33c7cc..ff553f11c9fdfa3e49319d9c0778cfb41b40af8c 100644
--- a/tensorflow/core/kernels/padding_fifo_queue.cc
+++ b/tensorflow/core/kernels/padding_fifo_queue.cc
@@ -23,13 +23,13 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/padding_fifo_queue.h"
 #include "tensorflow/core/kernels/queue_base.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/priority_queue.cc b/tensorflow/core/kernels/priority_queue.cc
index bab94f7f0ad1fd7609761aaabc4f76ae6eafeb7b..342e66a7c7d0e0bc74e39efcd971c105ce8ea929 100644
--- a/tensorflow/core/kernels/priority_queue.cc
+++ b/tensorflow/core/kernels/priority_queue.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/priority_queue.h"
 #include "tensorflow/core/kernels/queue_base.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -30,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/quantize_op.cc b/tensorflow/core/kernels/quantize_op.cc
index fc26813a08e5afabf893f963eefa59fba18873d9..857273e04eec2f9a132133a9289612ad8277a302 100644
--- a/tensorflow/core/kernels/quantize_op.cc
+++ b/tensorflow/core/kernels/quantize_op.cc
@@ -131,6 +131,7 @@ class QuantizeV2Op : public OpKernel {
 
     Tensor* output = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
+    typename TTypes<T>::Vec o = output->template flat<T>();
     if (mode_ == QUANTIZE_MODE_MIN_COMBINED) {
       const float scale_factor =
           (static_cast<double>(std::numeric_limits<T>::max()) -
@@ -147,7 +148,6 @@ class QuantizeV2Op : public OpKernel {
       // semantic of std::round, which implements "round-half-away-zero",
       // e.g., -5.5 gets rounded to -6, -5.4 goes to -5, 5.4 goes to 5,
       // and 5.5 goes to 6.
-      typename TTypes<T>::Vec o = output->template flat<T>();
       bool is_signed = std::is_signed<T>::value;
       if (is_signed) {
         // The slow path.
@@ -180,29 +180,20 @@ class QuantizeV2Op : public OpKernel {
             output);
       }
     } else if (mode_ == QUANTIZE_MODE_SCALED) {
-      // The quantization logic for mode SCALED matches that of
-      // QuantizeAndDequantizeV2 and QuantizeAndDequantizeV3.
-      typename TTypes<T>::Vec o = output->template flat<T>();
-      static constexpr int num_bits = sizeof(T) * 8;
-      const float max_abs = std::max(std::abs(min_range), std::abs(max_range));
-      const bool is_signed = std::is_signed<T>::value;
-      float target_range;
-      if (is_signed) {
-        max_range = max_abs;
-        min_range = -max_abs;
-        // If it is signed, we try to keep 0.0 being 0 and drop one bucket. For
-        // example, if it is 8 bits, we have the range [-127, 127]. So for input
-        // range of [-x, x], the scale should be 254/(2*x).
-        target_range = static_cast<float>((uint64_t{1} << (num_bits - 1)) - 1);
-      } else {
-        max_range = max_abs;
-        min_range = 0.0;
-        // If it is unsigned and num_bits == 8, the range with 8 bits is [0,
-        // 255].  If the input range is [0, x], then the scale is x/255 instead
-        // of 254 as in the case above.
-        target_range = static_cast<float>((uint64_t{1} << num_bits) - 1);
-      }
-      const float scale_factor = target_range / max_abs;
+      const int min_output_value = std::numeric_limits<T>::min();
+      const int max_output_value = std::numeric_limits<T>::max();
+      const float scale_factor_from_min_side =
+          (min_output_value * min_range > 0)
+              ? min_output_value / min_range
+              : std::numeric_limits<float>::max();
+      const float scale_factor_from_max_side =
+          (max_output_value * max_range > 0)
+              ? max_output_value / max_range
+              : std::numeric_limits<float>::max();
+      const float scale_factor =
+          std::min(scale_factor_from_min_side, scale_factor_from_max_side);
+      min_range = min_output_value / scale_factor;
+      max_range = max_output_value / scale_factor;
       if (round_mode_ == ROUND_HALF_TO_EVEN) {
         // scalar_round_op_google implements "round-half-to-even".
         o.device(ctx->template eigen_device<Device>()) =
diff --git a/tensorflow/core/kernels/quantize_op_test.cc b/tensorflow/core/kernels/quantize_op_test.cc
index 57982bdf76e3969b31f4ee73cbf47c564b2b53e6..0a672686a2d49b59e6b672660e65fdd2c344f229 100644
--- a/tensorflow/core/kernels/quantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_op_test.cc
@@ -61,17 +61,17 @@ TEST_F(QuantizedOpTest, QuantizeV2Quint8Scaled) {
                    .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
   AddInputFromArray<float>(TensorShape({8}),
-                           {-255.0, 0.0, 1.0, 1.25, 1.75, 127.0, 255.0, 500.0});
+                           {-255.0, 0.0, 1.0, 1.25, 1.75, 64.0, 127.0, 500.0});
   AddInputFromArray<float>(TensorShape({1}), {-255.0f});
   AddInputFromArray<float>(TensorShape({1}), {127.0f});
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_QUINT8, TensorShape({8}));
-  // Input element -5.0 should map to 0 even though min_range = -255, because
+  // Input values < 0 should map to 0 even though min_range = -255, because
   // we are performing quantization by scaling to quint8.
-  // Input element 0.0 should map to 0.
-  // Input element 500.0 is quantized to 127 because
-  // max(abs(-255), abs(127)) = 255.
-  test::FillValues<quint8>(&expected, {0, 0, 1, 1, 2, 127, 255, 255});
+  // Input value 0.0 should map to 0.
+  // The scale factor chosen should be 255 / 127 =  2.00787
+  // Output values are clipped to 255.
+  test::FillValues<quint8>(&expected, {0, 0, 2, 3, 4, 129, 255, 255});
   test::ExpectTensorEqual<quint8>(expected, *GetOutput(0));
 
   Tensor expected_output_min(allocator(), DT_FLOAT, TensorShape({}));
@@ -79,7 +79,7 @@ TEST_F(QuantizedOpTest, QuantizeV2Quint8Scaled) {
   test::ExpectTensorEqual<float>(expected_output_min, *GetOutput(1));
 
   Tensor expected_output_max(allocator(), DT_FLOAT, TensorShape({}));
-  test::FillValues<float>(&expected_output_max, {255.0});
+  test::FillValues<float>(&expected_output_max, {127.0});
   test::ExpectTensorEqual<float>(expected_output_max, *GetOutput(2));
 }
 
@@ -123,19 +123,19 @@ TEST_F(QuantizedOpTest, QuantizeV2Qint8Scaled) {
                    .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
   AddInputFromArray<float>(TensorShape({7}),
-                           {-127.0, 0.0, 1.0, 1.25, 1.75, 64.0, 127.0});
-  AddInputFromArray<float>(TensorShape({1}), {-127.0f});
+                           {-128.0, 0.0, 1.0, 1.25, 1.75, 64.0, 127.0});
+  AddInputFromArray<float>(TensorShape({1}), {-128.0f});
   AddInputFromArray<float>(TensorShape({1}), {100.0f});
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_QINT8, TensorShape({7}));
   // Input element 0.0 should map to 0.
   // Input element 127.0 maps to 127 instead of 100 because
   // max(abs(-127), abs(100)) = 127.
-  test::FillValues<qint8>(&expected, {-127, 0, 1, 1, 2, 64, 127});
+  test::FillValues<qint8>(&expected, {-128, 0, 1, 1, 2, 64, 127});
   test::ExpectTensorEqual<qint8>(expected, *GetOutput(0));
 
   Tensor expected_output_min(allocator(), DT_FLOAT, TensorShape({}));
-  test::FillValues<float>(&expected_output_min, {-127.0});
+  test::FillValues<float>(&expected_output_min, {-128.0});
   test::ExpectTensorEqual<float>(expected_output_min, *GetOutput(1));
 
   Tensor expected_output_max(allocator(), DT_FLOAT, TensorShape({}));
@@ -152,9 +152,9 @@ TEST_F(QuantizedOpTest, QuantizeV2Qint8ScaledSmallInputRange) {
                    .Attr("mode", "SCALED")
                    .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
-  AddInputFromArray<float>(TensorShape({3}), {-1.0, 0.0, 2.0});
-  AddInputFromArray<float>(TensorShape({1}), {-1.0f});
-  AddInputFromArray<float>(TensorShape({1}), {2.0f});
+  AddInputFromArray<float>(TensorShape({3}), {-0.064, 0.0, 0.127});
+  AddInputFromArray<float>(TensorShape({1}), {-0.064f});
+  AddInputFromArray<float>(TensorShape({1}), {0.127f});
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_QINT8, TensorShape({3}));
   // Input element 0.0 should map to 0.
@@ -163,11 +163,11 @@ TEST_F(QuantizedOpTest, QuantizeV2Qint8ScaledSmallInputRange) {
   test::ExpectTensorEqual<qint8>(expected, *GetOutput(0));
 
   Tensor expected_output_min(allocator(), DT_FLOAT, TensorShape({}));
-  test::FillValues<float>(&expected_output_min, {-2.0});
+  test::FillValues<float>(&expected_output_min, {-0.128});
   test::ExpectTensorEqual<float>(expected_output_min, *GetOutput(1));
 
   Tensor expected_output_max(allocator(), DT_FLOAT, TensorShape({}));
-  test::FillValues<float>(&expected_output_max, {2.0});
+  test::FillValues<float>(&expected_output_max, {0.127});
   test::ExpectTensorEqual<float>(expected_output_max, *GetOutput(2));
 }
 
@@ -183,8 +183,8 @@ TEST_F(QuantizedOpTest, QuantizeV2Qint8ScaledRoundToEven) {
   TF_ASSERT_OK(InitOp());
   AddInputFromArray<float>(TensorShape({7}),
                            {-126.5, 0.0, 1.0, 2.5, 3.5, 64.0, 127.0});
-  AddInputFromArray<float>(TensorShape({1}), {-127.0f});
-  AddInputFromArray<float>(TensorShape({1}), {-127.0f});
+  AddInputFromArray<float>(TensorShape({1}), {-128.0f});
+  AddInputFromArray<float>(TensorShape({1}), {-128.0f});
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_QINT8, TensorShape({7}));
   // Input element 0.0 should map to 0.
@@ -193,7 +193,7 @@ TEST_F(QuantizedOpTest, QuantizeV2Qint8ScaledRoundToEven) {
   test::ExpectTensorEqual<qint8>(expected, *GetOutput(0));
 
   Tensor expected_output_min(allocator(), DT_FLOAT, TensorShape({}));
-  test::FillValues<float>(&expected_output_min, {-127.0});
+  test::FillValues<float>(&expected_output_min, {-128.0});
   test::ExpectTensorEqual<float>(expected_output_min, *GetOutput(1));
 
   Tensor expected_output_max(allocator(), DT_FLOAT, TensorShape({}));
@@ -213,8 +213,8 @@ TEST_F(QuantizedOpTest, QuantizeV2Qint8ScaledRoundAwayFromZero) {
   TF_ASSERT_OK(InitOp());
   AddInputFromArray<float>(TensorShape({7}),
                            {-126.5, 0.0, 1.0, 2.5, 3.5, 64.0, 127.0});
-  AddInputFromArray<float>(TensorShape({1}), {-127.0f});
-  AddInputFromArray<float>(TensorShape({1}), {-127.0f});
+  AddInputFromArray<float>(TensorShape({1}), {-128.0f});
+  AddInputFromArray<float>(TensorShape({1}), {-128.0f});
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_QINT8, TensorShape({7}));
   // Input element 0.0 should map to 0.
@@ -223,7 +223,7 @@ TEST_F(QuantizedOpTest, QuantizeV2Qint8ScaledRoundAwayFromZero) {
   test::ExpectTensorEqual<qint8>(expected, *GetOutput(0));
 
   Tensor expected_output_min(allocator(), DT_FLOAT, TensorShape({}));
-  test::FillValues<float>(&expected_output_min, {-127.0});
+  test::FillValues<float>(&expected_output_min, {-128.0});
   test::ExpectTensorEqual<float>(expected_output_min, *GetOutput(1));
 
   Tensor expected_output_max(allocator(), DT_FLOAT, TensorShape({}));
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
index de495c19cba300fbd034cda01adfd0518548ce68..acfbc02ee7ad5502539834f4a14c5cac7ce1f7b5 100644
--- a/tensorflow/core/kernels/queue_base.cc
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index 87fc94333162c4b721fa3608f282bf9d28fc792e..31e8ce944fef913fd241801f4931fcb4dfd2025c 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/queue_op.h"
 #include "tensorflow/core/kernels/typed_queue.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -36,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/regex_full_match_op.cc b/tensorflow/core/kernels/regex_full_match_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5863a2c8e464196347173bdcf9a201889271e07a
--- /dev/null
+++ b/tensorflow/core/kernels/regex_full_match_op.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "re2/re2.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class RegexFullMatchOp : public OpKernel {
+ public:
+  explicit RegexFullMatchOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    const auto& input_flat = input_tensor->flat<string>();
+
+    const Tensor* pattern_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("pattern", &pattern_tensor));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(pattern_tensor->shape()),
+                errors::InvalidArgument("Pattern must be scalar, but received ",
+                                        pattern_tensor->shape().DebugString()));
+    const string pattern = pattern_tensor->flat<string>()(0);
+    const RE2 match(pattern);
+    OP_REQUIRES(ctx, match.ok(),
+                errors::InvalidArgument("Invalid pattern: ", pattern,
+                                        ", error: ", match.error()));
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output("output", input_tensor->shape(),
+                                             &output_tensor));
+    auto output_flat = output_tensor->flat<bool>();
+    for (size_t i = 0; i < input_flat.size(); ++i) {
+      output_flat(i) = RE2::FullMatch(input_flat(i), match);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("RegexFullMatch").Device(DEVICE_CPU),
+                        RegexFullMatchOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc
index 6e46c979f33496f8da2c561683723728e28a610e..089ca8ed2796f6803b471c96ede0d68b7f0abe11 100644
--- a/tensorflow/core/kernels/relu_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc
@@ -31,8 +31,6 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
-#ifdef TF_HAS_CUDA_FP16
-
 // This kernel computes ReluGrad by processing one half2, two fp16, at a time.
 // It effectively does: backdrops = (feature > 0) ? gradient : 0
 // It also tries to use native half2 primitives as much as possible.
@@ -113,8 +111,6 @@ struct ReluGrad<Device, Eigen::half> {
                                        backprop.data(), count);
   }
 };
-
-#endif  // TF_HAS_CUDA_FP16
 }  // namespace functor
 
 // Definition of the GPU implementations declared in relu_op.cc.
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index a8bcc7f7dc2677e0fd07157915986c4663dd6556..03cc414905c66ab483549098a88e6f433a00ec9f 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -703,6 +703,8 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_SCATTER_MINMAX_CPU);
 
 REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate",
                         scatter_op::UpdateOp::ASSIGN);
+REGISTER_SCATTER_KERNEL(bool, CPU, "ResourceScatterUpdate",
+                        scatter_op::UpdateOp::ASSIGN);
 REGISTER_SCATTER_KERNEL(Variant, CPU, "ResourceScatterUpdate",
                         scatter_op::UpdateOp::ASSIGN);
 
@@ -725,6 +727,13 @@ REGISTER_KERNEL_BUILDER(Name("ResourceScatterUpdate")
                             .TypeConstraint<int32>("Tindices"),
                         ResourceScatterUpdateOp<GPUDevice, Variant, int32,
                                                 scatter_op::UpdateOp::ASSIGN>)
+REGISTER_KERNEL_BUILDER(Name("ResourceScatterUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .TypeConstraint<bool>("dtype")
+                            .TypeConstraint<int32>("Tindices"),
+                        ResourceScatterUpdateOp<GPUDevice, bool, int32,
+                                                scatter_op::UpdateOp::ASSIGN>)
 REGISTER_KERNEL_BUILDER(Name("ResourceScatterUpdate")
                             .Device(DEVICE_GPU)
                             .HostMemory("resource")
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
index f5ebf0ea2e29ea9c7fb665c69ee03a8ee40bf4b3..722116f86fd131d3e686f9fc14ce0c1d056addc7 100644
--- a/tensorflow/core/kernels/roll_op.cc
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -285,7 +285,7 @@ class RollOp : public OpKernel {
       dim_range[i] = dim_size_prod;
     }
 
-    Tensor* output = NULL;
+    Tensor* output = nullptr;
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
     auto input_flat = input.flat<T>().data();
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.cc b/tensorflow/core/kernels/scatter_functor_gpu.cu.cc
index 59911bf0d26afe57a902b1533b75b76797070c06..bdc878594a3862fd4a7b948795826bcfd56ac92b 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.cc
@@ -42,6 +42,8 @@ typedef Eigen::GpuDevice GPUDevice;
 
 DEFINE_GPU_SPECS(float);
 DEFINE_GPU_SPECS(double);
+DEFINE_GPU_SPECS_OP(bool, int32, scatter_op::UpdateOp::ASSIGN);
+DEFINE_GPU_SPECS_OP(bool, int64, scatter_op::UpdateOp::ASSIGN);
 // TODO(b/27222123): The following fails to compile due to lack of support for
 // fp16.
 // TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 0caa7bd3179a794b0c03fe0ca50729b6b3fa05bd..ff38026ac72e9951fabd36f1a35e273cb7f3db70 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -62,14 +62,57 @@ class ScatterNdOp : public OpKernel {
     const Tensor& updates = c->input(1);
     const Tensor& shape_input = c->input(2);
 
-    OP_REQUIRES(c, shape_input.dims() == 1,
-                errors::InvalidArgument("Shape must be a vector"));
+    OP_REQUIRES(c, indices.shape().dims() >= 1,
+                errors::InvalidArgument(
+                    "Indices shape must have rank at least one. Found:",
+                    indices.shape().DebugString()));
+    OP_REQUIRES(c, updates.shape().dims() >= 1,
+                errors::InvalidArgument(
+                    "Updates shape must have rank at least one. Found:",
+                    updates.shape().DebugString()));
 
     auto vec = shape_input.flat<Index>();
     TensorShape shape;
     OP_REQUIRES_OK(c,
                    TensorShapeUtils::MakeShape(vec.data(), vec.size(), &shape));
 
+    OP_REQUIRES(
+        c,
+        (shape.num_elements() > 0 || (indices.shape().num_elements() == 0 &&
+                                      updates.shape().num_elements() == 0)),
+        errors::InvalidArgument(
+            "Indices and updates specified for empty output shape"));
+
+    const int64 outer_dims = indices.shape().dims() - 1;
+
+    for (int i = 0; i < outer_dims; ++i) {
+      OP_REQUIRES(c, indices.shape().dim_size(i) == updates.shape().dim_size(i),
+                  errors::InvalidArgument(
+                      "Outer dimensions of indices and update must match. "
+                      "Indices shape: ",
+                      indices.shape().DebugString(),
+                      ", updates shape:", updates.shape().DebugString()));
+    }
+
+    const int64 ix = indices.shape().dim_size(outer_dims);
+    OP_REQUIRES(
+        c, updates.shape().dims() - outer_dims == shape.dims() - ix,
+        errors::InvalidArgument("Inner dimensions of output shape must match "
+                                "inner dimensions of updates shape. Output: ",
+                                shape.DebugString(),
+                                " updates: ", updates.shape().DebugString()));
+    for (int i = 0; i + outer_dims < updates.shape().dims(); ++i) {
+      OP_REQUIRES(
+          c, updates.shape().dim_size(i + outer_dims) == shape.dim_size(ix + i),
+          errors::InvalidArgument(
+              "The inner ", shape.dims() - ix,
+              " dimensions of output.shape=", shape.DebugString(),
+              " must match the inner ", updates.shape().dims() - outer_dims,
+              " dimensions of updates.shape=", updates.shape().DebugString()));
+    }
+    OP_REQUIRES(c, shape_input.dims() == 1,
+                errors::InvalidArgument("Shape must be a vector"));
+
     Tensor out;
     OP_REQUIRES_OK(
         c, functor::DoScatterNd<Device, T, Index, scatter_nd_op::UpdateOp::ADD>(
@@ -257,6 +300,7 @@ TF_CALL_string(REGISTER_SCATTER_ND_CPU);
   REGISTER_SCATTER_ND_UPDATE_GPU(type);   \
   REGISTER_SCATTER_ND_GPU(type);
 
+TF_CALL_int32(REGISTER_SCATTER_ND_ALL_GPU);
 // TODO(b/66916790): Support half types in ScatterNd.
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ALL_GPU);
 TF_CALL_complex64(REGISTER_SCATTER_ND_ALL_GPU);
@@ -271,6 +315,8 @@ TF_CALL_complex128(REGISTER_SCATTER_ND_ALL_GPU);
 #define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \
   REGISTER_SCATTER_ND_UPDATE(type, SYCL);
 
+TF_CALL_int32(REGISTER_SCATTER_ND_ADD_SUB_SYCL);
+TF_CALL_int32(REGISTER_SCATTER_ND_UPDATE_SYCL);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL);
 #undef REGISTER_SCATTER_ND_ADD_SUB_SYCL
@@ -541,6 +587,7 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX(T, int32); \
   DECLARE_GPU_SPECS_INDEX(T, int64)
 
+TF_CALL_int32(DECLARE_GPU_SPECS);
 // TODO(b/66916790): Support half types in ScatterNd.
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_complex64(DECLARE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
index a3c21edc15f684e51c7f1806aeeeeead679ea22e..08b657f4c38807cd99bd6f03cacf589e9d8fd22c 100644
--- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
@@ -170,6 +170,7 @@ struct ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM> {
   DECLARE_GPU_SPECS_INDEX(T, int32); \
   DECLARE_GPU_SPECS_INDEX(T, int64)
 
+TF_CALL_int32(DECLARE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_complex64(DECLARE_GPU_SPECS);
 TF_CALL_complex128(DECLARE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/scoped_allocator_ops.cc b/tensorflow/core/kernels/scoped_allocator_ops.cc
index 1800ee8c1f975b458d11bab836cfdb9b269f7152..1d2fb6996a3fcf5d2a7f2798c139c157cbf055e8 100644
--- a/tensorflow/core/kernels/scoped_allocator_ops.cc
+++ b/tensorflow/core/kernels/scoped_allocator_ops.cc
@@ -113,7 +113,7 @@ class ScopedAllocatorConcatOp : public OpKernel {
     OP_REQUIRES(context, backing_tensor.NumElements() >= shape_.num_elements(),
                 errors::InvalidArgument("Backing tensor num elements ",
                                         backing_tensor.NumElements(),
-                                        " is not equal to expected ",
+                                        " is not >= to expected ",
                                         shape_.num_elements()));
     Tensor output(dtype_);
     if (reshape_) {
diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
index 019c6619ee1e7fcb879736a3c72a446f55a08b8a..d2918d20426af351a3613dea41eb7d2802d5a672 100644
--- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc
+++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
@@ -212,8 +212,13 @@ TEST_F(ScopedAllocatorConcatOpTest, Success3) {
 }
 
 TEST_F(ScopedAllocatorConcatOpTest, Reshape) {
-  MakeOp({2, 2, 2}, DT_DOUBLE, true, "test", 120, 2);
-  ExecOp(DT_DOUBLE, 120, {{2, 2}, {2, 2}});
+  MakeOp({2, 2, 4}, DT_DOUBLE, true, "test", 120, 2);
+
+  // The elements of the third parameter to ExecOp must be multiples of
+  // Allocator::kAllocatorAlignment in size.  If they are not, the backing
+  // tensor allocated by PrepOp will have too many elements and reshaping
+  // will fail.
+  ExecOp(DT_DOUBLE, 120, {{2, 4}, {2, 4}});
 }
 
 TEST_F(ScopedAllocatorConcatOpTest, NoReshapeAttr) {
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index c87ce78e051a4552e4c1e44c7ab5767bb7c72310..2328fc6afd8e7b7c24351e612ea6b760a2d522c3 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -320,7 +320,9 @@ class SegmentSumGPUOp : public AsyncOpKernel {
   REGISTER_CPU_KERNEL_SEGMENT("SegmentSum", Eigen::internal::SumReducer<type>, \
                               type, index_type, 0);                            \
   REGISTER_CPU_KERNEL_SEGMENT(                                                 \
-      "SegmentProd", Eigen::internal::ProdReducer<type>, type, index_type, 1)
+      "SegmentMean", Eigen::internal::MeanReducer<type>, type, index_type, 0); \
+  REGISTER_CPU_KERNEL_SEGMENT(                                                 \
+      "SegmentProd", Eigen::internal::ProdReducer<type>, type, index_type, 1);
 
 #define REGISTER_REAL_CPU_KERNELS_ALL(type) \
   REGISTER_REAL_CPU_KERNELS(type, int32);   \
diff --git a/tensorflow/core/lib/db/BUILD b/tensorflow/core/lib/db/BUILD
index ce09c2009ac81b5cd2736800852a148bfefff6a9..7a64306c6e96201ec4bbaa71ac6751a9b606f4ba 100644
--- a/tensorflow/core/lib/db/BUILD
+++ b/tensorflow/core/lib/db/BUILD
@@ -29,6 +29,7 @@ cc_library(
         "@org_sqlite",
         "@snappy",
     ],
+    alwayslink = 1,
 )
 
 tf_cc_test(
diff --git a/tensorflow/core/lib/gtl/flatmap.h b/tensorflow/core/lib/gtl/flatmap.h
index 889d2ddaa6be36332a3b810c0aefef6ecb684e40..9dc439c163733cf1097dd1a07a93cfaed19c6907 100644
--- a/tensorflow/core/lib/gtl/flatmap.h
+++ b/tensorflow/core/lib/gtl/flatmap.h
@@ -76,6 +76,10 @@ class FlatMap {
 
   FlatMap(const FlatMap& src) : rep_(src.rep_) {}
 
+  // Move constructor leaves src in a valid but unspecified state (same as
+  // std::unordered_map).
+  FlatMap(FlatMap&& src) : rep_(std::move(src.rep_)) {}
+
   template <typename InputIter>
   FlatMap(InputIter first, InputIter last, size_t N = 1,
           const Hash& hf = Hash(), const Eq& eq = Eq())
@@ -92,6 +96,13 @@ class FlatMap {
     return *this;
   }
 
+  // Move-assignment operator leaves src in a valid but unspecified state (same
+  // as std::unordered_map).
+  FlatMap& operator=(FlatMap&& src) {
+    rep_.MoveFrom(std::move(src.rep_));
+    return *this;
+  }
+
   ~FlatMap() {}
 
   void swap(FlatMap& x) { rep_.swap(x.rep_); }
diff --git a/tensorflow/core/lib/gtl/flatmap_test.cc b/tensorflow/core/lib/gtl/flatmap_test.cc
index 0901eba9265a48351d108a73a620dd753f4ec92f..0fd22ab37be6bea76c6b37a753a39132851bf43a 100644
--- a/tensorflow/core/lib/gtl/flatmap_test.cc
+++ b/tensorflow/core/lib/gtl/flatmap_test.cc
@@ -656,19 +656,33 @@ TEST(FlatMap, UniqueMap) {
   }
   EXPECT_EQ(map.size(), N);
 
+  // move constructor
+  UniqMap map2(std::move(map));
+
   // Lookups
   for (int i = 0; i < N; i++) {
-    EXPECT_EQ(*map.at(MakeUniq(i)), i + 100);
+    EXPECT_EQ(*map2.at(MakeUniq(i)), i + 100);
   }
 
+  // move assignment
+  UniqMap map3;
+  map3 = std::move(map2);
+
   // find+erase
-  EXPECT_EQ(map.count(MakeUniq(2)), 1);
-  map.erase(MakeUniq(2));
-  EXPECT_EQ(map.count(MakeUniq(2)), 0);
+  EXPECT_EQ(map3.count(MakeUniq(2)), 1);
+  map3.erase(MakeUniq(2));
+  EXPECT_EQ(map3.count(MakeUniq(2)), 0);
 
   // clear
-  map.clear();
-  EXPECT_EQ(map.size(), 0);
+  map3.clear();
+  EXPECT_EQ(map3.size(), 0);
+
+  // Check that moved-from maps are in a valid (though unspecified) state.
+  EXPECT_GE(map.size(), 0);
+  EXPECT_GE(map2.size(), 0);
+  // This insert should succeed no matter what state `map` is in, because
+  // MakeUniq(-1) is never called above: This key can't possibly exist.
+  EXPECT_TRUE(map.emplace(MakeUniq(-1), MakeUniq(-1)).second);
 }
 
 TEST(FlatMap, UniqueMapIter) {
diff --git a/tensorflow/core/lib/gtl/flatrep.h b/tensorflow/core/lib/gtl/flatrep.h
index 0d7e7487fc33353603bf3c4d56d8d04466e326a1..65a076b0f39d5e7064995ed6c92152d155f32728 100644
--- a/tensorflow/core/lib/gtl/flatrep.h
+++ b/tensorflow/core/lib/gtl/flatrep.h
@@ -51,10 +51,23 @@ class FlatRep {
   FlatRep(size_t N, const Hash& hf, const Eq& eq) : hash_(hf), equal_(eq) {
     Init(N);
   }
-  explicit FlatRep(const FlatRep& src) : hash_(src.hash_), equal_(src.equal_) {
+  FlatRep(const FlatRep& src) : hash_(src.hash_), equal_(src.equal_) {
     Init(src.size());
     CopyEntries(src.array_, src.end_, CopyEntry());
   }
+
+  FlatRep(FlatRep&& src)
+      // Copy rather than move src.hash_ and src.equal_.  This is necessary to
+      // leave src in a valid state -- otherwise e.g. if hash_ is an
+      // std::function, moving it would null it out.
+      : hash_(src.hash_), equal_(src.equal_) {
+    // TODO(jlebar): Init(1) still allocates some memory, so this isn't as cheap
+    // as it could be.  The fundamental problem is that we need to leave src in
+    // a valid state, and FlatRep *always* owns a nonzero amount of memory.
+    Init(1);
+    swap(src);
+  }
+
   ~FlatRep() {
     clear_no_resize();
     delete[] array_;
@@ -78,6 +91,12 @@ class FlatRep {
     }
   }
 
+  void MoveFrom(FlatRep&& src) {
+    if (this != &src) {
+      swap(src);
+    }
+  }
+
   void clear_no_resize() {
     for (Bucket* b = array_; b != end_; b++) {
       for (uint32 i = 0; i < kWidth; i++) {
diff --git a/tensorflow/core/lib/gtl/flatset.h b/tensorflow/core/lib/gtl/flatset.h
index f31e3abe4115887ed1f2ed3bec52c73b2622715c..bb4356e46de1940a186c8c6b741dc716d7cc41b9 100644
--- a/tensorflow/core/lib/gtl/flatset.h
+++ b/tensorflow/core/lib/gtl/flatset.h
@@ -59,6 +59,10 @@ class FlatSet {
 
   FlatSet(const FlatSet& src) : rep_(src.rep_) {}
 
+  // Move constructor leaves src in a valid but unspecified state (same as
+  // std::unordered_set).
+  FlatSet(FlatSet&& src) : rep_(std::move(src.rep_)) {}
+
   template <typename InputIter>
   FlatSet(InputIter first, InputIter last, size_t N = 1,
           const Hash& hf = Hash(), const Eq& eq = Eq())
@@ -75,6 +79,13 @@ class FlatSet {
     return *this;
   }
 
+  // Move-assignment operator leaves src in a valid but unspecified state (same
+  // as std::unordered_set).
+  FlatSet& operator=(FlatSet&& src) {
+    rep_.MoveFrom(std::move(src.rep_));
+    return *this;
+  }
+
   ~FlatSet() {}
 
   void swap(FlatSet& x) { rep_.swap(x.rep_); }
@@ -169,6 +180,7 @@ class FlatSet {
   }
 
   std::pair<iterator, bool> insert(const Key& k) { return Insert(k); }
+  std::pair<iterator, bool> insert(Key&& k) { return Insert(std::move(k)); }
   template <typename InputIter>
   void insert(InputIter first, InputIter last) {
     for (; first != last; ++first) {
@@ -265,9 +277,10 @@ class FlatSet {
     }
   };
 
-  std::pair<iterator, bool> Insert(const Key& k) {
+  template <typename K>
+  std::pair<iterator, bool> Insert(K&& k) {
     rep_.MaybeResize();
-    auto r = rep_.FindOrInsert(k);
+    auto r = rep_.FindOrInsert(std::forward<K>(k));
     const bool inserted = !r.found;
     return {iterator(r.b, rep_.limit(), r.index), inserted};
   }
diff --git a/tensorflow/core/lib/gtl/flatset_test.cc b/tensorflow/core/lib/gtl/flatset_test.cc
index 010b4bb5df3337ad814caa3a8767796074be1d18..7f0138404f131e4e625d90f6df7870b0dc4f29ad 100644
--- a/tensorflow/core/lib/gtl/flatset_test.cc
+++ b/tensorflow/core/lib/gtl/flatset_test.cc
@@ -552,18 +552,32 @@ TEST(FlatSet, UniqueSet) {
   }
   EXPECT_EQ(set.size(), N);
 
+  // Move constructor
+  UniqSet set2(std::move(set));
+
   // Lookups
   for (int i = 0; i < N; i++) {
-    EXPECT_EQ(set.count(MakeUniq(i)), 1);
+    EXPECT_EQ(set2.count(MakeUniq(i)), 1);
   }
 
+  // Move-assignment operator
+  UniqSet set3;
+  set3 = std::move(set2);
+
   // erase
-  set.erase(MakeUniq(2));
-  EXPECT_EQ(set.count(MakeUniq(2)), 0);
+  set3.erase(MakeUniq(2));
+  EXPECT_EQ(set3.count(MakeUniq(2)), 0);
 
   // clear
   set.clear();
   EXPECT_EQ(set.size(), 0);
+
+  // Check that moved-from sets are in a valid (though unspecified) state.
+  EXPECT_GE(set.size(), 0);
+  EXPECT_GE(set2.size(), 0);
+  // This insert should succeed no matter what state `set` is in, because
+  // MakeUniq(-1) is never called above: This key can't possibly exist.
+  EXPECT_TRUE(set.emplace(MakeUniq(-1)).second);
 }
 
 TEST(FlatSet, UniqueSetIter) {
@@ -579,6 +593,12 @@ TEST(FlatSet, UniqueSetIter) {
   EXPECT_EQ(sum, (kCount * (kCount + 1)) / 2);
 }
 
+TEST(FlatSet, InsertUncopyable) {
+  UniqSet set;
+  EXPECT_TRUE(set.insert(MakeUniq(0)).second);
+  EXPECT_EQ(set.size(), 1);
+}
+
 /* This would be a good negative compilation test, if we could do that.
 
 TEST(FlatSet, MutableIterator_ShouldNotCompile) {
diff --git a/tensorflow/core/lib/hash/hash.h b/tensorflow/core/lib/hash/hash.h
index 3f85303c0f6573508f4b6d407b037eab40243262..737d23f6994fe2600a1be450eb073e35fd99a6fb 100644
--- a/tensorflow/core/lib/hash/hash.h
+++ b/tensorflow/core/lib/hash/hash.h
@@ -44,6 +44,12 @@ inline uint64 Hash64Combine(uint64 a, uint64 b) {
   return a ^ (b + 0x9e3779b97f4a7800ULL + (a << 10) + (a >> 4));
 }
 
+// Combine two hashes in an order-independent way. This operation should be
+// associative and compute the same hash for a collection of elements
+// independent of traversal order. Note that it is better to combine hashes
+// symmetrically with addition rather than XOR, since (x^x) == 0 but (x+x) != 0.
+inline uint64 Hash64CombineUnordered(uint64 a, uint64 b) { return a + b; }
+
 // Hash functor suitable for use with power-of-two sized hashtables.  Use
 // instead of std::hash<T>.
 //
diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc
index 6e4d100b04fba22c170a654c9314e3a7e26fadda..6e589c8d1c5fea61cf993359003521a9feac9f1e 100644
--- a/tensorflow/core/ops/candidate_sampling_ops.cc
+++ b/tensorflow/core/ops/candidate_sampling_ops.cc
@@ -145,12 +145,15 @@ REGISTER_OP("ComputeAccidentalHits")
       int64 num_true;
       TF_RETURN_IF_ERROR(c->GetAttr("num_true", &num_true));
 
-      // Validate true_classes.
+      // Validate true_classes, must be a matrix.
       ShapeHandle true_classes;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &true_classes));
       DimensionHandle unused;
       TF_RETURN_IF_ERROR(
           c->WithValue(c->Dim(true_classes, 1), num_true, &unused));
+      // Validate sampled_candidates, must be a vector.
+      ShapeHandle sampled_candidates;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sampled_candidates));
 
       // All three outputs are the same shape.
       ShapeHandle v = c->Vector(InferenceContext::kUnknownDim);
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 3db00d8180c242d5656a367be0ed238f3cb1a6a3..c86767448923cc68a45e579199ce33a8a20e3d34 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -14641,6 +14641,66 @@ op {
     }
   }
 }
+op {
+  name: "CropAndResize"
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "crop_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "crops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+        s: "nearest"
+      }
+    }
+  }
+  attr {
+    name: "extrapolation_value"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+}
 op {
   name: "CropAndResizeGradBoxes"
   input_arg {
@@ -14790,6 +14850,53 @@ op {
     }
   }
 }
+op {
+  name: "CropAndResizeGradImage"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "image_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+        s: "nearest"
+      }
+    }
+  }
+}
 op {
   name: "Cross"
   input_arg {
@@ -28129,6 +28236,54 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "MapAndBatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "MapClear"
   attr {
@@ -34408,6 +34563,33 @@ op {
     type: DT_INT32
   }
 }
+op {
+  name: "NonMaxSuppressionV3"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+}
 op {
   name: "NotEqual"
   input_arg {
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 73174c184c631fb8f39165453245229b04ba6be4..29d9cfbde94ea8b59de9b8e7b6ca0f726b714dd7 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -206,7 +206,40 @@ REGISTER_OP("MapAndBatchDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Use index from the end to retrieve the Input shapes,
+      // so that to avoid guessing the length of "other_arguments".
+      // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars.
+      shape_inference::ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
+
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("MapAndBatchDatasetV2")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Input("batch_size: int64")
+    .Input("num_parallel_calls: int64")
+    .Input("drop_remainder: bool")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Use index from the end to retrieve the Input shapes,
+      // so that to avoid guessing the length of "other_arguments".
+      // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars.
+      shape_inference::ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
+
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("PrefetchDataset")
     .Input("input_dataset: variant")
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index c3b08e067a2c35432e45f98ef9d57af629b90e02..d949e70c661467edbadae9fdb6f74d7e4cc3de02 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -435,6 +435,25 @@ REGISTER_OP("DrawBoundingBoxes")
     .Output("output: T")
     .Attr("T: {float, half} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
+      // The rank of images should be 4.
+      ShapeHandle images;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &images));
+      // Channel depth should be either 1 (GRY), 3 (RGB), or 4 (RGBA).
+      if (c->ValueKnown(c->Dim(images, 3))) {
+        int64 depth = c->Value(c->Dim(images, 3));
+        if (!(depth == 1 || depth == 3 || depth == 4)) {
+          return errors::InvalidArgument("Channel depth should be either 1 (GRY), "
+                                         "3 (RGB), or 4 (RGBA)");
+        }
+      }
+
+      // The rank of boxes is 3: [batch, num_bounding_boxes, 4].
+      ShapeHandle boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &boxes));
+      // The last value of boxes shape is 4.
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 2), 4, &unused));
+
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
     });
 
@@ -548,7 +567,7 @@ REGISTER_OP("CropAndResize")
     .Input("crop_size: int32")
     .Output("crops: float")
     .Attr("T: {uint8, uint16, int8, int16, int32, int64, half, float, double}")
-    .Attr("method: {'bilinear'} = 'bilinear'")
+    .Attr("method: {'bilinear', 'nearest'} = 'bilinear'")
     .Attr("extrapolation_value: float = 0")
     .SetShapeFn([](InferenceContext* c) {
       // Get inputs and validate ranks.
@@ -579,7 +598,7 @@ REGISTER_OP("CropAndResizeGradImage")
     .Input("image_size: int32")
     .Output("output: T")
     .Attr("T: {float, half, double}")
-    .Attr("method: {'bilinear'} = 'bilinear'")
+    .Attr("method: {'bilinear', 'nearest'} = 'bilinear'")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(3, &out));
@@ -657,4 +676,35 @@ REGISTER_OP("NonMaxSuppressionV2")
       return Status::OK();
     });
 
+REGISTER_OP("NonMaxSuppressionV3")
+    .Input("boxes: float")
+    .Input("scores: float")
+    .Input("max_output_size: int32")
+    .Input("iou_threshold: float")
+    .Input("score_threshold: float")
+    .Output("selected_indices: int32")
+    .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+      ShapeHandle scores;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+      ShapeHandle max_output_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+      ShapeHandle iou_threshold;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
+      ShapeHandle score_threshold;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &score_threshold));
+      // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
+      DimensionHandle unused;
+      // The boxes[0] and scores[0] are both num_boxes.
+      TF_RETURN_IF_ERROR(
+          c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
+      // The boxes[1] is 4.
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
+      c->set_output(0, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    });
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/image_ops_test.cc b/tensorflow/core/ops/image_ops_test.cc
index 5f0b391b0d14a94e687b2ebe26d4aac8d459b0df..517af26b44f53d85979e1194d1f0d6d8814cb1e8 100644
--- a/tensorflow/core/ops/image_ops_test.cc
+++ b/tensorflow/core/ops/image_ops_test.cc
@@ -312,4 +312,23 @@ TEST(ImageOpsTest, QuantizedResizeBilinear_ShapeFn) {
   INFER_OK(op, "[1,?,3,?];[2];[];[]", "[d0_0,20,30,d0_3];[];[]");
 }
 
+TEST(ImageOpsTest, DrawBoundingBoxes_ShapeFn) {
+  ShapeInferenceTestOp op("DrawBoundingBoxes");
+  op.input_tensors.resize(2);
+
+  // Check images.
+  INFER_ERROR("must be rank 4", op, "[1,?,3];?");
+  INFER_ERROR("should be either 1 (GRY), 3 (RGB), or 4 (RGBA)",
+      op, "[1,?,?,5];?");
+
+  // Check boxes.
+  INFER_ERROR("must be rank 3", op, "[1,?,?,4];[1,4]");
+  INFER_ERROR("Dimension must be 4", op, "[1,?,?,4];[1,2,2]");
+
+  // OK shapes.
+  INFER_OK(op, "[4,?,?,4];?", "in0");
+  INFER_OK(op, "[?,?,?,?];[?,?,?]", "in0");
+  INFER_OK(op, "[4,?,?,4];[?,?,?]", "in0");
+  INFER_OK(op, "[4,?,?,4];[?,?,4]", "in0");
+}
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 8f8443a46cfa68e9879825d36b305b4f7774bd66..8c0b073ce4646875921c7cfbaa47f44e4c67ec19 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1017,7 +1017,7 @@ REGISTER_OP("SegmentMean")
     .Input("data: T")
     .Input("segment_ids: Tindices")
     .Output("output: T")
-    .Attr("T: realnumbertype")
+    .Attr("T: numbertype")
     .Attr("Tindices: {int32,int64}")
     .SetShapeFn(SegmentReductionShapeFn);
 
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index bb46dafd424fe6a4bc1cf27a80b0adc94f573926..41efa49ce3d5e27b392deb8eb90fcc2cad4811fe 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -547,7 +547,7 @@ REGISTER_OP("Conv3DBackpropFilter")
     });
 
 REGISTER_OP("Conv3DBackpropInputV2")
-    .Input("input_sizes: int32")
+    .Input("input_sizes: Tshape")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
@@ -556,6 +556,7 @@ REGISTER_OP("Conv3DBackpropInputV2")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
+    .Attr("Tshape: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -1452,6 +1453,7 @@ REGISTER_OP("QuantizedReluX")
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 7156440b46ea082e8aec78adffbd888014ee78a0..e45125a1e8768e7136480bba13ac7775507da9bc 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -6242,6 +6242,7 @@ op {
     allowed_values {
       list {
         s: "bilinear"
+        s: "nearest"
       }
     }
   }
@@ -6347,6 +6348,7 @@ op {
     allowed_values {
       list {
         s: "bilinear"
+        s: "nearest"
       }
     }
   }
@@ -13939,6 +13941,54 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "MapAndBatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "MapClear"
   attr {
@@ -16519,6 +16569,33 @@ op {
     type: DT_INT32
   }
 }
+op {
+  name: "NonMaxSuppressionV3"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+}
 op {
   name: "NotEqual"
   input_arg {
diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index 416ce9c0d82ca0bfba730d3d7f4513260876e9ad..80ffae579655d51d6930200698a17055750edce1 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -72,7 +72,15 @@ REGISTER_OP("ParameterizedTruncatedNormal")
     .Attr("seed2: int = 0")
     .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape);
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      // Parameters must be 0-d or 1-d.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(3), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(4), 1, &unused));
+      return shape_inference::RandomShape(c);
+    });
 
 REGISTER_OP("TruncatedNormal")
     .Input("shape: T")
diff --git a/tensorflow/core/ops/rpc_ops.cc b/tensorflow/core/ops/rpc_ops.cc
index 72fda5e6eba3fd4cf3e53a26e0b4d9f5d6b19100..136f96d9ea764e7272940d0f2ae2fd316bf90780 100644
--- a/tensorflow/core/ops/rpc_ops.cc
+++ b/tensorflow/core/ops/rpc_ops.cc
@@ -18,7 +18,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-using tensorflow::shape_inference::DimensionHandle;
 using tensorflow::shape_inference::InferenceContext;
 using tensorflow::shape_inference::ShapeHandle;
 
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 469f193cf417758a9b21b72c09c899493d54299a..1d5c743a56cec3d9888e0c19c0a6728d1a4fe682 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -37,6 +37,17 @@ REGISTER_OP("RegexReplace")
       return Status::OK();
     });
 
+REGISTER_OP("RegexFullMatch")
+    .Input("input: string")
+    .Input("pattern: string")
+    .Output("output: bool")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      c->set_output(0, c->input(0));
+      return Status::OK();
+    });
+
 REGISTER_OP("StringToHashBucketFast")
     .Input("input: string")
     .Output("output: int64")
diff --git a/tensorflow/core/platform/cloud/file_block_cache.h b/tensorflow/core/platform/cloud/file_block_cache.h
index da167882470bfa3d833faeb7f031fdd7064aba35..c98b10640fa0b7ad7d1ee310b6fb4657cbc9e779 100644
--- a/tensorflow/core/platform/cloud/file_block_cache.h
+++ b/tensorflow/core/platform/cloud/file_block_cache.h
@@ -67,6 +67,13 @@ class FileBlockCache {
   virtual Status Read(const string& filename, size_t offset, size_t n,
                       char* buffer, size_t* bytes_transferred) = 0;
 
+  // Validate the given file signature with the existing file signature in the
+  // cache. Returns true if the signature doesn't change or the file did not
+  // exist before. If the signature changes, update the existing signature with
+  // the new one and remove the file from cache.
+  virtual bool ValidateAndUpdateFileSignature(const string& filename,
+                                              int64 file_signature) = 0;
+
   /// Remove all cached blocks for `filename`.
   virtual void RemoveFile(const string& filename) = 0;
 
@@ -80,6 +87,10 @@ class FileBlockCache {
 
   /// The current size (in bytes) of the cache.
   virtual size_t CacheSize() const = 0;
+
+  // Returns true if the cache is enabled. If false, the BlockFetcher callback
+  // is always executed during Read.
+  virtual bool IsCacheEnabled() const = 0;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 08697a45d8fe5c756c1183bd389b2cf52f8a4b9a..e91c3b1b0c15eca5ac21fa9c55eff924490fd0a0 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -80,7 +80,7 @@ constexpr uint64 kDefaultMaxStaleness = 0;
 // The environment variable that overrides the maximum age of entries in the
 // Stat cache. A value of 0 (the default) means nothing is cached.
 constexpr char kStatCacheMaxAge[] = "GCS_STAT_CACHE_MAX_AGE";
-constexpr uint64 kStatCacheDefaultMaxAge = 0;
+constexpr uint64 kStatCacheDefaultMaxAge = 5;
 // The environment variable that overrides the maximum number of entries in the
 // Stat cache.
 constexpr char kStatCacheMaxEntries[] = "GCS_STAT_CACHE_MAX_ENTRIES";
@@ -290,25 +290,34 @@ Status GetBoolValue(const Json::Value& parent, const char* name, bool* result) {
 /// A GCS-based implementation of a random access file with an LRU block cache.
 class GcsRandomAccessFile : public RandomAccessFile {
  public:
-  GcsRandomAccessFile(const string& filename, FileBlockCache* file_block_cache)
-      : filename_(filename), file_block_cache_(file_block_cache) {}
+  using SignatureGenFun =
+      std::function<Status(const string& filename, int64* file_signature)>;
+
+  GcsRandomAccessFile(const string& filename, FileBlockCache* file_block_cache,
+                      const SignatureGenFun& signature_gen_fun)
+      : filename_(filename),
+        file_block_cache_(file_block_cache),
+        signature_gen_fun_(signature_gen_fun) {}
 
   /// The implementation of reads with an LRU block cache. Thread safe.
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
+    if (file_block_cache_->IsCacheEnabled()) {
+      int64 signature;
+      TF_RETURN_IF_ERROR(signature_gen_fun_(filename_, &signature));
+      if (!file_block_cache_->ValidateAndUpdateFileSignature(filename_,
+                                                             signature)) {
+        VLOG(1) << "File " << filename_
+                << " signature has been changed. Refreshing the cache.";
+      }
+    }
+
     *result = StringPiece();
     size_t bytes_transferred;
     TF_RETURN_IF_ERROR(file_block_cache_->Read(filename_, offset, n, scratch,
                                                &bytes_transferred));
     *result = StringPiece(scratch, bytes_transferred);
-    string checkpoint_ending = "/checkpoint";
-    // Check if the file is the checkpoint file as we should not be caching
-    // that. As it's contents are updated and used for iterating checkpoints.
-    if (std::equal(checkpoint_ending.rbegin(), checkpoint_ending.rend(),
-                   filename_.rbegin())) {
-      // Remove the checkpoint file from the cache
-      file_block_cache_->RemoveFile(filename_);
-    }
+
     if (bytes_transferred < n) {
       // This is not an error per se. The RandomAccessFile interface expects
       // that Read returns OutOfRange if fewer bytes were read than requested.
@@ -324,6 +333,8 @@ class GcsRandomAccessFile : public RandomAccessFile {
   const string filename_;
   /// The LRU block cache for this file.
   mutable FileBlockCache* file_block_cache_;  // not owned
+
+  const SignatureGenFun signature_gen_fun_;
 };
 
 /// \brief GCS-based implementation of a writeable file.
@@ -664,8 +675,8 @@ GcsFileSystem::GcsFileSystem()
   if (GetEnvVar(kStatCacheMaxEntries, strings::safe_strtou64, &value)) {
     stat_cache_max_entries = value;
   }
-  stat_cache_.reset(new ExpiringLRUCache<FileStatistics>(
-      stat_cache_max_age, stat_cache_max_entries));
+  stat_cache_.reset(new ExpiringLRUCache<GcsFileStat>(stat_cache_max_age,
+                                                      stat_cache_max_entries));
   // Apply overrides for the matching paths cache max age and max entries, if
   // provided.
   uint64 matching_paths_cache_max_age = kMatchingPathsCacheDefaultMaxAge;
@@ -786,7 +797,18 @@ Status GcsFileSystem::NewRandomAccessFile(
     const string& fname, std::unique_ptr<RandomAccessFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
-  result->reset(new GcsRandomAccessFile(fname, file_block_cache_.get()));
+  result->reset(new GcsRandomAccessFile(
+      fname, file_block_cache_.get(),
+      [this, bucket, object](const string& fname, int64* signature) {
+        GcsFileStat stat;
+        TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(
+            fname, &stat,
+            [this, bucket, object](const string& fname, GcsFileStat* stat) {
+              return UncachedStatForObject(fname, bucket, object, stat);
+            }));
+        *signature = stat.generation_number;
+        return Status::OK();
+      }));
   return Status::OK();
 }
 
@@ -842,9 +864,9 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
 
   if (bytes_read < n) {
     // Check stat cache to see if we encountered an interrupted read.
-    FileStatistics stat;
+    GcsFileStat stat;
     if (stat_cache_->Lookup(filename, &stat)) {
-      if (offset + bytes_read < stat.length) {
+      if (offset + bytes_read < stat.base.length) {
         return errors::Internal(strings::Printf(
             "File contents are inconsistent for file: %s @ %lu.",
             filename.c_str(), offset));
@@ -957,7 +979,7 @@ Status GcsFileSystem::ObjectExists(const string& fname, const string& bucket,
   if (!result) {
     return errors::Internal("'result' cannot be nullptr.");
   }
-  FileStatistics not_used_stat;
+  GcsFileStat not_used_stat;
   const Status status = StatForObject(fname, bucket, object, &not_used_stat);
   switch (status.code()) {
     case errors::Code::OK:
@@ -971,57 +993,70 @@ Status GcsFileSystem::ObjectExists(const string& fname, const string& bucket,
   }
 }
 
-Status GcsFileSystem::StatForObject(const string& fname, const string& bucket,
-                                    const string& object,
-                                    FileStatistics* stat) {
-  if (!stat) {
-    return errors::Internal("'stat' cannot be nullptr.");
-  }
-  if (object.empty()) {
-    return errors::InvalidArgument(strings::Printf(
-        "'object' must be a non-empty string. (File: %s)", fname.c_str()));
+Status GcsFileSystem::UncachedStatForObject(const string& fname,
+                                            const string& bucket,
+                                            const string& object,
+                                            GcsFileStat* stat) {
+  std::vector<char> output_buffer;
+  std::unique_ptr<HttpRequest> request;
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request),
+                                  " when reading metadata of gs://", bucket,
+                                  "/", object);
+
+  request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/",
+                                  request->EscapeString(object),
+                                  "?fields=size%2Cgeneration%2Cupdated"));
+  request->SetResultBuffer(&output_buffer);
+  request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.metadata);
+
+  if (stats_ != nullptr) {
+    stats_->RecordStatObjectRequest();
   }
 
-  StatCache::ComputeFunc compute_func = [this, &bucket, &object](
-                                            const string& fname,
-                                            FileStatistics* stat) {
-    std::vector<char> output_buffer;
-    std::unique_ptr<HttpRequest> request;
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request),
-                                    " when reading metadata of gs://", bucket,
-                                    "/", object);
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      request->Send(), " when reading metadata of gs://", bucket, "/", object);
 
-    request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/",
-                                    request->EscapeString(object),
-                                    "?fields=size%2Cupdated"));
-    request->SetResultBuffer(&output_buffer);
-    request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.metadata);
+  Json::Value root;
+  TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root));
 
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(),
-                                    " when reading metadata of gs://", bucket,
-                                    "/", object);
+  // Parse file size.
+  TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &stat->base.length));
 
-    Json::Value root;
-    TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root));
+  // Parse generation number.
+  TF_RETURN_IF_ERROR(
+      GetInt64Value(root, "generation", &stat->generation_number));
 
-    // Parse file size.
-    TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &stat->length));
+  // Parse file modification time.
+  string updated;
+  TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
+  TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->base.mtime_nsec)));
 
-    // Parse file modification time.
-    string updated;
-    TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
-    TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
+  VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- "
+          << " length: " << stat->base.length
+          << " generation: " << stat->generation_number
+          << "; mtime_nsec: " << stat->base.mtime_nsec
+          << "; updated: " << updated;
 
-    VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- "
-            << " length: " << stat->length
-            << "; mtime_nsec: " << stat->mtime_nsec << "; updated: " << updated;
+  stat->base.is_directory = false;
+  return Status::OK();
+}
 
-    stat->is_directory = false;
-    return Status::OK();
-  };
+Status GcsFileSystem::StatForObject(const string& fname, const string& bucket,
+                                    const string& object, GcsFileStat* stat) {
+  if (!stat) {
+    return errors::Internal("'stat' cannot be nullptr.");
+  }
+  if (object.empty()) {
+    return errors::InvalidArgument(strings::Printf(
+        "'object' must be a non-empty string. (File: %s)", fname.c_str()));
+  }
 
-  TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(fname, stat, compute_func));
-  if (stat->is_directory) {
+  TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(
+      fname, stat,
+      [this, &bucket, &object](const string& fname, GcsFileStat* stat) {
+        return UncachedStatForObject(fname, bucket, object, stat);
+      }));
+  if (stat->base.is_directory) {
     return errors::NotFound(fname, " is a directory.");
   } else {
     return Status::OK();
@@ -1055,22 +1090,22 @@ Status GcsFileSystem::FolderExists(const string& dirname, bool* result) {
     return errors::Internal("'result' cannot be nullptr.");
   }
   StatCache::ComputeFunc compute_func = [this](const string& dirname,
-                                               FileStatistics* stat) {
+                                               GcsFileStat* stat) {
     std::vector<string> children;
     TF_RETURN_IF_ERROR(
         GetChildrenBounded(dirname, 1, &children, true /* recursively */,
                            true /* include_self_directory_marker */));
     if (!children.empty()) {
-      *stat = DIRECTORY_STAT;
+      stat->base = DIRECTORY_STAT;
       return Status::OK();
     } else {
       return errors::InvalidArgument("Not a directory!");
     }
   };
-  FileStatistics stat;
+  GcsFileStat stat;
   Status s = stat_cache_->LookupOrCompute(dirname, &stat, compute_func);
   if (s.ok()) {
-    *result = stat.is_directory;
+    *result = stat.base.is_directory;
     return Status::OK();
   }
   if (errors::IsInvalidArgument(s)) {
@@ -1254,8 +1289,10 @@ Status GcsFileSystem::Stat(const string& fname, FileStatistics* stat) {
     return errors::NotFound("The specified bucket ", fname, " was not found.");
   }
 
-  const Status status = StatForObject(fname, bucket, object, stat);
+  GcsFileStat gcs_stat;
+  const Status status = StatForObject(fname, bucket, object, &gcs_stat);
   if (status.ok()) {
+    *stat = gcs_stat.base;
     return Status::OK();
   }
   if (status.code() != errors::Code::NOT_FOUND) {
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index 6250aa75948d22c71ba625300e51fac1ca2d9914..d543db15775121bb59577ebb03d13f7ba9514e0f 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PLATFORM_GCS_FILE_SYSTEM_H_
-#define TENSORFLOW_CORE_PLATFORM_GCS_FILE_SYSTEM_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_FILE_SYSTEM_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_FILE_SYSTEM_H_
 
 #include <string>
 #include <utility>
@@ -56,6 +56,10 @@ class GcsStatsInterface {
   virtual void RecordBlockRetrieved(const string& file, size_t offset,
                                     size_t bytes_transferred) = 0;
 
+  // RecordStatObjectRequest is called once a statting object request over GCS
+  // is about to be made.
+  virtual void RecordStatObjectRequest() = 0;
+
   /// HttpStats is called to optionally provide a RequestStats listener
   /// to be annotated on every HTTP request made to the GCS API.
   ///
@@ -187,6 +191,12 @@ class GcsFileSystem : public FileSystem {
   Status CreateHttpRequest(std::unique_ptr<HttpRequest>* request);
 
  private:
+  // GCS file statistics.
+  struct GcsFileStat {
+    FileStatistics base;
+    int64 generation_number = 0;
+  };
+
   /// \brief Checks if the bucket exists. Returns OK if the check succeeded.
   ///
   /// 'result' is set if the function returns OK. 'result' cannot be nullptr.
@@ -214,9 +224,15 @@ class GcsFileSystem : public FileSystem {
   Status GetChildrenBounded(const string& dir, uint64 max_results,
                             std::vector<string>* result, bool recursively,
                             bool include_self_directory_marker);
-  /// Retrieves file statistics assuming fname points to a GCS object.
+
+  /// Retrieves file statistics assuming fname points to a GCS object. The data
+  /// may be read from cache or from GCS directly.
   Status StatForObject(const string& fname, const string& bucket,
-                       const string& object, FileStatistics* stat);
+                       const string& object, GcsFileStat* stat);
+  /// Retrieves file statistics of file fname directly from GCS.
+  Status UncachedStatForObject(const string& fname, const string& bucket,
+                               const string& object, GcsFileStat* stat);
+
   Status RenameObject(const string& src, const string& target);
 
   std::unique_ptr<FileBlockCache> MakeFileBlockCache(size_t block_size,
@@ -236,7 +252,7 @@ class GcsFileSystem : public FileSystem {
   std::unique_ptr<GcsDnsCache> dns_cache_;
   GcsThrottle throttle_;
 
-  using StatCache = ExpiringLRUCache<FileStatistics>;
+  using StatCache = ExpiringLRUCache<GcsFileStat>;
   std::unique_ptr<StatCache> stat_cache_;
 
   using MatchingPathsCache = ExpiringLRUCache<std::vector<string>>;
@@ -264,4 +280,4 @@ class RetryingGcsFileSystem : public RetryingFileSystem<GcsFileSystem> {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PLATFORM_GCS_FILE_SYSTEM_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_FILE_SYSTEM_H_
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 28be13869b6947b7e26863e1de4e0103c1de9f71..bb4ace65a9ed64ce4d8415e402204fc5f5731b61 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -74,7 +74,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache) {
   EXPECT_EQ("6789", result);
 }
 
-TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache_differentN) {
+TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache_DifferentN) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
@@ -123,6 +123,13 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
   // "0123456789abcde".
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "random_access.txt?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"15\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
            "Range: 0-8\n"
@@ -145,7 +152,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
       std::unique_ptr<HttpRequest::Factory>(
           new FakeHttpRequestFactory(&requests)),
       9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, 0 /* initial retry delay */,
       kTestTimeoutConfig, nullptr /* gcs additional header */);
@@ -198,64 +205,30 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
   EXPECT_EQ("0123", result);
 }
 
-TEST(GcsFileSystemTest, NewRandomAccessFile_CheckpointFile_WithBlockCache) {
-  // Our underlying file in this test changes as new data comes in
-  std::vector<HttpRequest*> requests(
-      {new FakeHttpRequest(
-           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
-           "Auth Token: fake_token\n"
-           "Range: 0-8\n"
-           "Timeouts: 5 1 20\n",
-           "012345678"),
-       new FakeHttpRequest(
-           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
-           "Auth Token: fake_token\n"
-           "Range: 0-8\n"
-           "Timeouts: 5 1 20\n",
-           "abcdefghi")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
-      kTestTimeoutConfig, nullptr /* gcs additional header */);
-
-  char scratch[100];
-  StringPiece result;
-  {
-    // We are instantiating this in an enclosed scope to make sure after the
-    // unique ptr goes out of scope, we can still access result.
-    std::unique_ptr<RandomAccessFile> file;
-    TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/checkpoint", &file));
-
-    // Read the first chunk. The cache will be populated with the first block of
-    // 9 bytes.
-    scratch[5] = 'x';
-    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
-    EXPECT_EQ("0123", result);
-    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
-
-    // The second chunk should not be in cache so we make a new request
-    // As the checkpoint file should not be cached
-    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
-    EXPECT_EQ("abcd", result);
-    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
-  }
-}
-
 TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_Flush) {
   // Our underlying file in this test is a 15 byte file with contents
   // "0123456789abcde".
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "random_access.txt?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"15\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
            "Range: 0-8\n"
            "Timeouts: 5 1 20\n",
            "012345678"),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "random_access.txt?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"15\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
@@ -267,7 +240,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_Flush) {
       std::unique_ptr<HttpRequest::Factory>(
           new FakeHttpRequestFactory(&requests)),
       9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, 0 /* initial retry delay */,
       kTestTimeoutConfig, nullptr /* gcs additional header */);
@@ -293,7 +266,14 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_MaxStaleness) {
   // Our underlying file in this test is a 16 byte file with contents
   // "0123456789abcdef".
   std::vector<HttpRequest*> requests(
-      {new FakeHttpRequest("Uri: https://storage.googleapis.com/bucket/object\n"
+      {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "object?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"16\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest("Uri: https://storage.googleapis.com/bucket/object\n"
                            "Auth Token: fake_token\n"
                            "Range: 0-7\n"
                            "Timeouts: 5 1 20\n",
@@ -308,7 +288,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_MaxStaleness) {
       std::unique_ptr<HttpRequest::Factory>(
           new FakeHttpRequestFactory(&requests)),
       8 /* block size */, 16 /* max bytes */, 3600 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, 0 /* initial retry delay */,
       kTestTimeoutConfig, nullptr /* gcs additional header */);
@@ -343,6 +323,60 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_MaxStaleness) {
   }
 }
 
+TEST(GcsFileSystemTest,
+     NewRandomAccessFile_WithBlockCache_FileSignatureChanges) {
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "random_access.txt?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"5\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "01234"),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "random_access.txt?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"5\",\"generation\": \"2\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "43210")});
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
+      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
+
+  std::unique_ptr<RandomAccessFile> file;
+  TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
+
+  char scratch[5];
+  StringPiece result;
+
+  // First read.
+  TF_EXPECT_OK(file->Read(0, sizeof(scratch), &result, scratch));
+  EXPECT_EQ("01234", result);
+
+  // Second read. File signatures are different.
+  TF_EXPECT_OK(file->Read(0, sizeof(scratch), &result, scratch));
+  EXPECT_EQ("43210", result);
+}
+
 TEST(GcsFileSystemTest, NewRandomAccessFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
   GcsFileSystem fs(
@@ -364,10 +398,10 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_InconsistentRead) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "random_access.txt?fields=size%2Cupdated\n"
+           "random_access.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"6\","
+           strings::StrCat("{\"size\": \"6\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
@@ -404,6 +438,13 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_InconsistentRead) {
 TEST(GcsFileSystemTest, NewWritableFile) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fwriteable?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"16\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
            "Range: 0-7\n"
@@ -423,21 +464,28 @@ TEST(GcsFileSystemTest, NewWritableFile) {
                            "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            ""),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fwriteable?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"33\",\"generation\": \"2\","
+                           "\"updated\": \"2016-04-29T23:15:34.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
            "Range: 0-7\n"
            "Timeouts: 5 1 20\n",
            "01234567")});
-  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-                   std::unique_ptr<HttpRequest::Factory>(
-                       new FakeHttpRequestFactory(&requests)),
-                   8 /* block size */, 8 /* max bytes */, 0 /* max staleness */,
-                   0 /* stat cache max age */, 0 /* stat cache max entries */,
-                   0 /* matching paths cache max age */,
-                   0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */, kTestTimeoutConfig,
-                   nullptr /* gcs additional header */);
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      8 /* block size */, 8 /* max bytes */, 0 /* max staleness */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
 
   // Read from the file first, to fill the block cache.
   std::unique_ptr<RandomAccessFile> rfile;
@@ -541,6 +589,13 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceedsOnGetStatus) {
   // path.
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fwriteable?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"16\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
            "Range: 0-7\n"
@@ -566,6 +621,13 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceedsOnGetStatus) {
                            "Header Content-Range: bytes */17\n"
                            "Put: yes\n",
                            "", Status::OK(), nullptr, {}, 201),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fwriteable?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"33\",\"generation\": \"2\","
+                           "\"updated\": \"2016-04-29T23:19:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
@@ -577,7 +639,7 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceedsOnGetStatus) {
       std::unique_ptr<HttpRequest::Factory>(
           new FakeHttpRequestFactory(&requests)),
       8 /* block size */, 8 /* max bytes */, 3600 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, 0 /* initial retry delay */,
       kTestTimeoutConfig, nullptr /* gcs additional header */);
@@ -761,6 +823,13 @@ TEST(GcsFileSystemTest, NewWritableFile_NoObjectName) {
 TEST(GcsFileSystemTest, NewAppendableFile) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fappendable?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"8\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fappendable\n"
            "Auth Token: fake_token\n"
            "Range: 0-31\n"
@@ -780,6 +849,13 @@ TEST(GcsFileSystemTest, NewAppendableFile) {
                            "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            ""),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fappendable?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"8\",\"generation\": \"2\","
+                           "\"updated\": \"2016-04-29T23:25:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fappendable\n"
            "Auth Token: fake_token\n"
@@ -791,7 +867,7 @@ TEST(GcsFileSystemTest, NewAppendableFile) {
       std::unique_ptr<HttpRequest::Factory>(
           new FakeHttpRequestFactory(&requests)),
       32 /* block size */, 32 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, 0 /* initial retry delay */,
       kTestTimeoutConfig, nullptr /* gcs additional header */);
@@ -840,11 +916,12 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Frandom_access.txt?fields=size%2Cupdated\n"
+           "path%2Frandom_access.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"", content.size(),
-                           "\", \"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+           strings::StrCat("{\"size\": \"", content.size(), "\"",
+                           ", \"generation\": \"1\"",
+                           ", \"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            strings::StrCat("Uri: https://storage.googleapis.com/bucket/"
                            "path%2Frandom_access.txt\n"
@@ -890,10 +967,10 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile_NoObjectName) {
 TEST(GcsFileSystemTest, FileExists_YesAsObject) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-      "path%2Ffile1.txt?fields=size%2Cupdated\n"
+      "path%2Ffile1.txt?fields=size%2Cgeneration%2Cupdated\n"
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
-      strings::StrCat("{\"size\": \"1010\","
+      strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -912,7 +989,7 @@ TEST(GcsFileSystemTest, FileExists_YesAsFolder) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Fsubfolder?fields=size%2Cupdated\n"
+           "path%2Fsubfolder?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
@@ -967,7 +1044,7 @@ TEST(GcsFileSystemTest, FileExists_NotAsObjectOrFolder) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Ffile1.txt?fields=size%2Cupdated\n"
+           "path%2Ffile1.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
@@ -1023,14 +1100,14 @@ TEST(GcsFileSystemTest, FileExists_StatCache) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Ffile1.txt?fields=size%2Cupdated\n"
+           "path%2Ffile1.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1010\","
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Fsubfolder?fields=size%2Cupdated\n"
+           "path%2Fsubfolder?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
@@ -1493,6 +1570,13 @@ TEST(GcsFileSystemTest, GetMatchingPaths_Cache_Flush) {
 TEST(GcsFileSystemTest, DeleteFile) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Ffile1.txt?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"8\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Ffile1.txt\n"
            "Auth Token: fake_token\n"
            "Range: 0-15\n"
@@ -1504,6 +1588,13 @@ TEST(GcsFileSystemTest, DeleteFile) {
                            "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Ffile1.txt?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"8\",\"generation\": \"2\","
+                           "\"updated\": \"2016-04-29T23:19:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Ffile1.txt\n"
            "Auth Token: fake_token\n"
@@ -1515,7 +1606,7 @@ TEST(GcsFileSystemTest, DeleteFile) {
       std::unique_ptr<HttpRequest::Factory>(
           new FakeHttpRequestFactory(&requests)),
       16 /* block size */, 16 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
       kTestTimeoutConfig, nullptr /* gcs additional header */);
@@ -1555,10 +1646,10 @@ TEST(GcsFileSystemTest, DeleteFile_StatCacheRemoved) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "file.txt?fields=size%2Cupdated\n"
+           "file.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1010\","
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/file.txt\n"
@@ -1568,7 +1659,7 @@ TEST(GcsFileSystemTest, DeleteFile_StatCacheRemoved) {
                            ""),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "file.txt?fields=size%2Cupdated\n"
+           "file.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
@@ -1693,10 +1784,10 @@ TEST(GcsFileSystemTest, DeleteDir_NonEmpty) {
 TEST(GcsFileSystemTest, GetFileSize) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-      "file.txt?fields=size%2Cupdated\n"
+      "file.txt?fields=size%2Cgeneration%2Cupdated\n"
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
-      strings::StrCat("{\"size\": \"1010\","
+      strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1816,33 +1907,31 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
 TEST(GcsFileSystemTest, RenameFile_Object) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fsrc.txt?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"8\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
            "Range: 0-15\n"
            "Timeouts: 5 1 20\n",
            "01234567"),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fdst.txt?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"8\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
            "Range: 0-15\n"
            "Timeouts: 5 1 20\n",
            "76543210"),
-       // IsDirectory is checking whether there are children objects.
-       new FakeHttpRequest(
-           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
-           "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsrc.txt%2F"
-           "&maxResults=1\n"
-           "Auth Token: fake_token\n"
-           "Timeouts: 5 1 10\n",
-           "{}"),
-       // IsDirectory is checking if the path exists as an object.
-       new FakeHttpRequest(
-           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Fsrc.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n"
-           "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1010\","
-                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        // Copying to the new location.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
@@ -1859,12 +1948,26 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
            "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            ""),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fsrc.txt?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"8\",\"generation\": \"2\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
            "Range: 0-15\n"
            "Timeouts: 5 1 20\n",
            "89abcdef"),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fdst.txt?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"8\",\"generation\": \"2\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
@@ -1876,7 +1979,7 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
       std::unique_ptr<HttpRequest::Factory>(
           new FakeHttpRequestFactory(&requests)),
       16 /* block size */, 64 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
       0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
       kTestTimeoutConfig, nullptr /* gcs additional header */);
@@ -1907,10 +2010,10 @@ TEST(GcsFileSystemTest, RenameFile_Object_FlushTargetStatCache) {
       {// Stat the target file.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Fdst.txt?fields=size%2Cupdated\n"
+           "path%2Fdst.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1000\","
+           strings::StrCat("{\"size\": \"1000\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        // IsDirectory is checking whether there are children objects.
        new FakeHttpRequest(
@@ -1923,10 +2026,10 @@ TEST(GcsFileSystemTest, RenameFile_Object_FlushTargetStatCache) {
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Fsrc.txt?fields=size%2Cupdated\n"
+           "path%2Fsrc.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1010\","
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        // Copying to the new location.
        new FakeHttpRequest(
@@ -1946,10 +2049,10 @@ TEST(GcsFileSystemTest, RenameFile_Object_FlushTargetStatCache) {
            ""),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Fdst.txt?fields=size%2Cupdated\n"
+           "path%2Fdst.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1010\","
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(
       std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1988,10 +2091,10 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Fsrc.txt?fields=size%2Cupdated\n"
+           "path%2Fsrc.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1010\","
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        // Copying to the new location.
        new FakeHttpRequest(
@@ -2045,10 +2148,10 @@ TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Fsrc.txt?fields=size%2Cupdated\n"
+           "path%2Fsrc.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1010\","
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        // Copying to the new location.
        new FakeHttpRequest(
@@ -2077,10 +2180,10 @@ TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
 TEST(GcsFileSystemTest, Stat_Object) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-      "file.txt?fields=size%2Cupdated\n"
+      "file.txt?fields=size%2Cgeneration%2Cupdated\n"
       "Auth Token: fake_token\n"
       "Timeouts: 5 1 10\n",
-      strings::StrCat("{\"size\": \"1010\","
+      strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -2103,7 +2206,7 @@ TEST(GcsFileSystemTest, Stat_Folder) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "subfolder?fields=size%2Cupdated\n"
+           "subfolder?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
@@ -2136,7 +2239,7 @@ TEST(GcsFileSystemTest, Stat_ObjectOrFolderNotFound) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path?fields=size%2Cupdated\n"
+           "path?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
@@ -2208,14 +2311,14 @@ TEST(GcsFileSystemTest, Stat_Cache) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "file.txt?fields=size%2Cupdated\n"
+           "file.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1010\","
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "subfolder?fields=size%2Cupdated\n"
+           "subfolder?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
@@ -2256,17 +2359,17 @@ TEST(GcsFileSystemTest, Stat_Cache_Flush) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "file.txt?fields=size%2Cupdated\n"
+           "file.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1010\","
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "file.txt?fields=size%2Cupdated\n"
+           "file.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1010\","
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(
       std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -2307,7 +2410,7 @@ TEST(GcsFileSystemTest, IsDirectory_NotFound) {
            "{}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "file.txt?fields=size%2Cupdated\n"
+           "file.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
@@ -2336,10 +2439,10 @@ TEST(GcsFileSystemTest, IsDirectory_NotDirectoryButObject) {
            "{}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "file.txt?fields=size%2Cupdated\n"
+           "file.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"1010\","
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -2642,7 +2745,7 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
        // Checking if gs://bucket/path/file3.txt is an object - fails with 404.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path%2Ffile3.txt?fields=size%2Cupdated\n"
+           "path%2Ffile3.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
@@ -2677,7 +2780,7 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
-           "path?fields=size%2Cupdated\n"
+           "path?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
@@ -2833,41 +2936,71 @@ TEST(GcsFileSystemTest, CreateHttpRequest) {
   TF_EXPECT_OK(request->Send());
 }
 
-TEST(GcsFileSystemTest, NewRandomAccessFile_StatsRecording) {
-  class TestGcsStats : public GcsStatsInterface {
-   public:
-    void Init(GcsFileSystem* fs, GcsThrottle* throttle,
-              const FileBlockCache* block_cache) override {
-      CHECK(fs_ == nullptr);
-      CHECK(throttle_ == nullptr);
-      CHECK(block_cache_ == nullptr);
-
-      fs_ = fs;
-      throttle_ = throttle;
-      block_cache_ = block_cache;
-    }
-
-    void RecordBlockLoadRequest(const string& file, size_t offset) override {
-      block_load_request_file_ = file;
-    }
-
-    void RecordBlockRetrieved(const string& file, size_t offset,
-                              size_t bytes_transferred) override {
-      block_retrieved_file_ = file;
-      block_retrieved_bytes_transferred_ = bytes_transferred;
-    }
-
-    HttpRequest::RequestStats* HttpStats() override { return nullptr; }
-
-    GcsFileSystem* fs_ = nullptr;
-    GcsThrottle* throttle_ = nullptr;
-    const FileBlockCache* block_cache_ = nullptr;
-
-    string block_load_request_file_;
-    string block_retrieved_file_;
-    size_t block_retrieved_bytes_transferred_ = 0;
-  };
+class TestGcsStats : public GcsStatsInterface {
+ public:
+  void Init(GcsFileSystem* fs, GcsThrottle* throttle,
+            const FileBlockCache* block_cache) override {
+    CHECK(fs_ == nullptr);
+    CHECK(throttle_ == nullptr);
+    CHECK(block_cache_ == nullptr);
+
+    fs_ = fs;
+    throttle_ = throttle;
+    block_cache_ = block_cache;
+  }
+
+  void RecordBlockLoadRequest(const string& file, size_t offset) override {
+    block_load_request_file_ = file;
+  }
+
+  void RecordBlockRetrieved(const string& file, size_t offset,
+                            size_t bytes_transferred) override {
+    block_retrieved_file_ = file;
+    block_retrieved_bytes_transferred_ = bytes_transferred;
+  }
 
+  void RecordStatObjectRequest() override { stat_object_request_count_++; }
+
+  HttpRequest::RequestStats* HttpStats() override { return nullptr; }
+
+  GcsFileSystem* fs_ = nullptr;
+  GcsThrottle* throttle_ = nullptr;
+  const FileBlockCache* block_cache_ = nullptr;
+
+  string block_load_request_file_;
+  string block_retrieved_file_;
+  size_t block_retrieved_bytes_transferred_ = 0;
+  int stat_object_request_count_ = 0;
+};
+
+TEST(GcsFileSystemTest, Stat_StatsRecording) {
+  std::vector<HttpRequest*> requests({new FakeHttpRequest(
+      "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+      "file.txt?fields=size%2Cgeneration%2Cupdated\n"
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
+      strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
+                      "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
+
+  TestGcsStats stats;
+  fs.SetStats(&stats);
+  EXPECT_EQ(stats.fs_, &fs);
+
+  FileStatistics stat;
+  TF_EXPECT_OK(fs.Stat("gs://bucket/file.txt", &stat));
+  EXPECT_EQ(1, stats.stat_object_request_count_);
+}
+
+TEST(GcsFileSystemTest, NewRandomAccessFile_StatsRecording) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
       "Auth Token: fake_token\n"
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index 59ad3cbcc2031feb2e33c10ba5a4083d76edb2ca..e64653a67aceeb436138164e2acec1b5643ac2b2 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -97,7 +97,7 @@ Status CreateSignature(RSA* private_key, StringPiece to_sign,
   }
   std::unique_ptr<EVP_MD_CTX, std::function<void(EVP_MD_CTX*)>> md_ctx(
       EVP_MD_CTX_create(), [](EVP_MD_CTX* ptr) { EVP_MD_CTX_destroy(ptr); });
-  if (!md_ctx.get()) {
+  if (!md_ctx) {
     return errors::Internal("Could not create MD_CTX.");
   }
 
@@ -196,7 +196,7 @@ Status OAuthClient::GetTokenFromServiceAccountJson(
   std::unique_ptr<RSA, std::function<void(RSA*)>> private_key(
       PEM_read_bio_RSAPrivateKey(bio.get(), nullptr, nullptr, nullptr),
       [](RSA* ptr) { RSA_free(ptr); });
-  if (!private_key.get()) {
+  if (!private_key) {
     return errors::Internal("Could not deserialize the private key.");
   }
 
diff --git a/tensorflow/core/platform/cloud/ram_file_block_cache.cc b/tensorflow/core/platform/cloud/ram_file_block_cache.cc
index 55a5657a503a334866cad737bb11fe505e59699a..82b692a9e39c9c26c428af896ef3f5ef07d80c58 100644
--- a/tensorflow/core/platform/cloud/ram_file_block_cache.cc
+++ b/tensorflow/core/platform/cloud/ram_file_block_cache.cc
@@ -161,7 +161,7 @@ Status RamFileBlockCache::Read(const string& filename, size_t offset, size_t n,
   if (n == 0) {
     return Status::OK();
   }
-  if (block_size_ == 0 || max_bytes_ == 0) {
+  if (!IsCacheEnabled()) {
     // The cache is effectively disabled, so we pass the read through to the
     // fetcher without breaking it up into blocks.
     return block_fetcher_(filename, offset, n, buffer, bytes_transferred);
@@ -217,6 +217,23 @@ Status RamFileBlockCache::Read(const string& filename, size_t offset, size_t n,
   return Status::OK();
 }
 
+bool RamFileBlockCache::ValidateAndUpdateFileSignature(const string& filename,
+                                                       int64 file_signature) {
+  mutex_lock lock(mu_);
+  auto it = file_signature_map_.find(filename);
+  if (it != file_signature_map_.end()) {
+    if (it->second == file_signature) {
+      return true;
+    }
+    // Remove the file from cache if the signatures don't match.
+    RemoveFile_Locked(filename);
+    it->second = file_signature;
+    return false;
+  }
+  file_signature_map_[filename] = file_signature;
+  return true;
+}
+
 size_t RamFileBlockCache::CacheSize() const {
   mutex_lock lock(mu_);
   return cache_size_;
diff --git a/tensorflow/core/platform/cloud/ram_file_block_cache.h b/tensorflow/core/platform/cloud/ram_file_block_cache.h
index 7fdd7b2e0294e1cf289a77464fb60e08bdb28da7..2303f9caaa227fd92527d9380b394813d9597971 100644
--- a/tensorflow/core/platform/cloud/ram_file_block_cache.h
+++ b/tensorflow/core/platform/cloud/ram_file_block_cache.h
@@ -88,11 +88,19 @@ class RamFileBlockCache : public FileBlockCache {
   Status Read(const string& filename, size_t offset, size_t n, char* buffer,
               size_t* bytes_transferred) override;
 
+  // Validate the given file signature with the existing file signature in the
+  // cache. Returns true if the signature doesn't change or the file doesn't
+  // exist before. If the signature changes, update the existing signature with
+  // the new one and remove the file from cache.
+  bool ValidateAndUpdateFileSignature(const string& filename,
+                                      int64 file_signature) override
+      LOCKS_EXCLUDED(mu_);
+
   /// Remove all cached blocks for `filename`.
   void RemoveFile(const string& filename) override LOCKS_EXCLUDED(mu_);
 
   /// Remove all cached data.
-  void Flush() LOCKS_EXCLUDED(mu_) override;
+  void Flush() override LOCKS_EXCLUDED(mu_);
 
   /// Accessors for cache parameters.
   size_t block_size() const override { return block_size_; }
@@ -102,6 +110,12 @@ class RamFileBlockCache : public FileBlockCache {
   /// The current size (in bytes) of the cache.
   size_t CacheSize() const override LOCKS_EXCLUDED(mu_);
 
+  // Returns true if the cache is enabled. If false, the BlockFetcher callback
+  // is always executed during Read.
+  bool IsCacheEnabled() const override {
+    return block_size_ > 0 && max_bytes_ > 0;
+  }
+
  private:
   /// The size of the blocks stored in the LRU cache, as well as the size of the
   /// reads from the underlying filesystem.
@@ -222,6 +236,9 @@ class RamFileBlockCache : public FileBlockCache {
 
   /// The combined number of bytes in all of the cached blocks.
   size_t cache_size_ GUARDED_BY(mu_) = 0;
+
+  // A filename->file_signature map.
+  std::map<string, int64> file_signature_map_ GUARDED_BY(mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc b/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
index 10203783fcbe06fefeb1c7451ffbb20045cc421a..eea61135c31c9979843fa4ed41be9e05cd6fcadf 100644
--- a/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
+++ b/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
@@ -37,6 +37,52 @@ Status ReadCache(RamFileBlockCache* cache, const string& filename,
   return status;
 }
 
+TEST(RamFileBlockCacheTest, IsCacheEnabled) {
+  auto fetcher = [](const string& filename, size_t offset, size_t n,
+                    char* buffer, size_t* bytes_transferred) {
+    // Do nothing.
+    return Status::OK();
+  };
+  RamFileBlockCache cache1(0, 0, 0, fetcher);
+  RamFileBlockCache cache2(16, 0, 0, fetcher);
+  RamFileBlockCache cache3(0, 32, 0, fetcher);
+  RamFileBlockCache cache4(16, 32, 0, fetcher);
+
+  EXPECT_FALSE(cache1.IsCacheEnabled());
+  EXPECT_FALSE(cache2.IsCacheEnabled());
+  EXPECT_FALSE(cache3.IsCacheEnabled());
+  EXPECT_TRUE(cache4.IsCacheEnabled());
+}
+
+TEST(RamFileBlockCacheTest, ValidateAndUpdateFileSignature) {
+  int calls = 0;
+  auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
+                          char* buffer, size_t* bytes_transferred) {
+    calls++;
+    memset(buffer, 'x', n);
+    *bytes_transferred = n;
+    return Status::OK();
+  };
+  string filename = "file";
+  RamFileBlockCache cache(16, 32, 0, fetcher);
+  std::vector<char> out;
+
+  // First read.
+  EXPECT_TRUE(cache.ValidateAndUpdateFileSignature(filename, 123));
+  TF_EXPECT_OK(ReadCache(&cache, filename, 0, 16, &out));
+  EXPECT_EQ(calls, 1);
+
+  // Second read. Hit cache.
+  EXPECT_TRUE(cache.ValidateAndUpdateFileSignature(filename, 123));
+  TF_EXPECT_OK(ReadCache(&cache, filename, 0, 16, &out));
+  EXPECT_EQ(calls, 1);
+
+  // Third read. File signatures are different.
+  EXPECT_FALSE(cache.ValidateAndUpdateFileSignature(filename, 321));
+  TF_EXPECT_OK(ReadCache(&cache, filename, 0, 16, &out));
+  EXPECT_EQ(calls, 2);
+}
+
 TEST(RamFileBlockCacheTest, PassThrough) {
   const string want_filename = "foo/bar";
   const size_t want_offset = 42;
diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc
index 99de364042124341df6d8724c2dc7ee865493ae8..e9da3d8e32ab0cad5817bb3c15a7d939f37bc098 100644
--- a/tensorflow/core/platform/cpu_info.cc
+++ b/tensorflow/core/platform/cpu_info.cc
@@ -344,5 +344,28 @@ int CPUModelNum() {
 #endif
 }
 
+int CPUIDNumSMT() {
+#ifdef PLATFORM_IS_X86
+  // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration
+  // https://software.intel.com/en-us/articles/intel-sdm (Vol 3A)
+  // Section: Detecting Hardware Multi-threads Support and Topology
+  // Uses CPUID Leaf 11 to enumerate system topology on Intel x86 architectures
+  // Other cases not supported
+  uint32 eax, ebx, ecx, edx;
+  // Check if system supports Leaf 11
+  GETCPUID(eax, ebx, ecx, edx, 0, 0);
+  if (eax >= 11) {
+    // 1) Leaf 11 available? CPUID.(EAX=11, ECX=0):EBX != 0
+    // 2) SMT_Mask_Width = CPUID.(EAX=11, ECX=0):EAX[4:0] if CPUID.(EAX=11,
+    // ECX=0):ECX[15:8] is 1
+    GETCPUID(eax, ebx, ecx, edx, 11, 0);
+    if (ebx != 0 && ((ecx & 0xff00) >> 8) == 1) {
+      return 1 << (eax & 0x1f);  // 2 ^ SMT_Mask_Width
+    }
+  }
+#endif  // PLATFORM_IS_X86
+  return 0;
+}
+
 }  // namespace port
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index b5be7e8b54543daddcc07cc8b31a228d4edf14f4..175c9ae8b183eaaa9f9e91de3cc1608df0b188be 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -35,6 +35,10 @@ namespace port {
 // software can change it dynamically.
 int NumSchedulableCPUs();
 
+// Returns an estimate of the number of hyperthreads per physical core
+// on the CPU
+int NumHyperthreadsPerCore();
+
 // Mostly ISA related features that we care about
 enum CPUFeature {
   // Do not change numeric assignments.
@@ -107,6 +111,9 @@ int CPUModelNum();
 // Returns nominal core processor cycles per second of each processor.
 double NominalCPUFrequency();
 
+// Returns num of hyperthreads per physical core
+int CPUIDNumSMT();
+
 }  // namespace port
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 107c38114b55734b197df1202e9c3fc4dc3dae20..b4b756b86620e1f063c25de08c6f79bd028de096 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -335,6 +335,8 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
         name = cc_name,
         deps = cc_deps + ["@protobuf_archive//:protobuf_headers"] +
                if_static([name + "_cc_impl"]),
+        testonly = testonly,
+        visibility = visibility,
     )
     native.cc_library(
         name = cc_name + "_impl",
@@ -378,8 +380,10 @@ def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
     )
     native.py_library(
         name = py_name,
-        deps = py_deps + ["@protobuf_archive//:protobuf_python"])
-
+        deps = py_deps + ["@protobuf_archive//:protobuf_python"],
+        testonly = testonly,
+        visibility = visibility,
+    )
     return
 
   py_proto_library(
@@ -408,10 +412,11 @@ def tf_proto_library(name, srcs = [], has_services = None,
                      j2objc_api_version = 1,
                      java_api_version = 2, py_api_version = 2,
                      js_api_version = 2, js_codegen = "jspb",
+                     provide_cc_alias = False,
                      default_header = False):
   """Make a proto library, possibly depending on other proto libraries."""
-  js_api_version = js_api_version  # unused argument
-  js_codegen = js_codegen  # unused argument
+  _ignore = (js_api_version, js_codegen, provide_cc_alias)
+
   tf_proto_library_cc(
       name = name,
       srcs = srcs,
@@ -444,6 +449,16 @@ def tf_platform_srcs(files):
   base_set = ["platform/default/" + f for f in files]
   windows_set = base_set + ["platform/windows/" + f for f in files]
   posix_set = base_set + ["platform/posix/" + f for f in files]
+
+  # Handle cases where we must also bring the posix file in. Usually, the list
+  # of files to build on windows builds is just all the stuff in the
+  # windows_set. However, in some cases the implementations in 'posix/' are
+  # just what is necessary and historically we choose to simply use the posix
+  # file instead of making a copy in 'windows'.
+  for f in files:
+    if f == "error.cc":
+      windows_set.append("platform/posix/" + f)
+
   return select({
     "//tensorflow:windows" : native.glob(windows_set),
     "//tensorflow:windows_msvc" : native.glob(windows_set),
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 44a89c3a96ad293be76d709ac21ac6bafe6afcbd..c17e4810d552f703f2db495b0e3f1a44dcc5c55c 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -119,6 +119,37 @@ cc_library(
     copts = tf_copts(),
 )
 
+cc_library(
+    name = "port",
+    srcs = [],
+    copts = tf_copts(),
+)
+
+cc_library(
+    name = "protobuf",
+    srcs = [],
+    copts = tf_copts(),
+)
+
+cc_library(
+    name = "env",
+    srcs = [],
+    copts = tf_copts(),
+)
+
+cc_library(
+    name = "other",
+    srcs = [],
+    copts = tf_copts(),
+    deps = [
+        "@com_googlesource_code_re2//:re2",
+        "@farmhash_archive//:farmhash",
+        "@fft2d",
+        "@highwayhash//:sip_hash",
+        "@png_archive//:png",
+    ],
+)
+
 cc_library(
     name = "platformlib",
     copts = tf_copts(),
diff --git a/tensorflow/core/platform/default/mutex.h b/tensorflow/core/platform/default/mutex.h
index a12d92795e14665f62222879488ab1cb89da8cfc..89e57d58a00546f5539ade37cb66cdeb2a551e14 100644
--- a/tensorflow/core/platform/default/mutex.h
+++ b/tensorflow/core/platform/default/mutex.h
@@ -77,9 +77,7 @@ class SCOPED_LOCKABLE mutex_lock {
 
   // Manually nulls out the source to prevent double-free.
   // (std::move does not null the source pointer by default.)
-  explicit mutex_lock(mutex_lock&& ml) noexcept : mu_(ml.mu_) {
-    ml.mu_ = nullptr;
-  }
+  mutex_lock(mutex_lock&& ml) noexcept : mu_(ml.mu_) { ml.mu_ = nullptr; }
   ~mutex_lock() UNLOCK_FUNCTION() {
     if (mu_ != nullptr) {
       mu_->unlock();
diff --git a/tensorflow/core/platform/error.h b/tensorflow/core/platform/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae965b6c773e26bf3f92f108c380dbcbdd8d6931
--- /dev/null
+++ b/tensorflow/core/platform/error.h
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_ERROR_H_
+#define TENSORFLOW_CORE_PLATFORM_ERROR_H_
+
+#include "tensorflow/core/platform/platform.h"
+
+#if defined(PLATFORM_GOOGLE) || defined(PLATFORM_POSIX) || \
+    defined(PLATFORM_POSIX_ANDROID) || defined(PLATFORM_GOOGLE_ANDROID)
+#include "tensorflow/core/platform/posix/error.h"
+#elif defined(PLATFORM_WINDOWS)
+#include "tensorflow/core/platform/windows/error.h"
+#else
+#error Define the appropriate PLATFORM_<foo> macro for this platform
+#endif
+
+#endif  // TENSORFLOW_CORE_PLATFORM_ERROR_H_
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index a8cb40502c11851cb8f18e4194131482616f34c8..72c12318cac883987c8013231c5d76d38c5aceaf 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -21,11 +21,11 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/error.h"
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/posix/error.h"
 #include "third_party/hadoop/hdfs.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index 37239681755c04152d3ae4a91ab7ec73a89f522b..b65eb43146962b4700e7e71ddcd91d3948213d28 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -79,7 +79,11 @@ limitations under the License.
 // analysis. Giving it this information can help it optimize for the
 // common case in the absence of better information (ie.
 // -fprofile-arcs).
-#if TF_HAS_BUILTIN(__builtin_expect) || (defined(__GNUC__) && __GNUC__ >= 3)
+//
+// We need to disable this for GPU builds, though, since nvcc8 and older
+// don't recognize `__builtin_expect` as a builtin, and fail compilation.
+#if (!defined(__NVCC__)) && \
+    (TF_HAS_BUILTIN(__builtin_expect) || (defined(__GNUC__) && __GNUC__ >= 3))
 #define TF_PREDICT_FALSE(x) (__builtin_expect(x, 0))
 #define TF_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
 #else
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 8e316472fe2ea6f7c3187f0a5f98052c20f5ce6b..708f32ba8085cdddfcc2de3ef20291153d83220e 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -74,6 +74,11 @@ int NumSchedulableCPUs() {
   return kDefaultCores;
 }
 
+int NumHyperthreadsPerCore() {
+  static const int ht_per_core = tensorflow::port::CPUIDNumSMT();
+  return (ht_per_core > 0) ? ht_per_core : 1;
+}
+
 void* AlignedMalloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__)
   return memalign(minimum_alignment, size);
diff --git a/tensorflow/core/platform/test_benchmark.h b/tensorflow/core/platform/test_benchmark.h
index 327237dba933230cb313dd06091d2ff2ca3cc4b2..9b8726d98fc5a82e3aee49ec19cde05e648d2d36 100644
--- a/tensorflow/core/platform/test_benchmark.h
+++ b/tensorflow/core/platform/test_benchmark.h
@@ -42,9 +42,43 @@ namespace testing {
 #if defined(PLATFORM_GOOGLE)
 
 using ::testing::Benchmark;
+using ::testing::DoNotOptimize;
 
 #else
 
+// The DoNotOptimize(...) function can be used to prevent a value or
+// expression from being optimized away by the compiler. This function is
+// intended to add little to no overhead.
+// See: http://stackoverflow.com/questions/28287064
+//
+// The specific guarantees of DoNotOptimize(x) are:
+//  1) x, and any data it transitively points to, will exist (in a register or
+//     in memory) at the current point in the program.
+//  2) The optimizer will assume that DoNotOptimize(x) could mutate x or
+//     anything it transitively points to (although it actually doesn't).
+//
+// To see this in action:
+//
+//   void BM_multiply(benchmark::State& state) {
+//     int a = 2;
+//     int b = 4;
+//     for (auto _ : state) {
+//       testing::DoNotOptimize(a);
+//       testing::DoNotOptimize(b);
+//       int c = a * b;
+//       testing::DoNotOptimize(c);
+//     }
+//   }
+//   BENCHMARK(BM_multiply);
+//
+// Guarantee (2) applied to 'a' and 'b' prevents the compiler lifting the
+// multiplication outside of the loop. Guarantee (1) applied to 'c' prevents the
+// compiler from optimizing away 'c' as dead code.
+template <class T>
+void DoNotOptimize(const T& var) {
+  asm volatile("" : "+m"(const_cast<T&>(var)));
+}
+
 class Benchmark {
  public:
   Benchmark(const char* name, void (*fn)(int));
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 078e76e7dcb74e0116119370e5a5f4ae72674358..a28ed06fb37826cf880bab8ed6a88b6dd947090c 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -14,12 +14,29 @@ import "tensorflow/core/protobuf/cluster.proto";
 import "tensorflow/core/protobuf/rewriter_config.proto";
 
 message GPUOptions {
-  // A value between 0 and 1 that indicates what fraction of the
-  // available GPU memory to pre-allocate for each process.  1 means
-  // to pre-allocate all of the GPU memory, 0.5 means the process
-  // allocates ~50% of the available GPU memory.
+  // Fraction of the available GPU memory to allocate for each process.
+  // 1 means to allocate all of the GPU memory, 0.5 means the process
+  // allocates up to ~50% of the available GPU memory.
+  //
+  // GPU memory is pre-allocated unless the allow_growth option is enabled.
+  //
+  // If greater than 1.0, uses CUDA unified memory to potentially oversubscribe
+  // the amount of memory available on the GPU device by using host memory as a
+  // swap space. Accessing memory not available on the device will be
+  // significantly slower as that would require memory transfer between the host
+  // and the device. Options to reduce the memory requirement should be
+  // considered before enabling this option as this may come with a negative
+  // performance impact. Oversubscription using the unified memory requires
+  // Pascal class or newer GPUs and it is currently only supported on the Linux
+  // operating system. See
+  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
+  // for the detailed requirements.
   double per_process_gpu_memory_fraction = 1;
 
+  // If true, the allocator does not pre-allocate the entire specified
+  // GPU memory region, instead starting small and growing as needed.
+  bool allow_growth = 4;
+
   // The type of GPU allocation strategy to use.
   //
   // Allowed values:
@@ -35,10 +52,6 @@ message GPUOptions {
   // a reasonable default (several MBs).
   int64 deferred_deletion_bytes = 3;
 
-  // If true, the allocator does not pre-allocate the entire specified
-  // GPU memory region, instead starting small and growing as needed.
-  bool allow_growth = 4;
-
   // A comma-separated list of GPU ids that determines the 'visible'
   // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
   // can see 8 GPU devices in the process, and one wanted to map
@@ -82,9 +95,6 @@ message GPUOptions {
   // the overall host system performance.
   bool force_gpu_compatible = 8;
 
-  // Everything inside Experimental is subject to change and is not subject
-  // to API stability guarantees in
-  // https://www.tensorflow.org/programmers_guide/version_compat.
   message Experimental {
     // Configuration for breaking down a visible GPU into multiple "virtual"
     // devices.
@@ -124,8 +134,20 @@ message GPUOptions {
     //    different settings in different sessions within same process will
     //    result in undefined behavior.
     repeated VirtualDevices virtual_devices = 1;
+
+    // If true, uses CUDA unified memory for memory allocations. If
+    // per_process_gpu_memory_fraction option is greater than 1.0, then unified
+    // memory is used regardless of the value for this field. See comments for
+    // per_process_gpu_memory_fraction field for more details and requirements
+    // of the unified memory. This option is useful to oversubscribe memory if
+    // multiple processes are sharing a single GPU while individually using less
+    // than 1.0 per process memory fraction.
+    bool use_unified_memory = 2;
   }
 
+  // Everything inside experimental is subject to change and is not subject
+  // to API stability guarantees in
+  // https://www.tensorflow.org/programmers_guide/version_compat.
   Experimental experimental = 9;
 };
 
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 5372ef24b88658c93ff9e1cb548b7416c381112f..45e57594e4d282aa9c4c7ed5c14f8f39ac422537 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -47,6 +47,12 @@ message RewriterConfig {
   // Statically infer the value of tensors when possible, and materialize the
   // result using constants.
   Toggle constant_folding = 3;
+  // Shape optimizations (default is ON)
+  // Simplify computations made on shapes.
+  Toggle shape_optimization = 13;
+  // Remapping (default is ON)
+  // Remap subgraphs onto more efficient implementations.
+  Toggle remapping = 14;
   // Arithmetic optimizations (default is ON)
   // e.g. Simplify arithmetic ops; merge ops with same value (like constants).
   Toggle arithmetic_optimization = 7;
diff --git a/tensorflow/core/protobuf/transport_options.proto b/tensorflow/core/protobuf/transport_options.proto
new file mode 100644
index 0000000000000000000000000000000000000000..d7b1bddbbe3d7d9bc78f499c68ab8b64766dbb88
--- /dev/null
+++ b/tensorflow/core/protobuf/transport_options.proto
@@ -0,0 +1,8 @@
+syntax = "proto3";
+
+package tensorflow;
+
+// Extra data needed on a non-RDMA RecvBufResponse.
+message RecvBufRespExtra {
+  bytes tensor_content = 1;
+};
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index d714d85ce68ce37ff1f64bc7d55b32982948504f..39dd66fa1f1724a67c71e49193c57c31ed75fe20 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -361,8 +361,11 @@ message RecvTensorResponse {
 // Out-of-band request to begin or end logging, or
 // to retrieve logs for particular steps.
 message LoggingRequest {
-  // If true, RPC logging will be activated.
-  bool rpc_logging = 1;
+  // If true, RPC logging will be enabled.
+  bool enable_rpc_logging = 1;
+
+  // If true, RPC logging will be disabled.
+  bool disable_rpc_logging = 4;
 
   // If true, discard any saved logging data (for all steps).
   bool clear = 2;
@@ -416,6 +419,60 @@ message TracingRequest {
 message TracingResponse {
 }
 
+////////////////////////////////////////////////////////////////////////////////
+//
+// Raw data transfers in support of Collective Ops.
+// These methods are experimental and subject to change.
+//
+// The intention is to allow collectives to take advantage of the most
+// efficient methods available on a platform, e.g. RDMA, and not be
+// constrained to use the RPC system in use by other methods.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message RecvBufRequest {
+  // Use of the fields below may vary by implementation.  For example
+  // the buf_ptr and num_bytes may be set only for local operations and
+  // not sent on the wire, or only sent on the wire in one direction.
+
+  // Used at server side to find the correct BufRendezvous.
+  int64 step_id = 1;
+
+  // Arbitrary string identifying a BufRendezvous entry.
+  string buf_rendezvous_key = 2;
+
+  // Size of value expected, must agree with BufRendezvous entry.
+  int64 num_bytes = 3;
+
+  // When RDMA is in use, address of destination field on client.
+  fixed64 buf_ptr = 4;
+
+  // Optional information on client-side device locality.
+  DeviceLocality client_locality = 5;
+
+  // Optional information on server-side device locality.
+  DeviceLocality server_locality = 6;
+
+  // Optional, implementation-specific data.
+  google.protobuf.Any transport_options = 7;
+  // Optional, for annotating the timeline.
+  string src_device = 8;
+  string dst_device = 9;
+}
+
+message RecvBufResponse {
+  // Use of the fields below may vary by implementation.  Comments give
+  // intended use.
+
+  fixed64 buf_ptr = 1;  // Address of source field on server.
+  int64 num_bytes = 2;  // Byte length of buf_ptr field, if set.
+  bool is_dead = 3;     // True if value is 'dead' like a tensor.
+  // Optional, implementation-specific data.
+  google.protobuf.Any transport_options = 4;
+  // Optional, for timeline.
+  int64 send_start_micros = 5;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 //
 // Collective Op dynamic group resolution messages.
diff --git a/tensorflow/core/protobuf/worker_service.proto b/tensorflow/core/protobuf/worker_service.proto
index 025fa7ca59452ff96e0a09a1de37d1855bc47fbd..9ebbd553f2181bdc0549cd4a5fb68cc0315926d3 100644
--- a/tensorflow/core/protobuf/worker_service.proto
+++ b/tensorflow/core/protobuf/worker_service.proto
@@ -73,6 +73,10 @@ service WorkerService {
   // See worker.proto for details.
   rpc Tracing(TracingRequest) returns (TracingResponse);
 
+  // See worker.proto for details.
+  rpc RecvBuf(RecvBufRequest) returns (RecvBufResponse) {
+  }
+
   // See worker.proto for details.
   rpc GetStepSequence(GetStepSequenceRequest) returns (GetStepSequenceResponse);
 
diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/util/batch_util.cc
similarity index 99%
rename from tensorflow/core/kernels/batch_util.cc
rename to tensorflow/core/util/batch_util.cc
index 1182ed42e7a9adacb6059e1e9da9b510b911f11f..7ea8851e6512555b0971945383e5dafcc0ea7774 100644
--- a/tensorflow/core/kernels/batch_util.cc
+++ b/tensorflow/core/util/batch_util.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/util/batch_util.h"
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/core/util/batch_util.h b/tensorflow/core/util/batch_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..eee0309fbc4c00f90cb668a27ddf80e80af286e7
--- /dev/null
+++ b/tensorflow/core/util/batch_util.h
@@ -0,0 +1,54 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_UTIL_BATCH_UTIL_H_
+#define TENSORFLOW_CORE_UTIL_BATCH_UTIL_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace batch_util {
+
+// Copies element into the index^th slice of parent (in the 0th dimension).
+//
+// NOTE(mrry): The `element` argument is taken by value. Use `std::move()`
+// to move the `element` argument into this function, and the implementation
+// may be able to optimize the copy to a move. This is particularly important
+// for DT_STRING tensors.
+Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
+
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index);
+
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+//
+// NOTE(mrry): The implementation may be able to optimize the copy to a move.
+// This is particularly important for DT_STRING tensors.
+Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index);
+
+// Zero-initializes the tensor `element` using the scalar stored in `padding`.
+// Both `element` and `padding` must have matching `dtype`.
+Status SetElementZero(Tensor* element, const Tensor& padding);
+
+// Copies `element` into a (0th dimension) slice of `parent`, assuming
+// the shape of `element` is strictly not larger along any axis than a
+// slice.
+Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                int index);
+
+}  // namespace batch_util
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_BATCH_UTIL_H_
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 0ab875625ff617028c4bc53fa8ccba0488c3d0d1..540adb58d4b6252885fdca0788ad71954e52c749 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -21,10 +21,7 @@ limitations under the License.
 #include "tensorflow/core/util/cuda_device_functions.h"
 #include "tensorflow/core/util/cuda_launch_config.h"
 
-#if CUDA_VERSION >= 7050
 #include "cuda/include/cuda_fp16.h"
-#define TF_HAS_CUDA_FP16
-#endif
 
 // Deprecated, use 'for(int i : CudaGridRangeX(n))' instead.
 #define CUDA_1D_KERNEL_LOOP(i, n) \
diff --git a/tensorflow/core/util/example_proto_fast_parsing_test.cc b/tensorflow/core/util/example_proto_fast_parsing_test.cc
index 13e41c17f7c7df5ad581bd3f6a39051641139258..1a804e154cf607c7471d98ae5e91c98e0a2831f6 100644
--- a/tensorflow/core/util/example_proto_fast_parsing_test.cc
+++ b/tensorflow/core/util/example_proto_fast_parsing_test.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
 #include "tensorflow/core/util/example_proto_fast_parsing.h"
 
 #include "tensorflow/core/example/example.pb.h"
@@ -336,34 +337,6 @@ TEST(FastParse, FuzzTest) {
   }
 }
 
-string MakeSerializedExample() {
-  Example example;
-  const int kFeatureNameLength = 10;
-  const int kFeatureValueLength = 20;
-  const int kBytesFeatureCount = 200;
-  const int kFloatFeatureCount = 200;
-  const int kInt64FeatureCount = 200;
-  auto& fmap = *example.mutable_features()->mutable_feature();
-  for (int i = 0; i < kBytesFeatureCount; ++i) {
-    fmap[strings::StrCat(string('b', kFeatureNameLength), i)]
-        .mutable_bytes_list()
-        ->add_value(string('v', kFeatureValueLength));
-  }
-  for (int i = 0; i < kFloatFeatureCount; ++i) {
-    fmap[strings::StrCat(string('f', kFeatureNameLength), i)]
-        .mutable_float_list()
-        ->add_value(123123123.123);
-  }
-  for (int i = 0; i < kInt64FeatureCount; ++i) {
-    fmap[strings::StrCat(string('i', kFeatureNameLength), i)]
-        .mutable_int64_list()
-        ->add_value(10 * i);
-  }
-  string serialized;
-  example.SerializeToString(&serialized);
-  return serialized;
-}
-
 TEST(TestFastParseExample, Empty) {
   Result result;
   FastParseExampleConfig config;
@@ -374,6 +347,5 @@ TEST(TestFastParseExample, Empty) {
 }
 
 }  // namespace
-
 }  // namespace example
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 8105121e7ce809868fb869fc12d6822a562c658b..230b4278ca926b0fe46e73e26c6d3fac7d58c807 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -706,15 +706,48 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
   return output_tensor;
 }
 #else
+using mkldnn::stream;
+template <typename T> class MklDnnData;
+
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
                              const MklDnnShape& mkl_shape) {
   Tensor output_tensor;
-  TensorShape output_shape;
-
-  TF_CHECK_OK(
-      Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function"));
-
+  try {
+    if (!mkl_shape.IsMklTensor())
+      return mkl_tensor;  // return input since it is already TF tensor
+
+    TensorShape output_shape = mkl_shape.GetTfShape();;
+
+    // Allocate output tensor.
+    context->allocate_temp(DataTypeToEnum<T>::v(),
+        output_shape, &output_tensor);
+
+    auto cpu_engine = engine(engine::cpu, 0);
+    MklDnnData<T> input(&cpu_engine);
+
+    // Get Mkl layout of input tensor.
+    auto input_mkl_md = mkl_shape.GetMklLayout();
+    auto output_tf_md = mkl_shape.GetTfLayout();
+    auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine);
+    input.SetUsrMem(input_mkl_md, &mkl_tensor);
+
+    // reorder
+    if (input.IsReorderNeeded(output_tf_pd)) {
+      std::vector<primitive> net;
+      CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net),
+             true);
+      stream(stream::kind::eager).submit(net).wait();
+    } else {
+      // If not, just forward input tensor to output tensor.
+      CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape));
+    }
+  } catch (mkldnn::error& e) {
+    string error_msg = "Status: " + std::to_string(e.status) +
+                       ", message: " + string(e.message) + ", in file " +
+                       string(__FILE__) + ":" + std::to_string(__LINE__);
+    LOG(FATAL) << "Operation received an exception: " << error_msg;
+  }
   return output_tensor;
 }
 #endif
diff --git a/tensorflow/core/util/port.cc b/tensorflow/core/util/port.cc
index 490c584dc5cf67cb765cdec583a078e7cf6686c1..c081ceae57cf7f671a18aa3f87da31c7b390d7dc 100644
--- a/tensorflow/core/util/port.cc
+++ b/tensorflow/core/util/port.cc
@@ -31,9 +31,7 @@ bool IsGoogleCudaEnabled() {
 
 bool CudaSupportsHalfMatMulAndConv() {
 #if GOOGLE_CUDA
-  // NOTE: We check compile-time and not runtime, since the check for
-  // whether we include the fp16 kernels or not is compile-time.
-  return CUDA_VERSION >= 7050;
+  return true;
 #else
   return false;
 #endif
diff --git a/tensorflow/core/util/tensor_format.cc b/tensorflow/core/util/tensor_format.cc
index 8c833650ca1341b83a0b5be2fe49d63a572794f3..d4311d1ab058aaa8ad6a5efac604add41cc6afd0 100644
--- a/tensorflow/core/util/tensor_format.cc
+++ b/tensorflow/core/util/tensor_format.cc
@@ -41,6 +41,8 @@ string ToString(TensorFormat format) {
       return "NCHW";
     case FORMAT_NCHW_VECT_C:
       return "NCHW_VECT_C";
+    case FORMAT_NHWC_VECT_W:
+      return "NHWC_VECT_W";
     default:
       LOG(FATAL) << "Invalid Format: " << static_cast<int32>(format);
       return "INVALID_FORMAT";
@@ -74,6 +76,10 @@ bool FormatFromString(const string& format_str, TensorFormat* format) {
     *format = FORMAT_NCHW_VECT_C;
     return true;
   }
+  if (format_str == "NHWC_VECT_W") {
+    *format = FORMAT_NHWC_VECT_W;
+    return true;
+  }
   return false;
 }
 
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index 517b85a5ba8c7cbf5f0895b6b5a4c54a00f33ddc..d3d5602f92454118c0df4d423e8fa4fe1f576a91 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_TENSOR_FORMAT_H_
-#define TENSORFLOW_UTIL_TENSOR_FORMAT_H_
+#ifndef TENSORFLOW_CORE_UTIL_TENSOR_FORMAT_H_
+#define TENSORFLOW_CORE_UTIL_TENSOR_FORMAT_H_
 
 #include <array>
 #include <vector>
@@ -29,6 +29,9 @@ namespace tensorflow {
 // The mnemonics specify the meaning of each tensor dimension sorted from
 // largest to smallest memory stride.
 // N = Batch, H = Image Height, W = Image Width, C = Number of Channels.
+// TODO(pauldonnelly): It would probably be better to switch to a registration
+// process for tensor formats, so specialized formats could be defined more
+// locally to where they are used.
 enum TensorFormat {
   // FORMAT_NHWC is the default format in TensorFlow.
   FORMAT_NHWC = 0,
@@ -45,6 +48,17 @@ enum TensorFormat {
   // NCHW_VECT_C format.
   // A pre-condition of this format is that C must be a multiple of 4.
   FORMAT_NCHW_VECT_C = 2,
+
+  // Similar to NHWC, but the size of the W dimension is divided by 4, and a
+  // new dimension of size 4 is appended, which packs 4 adjacent activations
+  // in the width dimension.
+  FORMAT_NHWC_VECT_W = 3,
+
+  // Note: although the current code in this file assumes VECT_C and VECT_W
+  // enums imply int8x4 vectors, this should not be relied upon.
+  // In the future we may change the meaning of these enums to include vectors
+  // of other types such as int16x2, with op implementations automatically
+  // determining which format is implied based on the datatype.
 };
 
 // Tensor format for convolutional filters.
@@ -89,10 +103,17 @@ string ToString(FilterTensorFormat format);
 // Returns the number of spatial dims of a tensor of rank 'num_dims' and tensor
 // format 'format'.
 inline int GetTensorSpatialDims(int num_dims, TensorFormat format) {
-  if (format == FORMAT_NCHW_VECT_C) {
-    return num_dims - 3;  // Exclude N,C,InnerC.
-  } else {
-    return num_dims - 2;  // Exclude N,C.
+  switch (format) {
+    case FORMAT_NHWC:
+      return num_dims - 2;  // Exclude N,C.
+    case FORMAT_NCHW:
+      return num_dims - 2;  // Exclude N,C.
+    case FORMAT_NCHW_VECT_C:
+      return num_dims - 3;  // Exclude N,C,VectDim.
+    case FORMAT_NHWC_VECT_W:
+      // Note: the VECT_W is not counted as an independent spatial dim here,
+      // since it just a component of the width dimension.
+      return num_dims - 3;  // Exclude N,C,VectDim.
   }
 }
 
@@ -108,10 +129,13 @@ inline int GetFilterTensorSpatialDims(int num_dims, FilterTensorFormat format) {
 // tensor format 'format'. This is the inverse of GetTensorSpatialDims.
 inline int GetTensorDimsFromSpatialDims(int num_spatial_dims,
                                         TensorFormat format) {
-  if (format == FORMAT_NCHW_VECT_C) {
-    return num_spatial_dims + 3;  // Include N,C,InnerC.
-  } else {
-    return num_spatial_dims + 2;  // Include N,C.
+  switch (format) {
+    case FORMAT_NHWC:
+    case FORMAT_NCHW:
+      return num_spatial_dims + 2;  // Include N,C.
+    case FORMAT_NCHW_VECT_C:
+    case FORMAT_NHWC_VECT_W:
+      return num_spatial_dims + 3;  // Include N,C,VectDim.
   }
 }
 
@@ -132,6 +156,7 @@ inline int GetTensorBatchDimIndex(int num_dims, TensorFormat format) {
     case FORMAT_NHWC:
     case FORMAT_NCHW:
     case FORMAT_NCHW_VECT_C:
+    case FORMAT_NHWC_VECT_W:
       return 0;
     default:
       LOG(FATAL) << "Unknown format " << format;
@@ -146,6 +171,8 @@ inline int GetTensorFeatureDimIndex(int num_dims, TensorFormat format) {
   switch (format) {
     case FORMAT_NHWC:
       return num_dims - 1;
+    case FORMAT_NHWC_VECT_W:
+      return num_dims - 2;
     case FORMAT_NCHW:
     case FORMAT_NCHW_VECT_C:
       return 1;
@@ -161,24 +188,34 @@ inline int GetTensorInnerFeatureDimIndex(int num_dims, TensorFormat format) {
   return num_dims - 1;
 }
 
-// Returns the index of the `dim`-th spatial dimension.
+// Returns the index of the inner width dimension.
+inline int GetTensorInnerWidthDimIndex(int num_dims, TensorFormat format) {
+  DCHECK_EQ(format, FORMAT_NHWC_VECT_W);
+  return num_dims - 1;
+}
+
+// Returns the dimension index of the specified 'spatial_dim' within an
+// activation tensor. If format is NHWC_VECT_W and spatial_dim is 1, returns
+// the index of the outer width dimension (i.e. dimension 2, whose size would
+// be width / 4 in this case).
 inline int GetTensorSpatialDimIndex(int num_dims, TensorFormat format,
-                                    int dim) {
-  CHECK(dim >= 0 && dim < GetTensorSpatialDims(num_dims, format))
-      << dim << " " << num_dims << " " << ToString(format);
+                                    int spatial_dim) {
+  CHECK(spatial_dim >= 0 &&
+        spatial_dim < GetTensorSpatialDims(num_dims, format))
+      << spatial_dim << " " << num_dims << " " << ToString(format);
   switch (format) {
     case FORMAT_NHWC:
-      return dim + 1;
+    case FORMAT_NHWC_VECT_W:
+      return spatial_dim + 1;
     case FORMAT_NCHW:
     case FORMAT_NCHW_VECT_C:
-      return dim + 2;
+      return spatial_dim + 2;
     default:
       LOG(FATAL) << "Unknown format " << format;
       return -1;  // Avoid compiler warning about missing return value
   }
 }
 
-// Returns the index of the `dim`-th spatial dimension.
 inline int GetFilterTensorSpatialDimIndex(int num_dims,
                                           FilterTensorFormat format, int dim) {
   CHECK(dim >= 0 && dim < GetFilterTensorSpatialDims(num_dims, format))
@@ -246,7 +283,7 @@ inline int GetFilterTensorOutputChannelsDimIndex(int num_dims,
 // the outer channel dimension (i.e. 1).
 template <int NUM_SPATIAL_DIMS>
 inline int32 GetTensorDimIndex(TensorFormat format, char dimension) {
-  if (format == FORMAT_NHWC) {
+  if (format == FORMAT_NHWC || format == FORMAT_NHWC_VECT_W) {
     // clang-format off
     switch (dimension) {
       case 'N': return 0;
@@ -404,28 +441,37 @@ string GetConvnet3dDataFormatAttrString();
 string GetConvnetFilterFormatAttrString();
 string GetConvnet3dFilterFormatAttrString();
 
-// Return a tensor shape for the given format. Works for both 2D and 3D
-// operations. If format is FORMAT_NCHW_VECT_C, the output TensorShape has rank
-// spatial.size()+3 (N,C,spatial,InnerC); otherwise, it has rank
-// spatial.size()+2 (e.g. N,C,spatial or N,spatial,C).
+// Returns a tensor shape for the specified format and dimension sizes.
+// Works for both 2D and 3D operations. The output shapes are as follows:
+// FORMAT_NHWC:        (N, spatial, C); rank = spatial.size() + 2
+// FORMAT_NCHW:        (N, C, spatial); rank = spatial.size() + 2
+// FORMAT_NCHW_VECT_C: (N, C, spatial, InnerC); rank = spatial.size() + 3
+// FORMAT_NHWC_VECT_W: (N, spatial, C, InnerW); rank = spatial.size() + 3
 inline TensorShape ShapeFromFormat(TensorFormat format, int64 N,
                                    gtl::ArraySlice<int64> spatial, int64 C) {
   const int dims = GetTensorDimsFromSpatialDims(spatial.size(), format);
   gtl::InlinedVector<int64, 6> dim_sizes(dims);
   dim_sizes[GetTensorBatchDimIndex(dims, format)] = N;
   for (int dim = 0; static_cast<size_t>(dim) < spatial.size(); dim++) {
-    dim_sizes[GetTensorSpatialDimIndex(dims, format, dim)] = spatial[dim];
+    auto dim_size = spatial[dim];
+    if (format == FORMAT_NHWC_VECT_W && dim == spatial.size() - 1) {
+      CHECK_EQ(0, dim_size % 4)
+          << "FORMAT_NHWC_VECT_W requires W to be a multiple of 4, but W="
+          << dim_size;
+      dim_sizes[GetTensorInnerWidthDimIndex(dims, format)] = 4;
+      dim_size /= 4;
+    }
+    dim_sizes[GetTensorSpatialDimIndex(dims, format, dim)] = dim_size;
   }
 
   int feature_index = GetTensorFeatureDimIndex(dims, format);
   if (format == FORMAT_NCHW_VECT_C) {
     CHECK_EQ(0, C % 4) << "NCHW_VECT_C requires C to be a multiple of 4, but C="
                        << C;
-    dim_sizes[feature_index] = C / 4;
+    C /= 4;
     dim_sizes[GetTensorInnerFeatureDimIndex(dims, format)] = 4;
-  } else {
-    dim_sizes[feature_index] = C;
   }
+  dim_sizes[feature_index] = C;
   return TensorShape(dim_sizes);
 }
 
@@ -478,19 +524,18 @@ inline TensorShape ShapeFromFormat(TensorFormat dst_format,
   const int64 batch = GetTensorDim(src_shape, src_format, 'N');
   const int64 channels = GetTensorDim(src_shape, src_format, 'C') *
                          (src_format == FORMAT_NCHW_VECT_C ? 4 : 1);
-
-  if (GetTensorSpatialDims(src_shape.dims(), src_format) == 3) {
-    return ShapeFromFormat(dst_format, batch,
-                           {{GetTensorDim(src_shape, src_format, '0'),
-                             GetTensorDim(src_shape, src_format, '1'),
-                             GetTensorDim(src_shape, src_format, '2')}},
-                           channels);
+  const int num_src_spatial_dims =
+      GetTensorSpatialDims(src_shape.dims(), src_format);
+  std::vector<int64> spatial_dims(num_src_spatial_dims);
+  for (int spatial_dim = 0; spatial_dim < num_src_spatial_dims; ++spatial_dim) {
+    spatial_dims[spatial_dim] =
+        gtl::ArraySlice<int64>(src_shape.dim_sizes())[GetTensorSpatialDimIndex(
+            src_shape.dims(), src_format, spatial_dim)];
   }
-
-  return ShapeFromFormat(dst_format, batch,
-                         {{GetTensorDim(src_shape, src_format, 'H'),
-                           GetTensorDim(src_shape, src_format, 'W')}},
-                         channels);
+  if (src_format == FORMAT_NHWC_VECT_W) {
+    spatial_dims[num_src_spatial_dims - 1] *= 4;
+  }
+  return ShapeFromFormat(dst_format, batch, {spatial_dims}, channels);
 }
 
 // Returns a copy of the specified filter tensor 'src_shape' converted from
@@ -525,4 +570,4 @@ inline TensorShape ShapeFromFilterFormat(FilterTensorFormat dst_filter_format,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_TENSOR_FORMAT_H_
+#endif  // TENSORFLOW_CORE_UTIL_TENSOR_FORMAT_H_
diff --git a/tensorflow/core/util/tensor_format_test.cc b/tensorflow/core/util/tensor_format_test.cc
index 36698e03831379f7b7b3ac2b83fa9e9e2c733e44..93902290eb094db58c9b19987999c716a3d16c79 100644
--- a/tensorflow/core/util/tensor_format_test.cc
+++ b/tensorflow/core/util/tensor_format_test.cc
@@ -29,6 +29,7 @@ std::pair<TensorFormat, const char*> test_data_formats[] = {
     EnumStringPair(FORMAT_NHWC),
     EnumStringPair(FORMAT_NCHW),
     EnumStringPair(FORMAT_NCHW_VECT_C),
+    EnumStringPair(FORMAT_NHWC_VECT_W),
 };
 
 std::pair<FilterTensorFormat, const char*> test_filter_formats[] = {
@@ -104,7 +105,8 @@ struct DimMaps {
 inline constexpr const TensorDimMap&
 GetTensorDimMap(const int num_spatial_dims, const TensorFormat format) {
   return
-      (format == FORMAT_NHWC) ? DimMaps::kTdmNHWC[num_spatial_dims] :
+      (format == FORMAT_NHWC ||
+       format == FORMAT_NHWC_VECT_W) ? DimMaps::kTdmNHWC[num_spatial_dims] :
       (format == FORMAT_NCHW ||
        format == FORMAT_NCHW_VECT_C) ? DimMaps::kTdmNCHW[num_spatial_dims]
                                      : DimMaps::kTdmInvalid;
diff --git a/tensorflow/docs_src/api_guides/python/contrib.framework.md b/tensorflow/docs_src/api_guides/python/contrib.framework.md
index f7f26e5626033cb18aa3b2cf7b6366ee3e2dd45d..6b4ce3a14d7e1f2712f33e1abff312c370417ed8 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.framework.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.framework.md
@@ -18,17 +18,20 @@ Framework utilities.
 *   @{tf.contrib.framework.with_same_shape}
 
 ## Deprecation
+
 *   @{tf.contrib.framework.deprecated}
 *   @{tf.contrib.framework.deprecated_args}
 *   @{tf.contrib.framework.deprecated_arg_values}
 
 ## Arg_Scope
+
 *   @{tf.contrib.framework.arg_scope}
 *   @{tf.contrib.framework.add_arg_scope}
 *   @{tf.contrib.framework.has_arg_scope}
 *   @{tf.contrib.framework.arg_scoped_arguments}
 
 ## Variables
+
 *   @{tf.contrib.framework.add_model_variable}
 *   @{tf.train.assert_global_step}
 *   @{tf.contrib.framework.assert_or_get_global_step}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.learn.md b/tensorflow/docs_src/api_guides/python/contrib.learn.md
index 8b2fffa2013da7e378ae1a71482be19fd88e801f..03838dc5aede4ac9349162d5c9d44d80fcb8d912 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.learn.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.learn.md
@@ -25,6 +25,7 @@ Train and evaluate TensorFlow models.
 *   @{tf.contrib.learn.LogisticRegressor}
 
 ## Distributed training utilities
+
 *   @{tf.contrib.learn.Experiment}
 *   @{tf.contrib.learn.ExportStrategy}
 *   @{tf.contrib.learn.TaskType}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
index 496d43dfd7e48c1ce4d98ed6e33b583c61953033..143919fd84b70be803f66693238cdb56de2b18f9 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
@@ -21,6 +21,7 @@ wrapper.  An instance of an `AttentionMechanism` is constructed with a
 ### Attention Mechanisms
 
 The two basic attention mechanisms are:
+
 *   @{tf.contrib.seq2seq.BahdanauAttention} (additive attention,
     [ref.](https://arxiv.org/abs/1409.0473))
 *   @{tf.contrib.seq2seq.LuongAttention} (multiplicative attention,
@@ -118,14 +119,17 @@ outputs, _ = tf.contrib.seq2seq.dynamic_decode(
 ```
 
 ### Decoder base class and functions
+
 *   @{tf.contrib.seq2seq.Decoder}
 *   @{tf.contrib.seq2seq.dynamic_decode}
 
 ### Basic Decoder
+
 *   @{tf.contrib.seq2seq.BasicDecoderOutput}
 *   @{tf.contrib.seq2seq.BasicDecoder}
 
 ### Decoder Helpers
+
 *   @{tf.contrib.seq2seq.Helper}
 *   @{tf.contrib.seq2seq.CustomHelper}
 *   @{tf.contrib.seq2seq.GreedyEmbeddingHelper}
diff --git a/tensorflow/docs_src/api_guides/python/meta_graph.md b/tensorflow/docs_src/api_guides/python/meta_graph.md
index 0eff9000931666dce742358a290f25bb2b5a7b16..f1c3adc22c3546260e68a5aa7b302aa91493915b 100644
--- a/tensorflow/docs_src/api_guides/python/meta_graph.md
+++ b/tensorflow/docs_src/api_guides/python/meta_graph.md
@@ -22,14 +22,14 @@ protocol buffer. It contains the following fields:
 * [`GraphDef`](https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto) for describing the graph.
 * [`SaverDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/saver.proto) for the saver.
 * [`CollectionDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/meta_graph.proto)
-map that further describes additional components of the model, such as
+map that further describes additional components of the model such as
 @{$python/state_ops$`Variables`},
-@{tf.train.QueueRunner}, etc.  In order for a Python object to be serialized
+@{tf.train.QueueRunner}, etc.
+
+In order for a Python object to be serialized
 to and from `MetaGraphDef`, the Python class must implement `to_proto()` and
 `from_proto()` methods, and register them with the system using
-`register_proto_function`.
-
-  For example,
+`register_proto_function`. For example:
 
   ```Python
   def to_proto(self, export_scope=None):
diff --git a/tensorflow/docs_src/api_guides/python/train.md b/tensorflow/docs_src/api_guides/python/train.md
index 80fe9784de64c3b3f1843cad07bb02507f682eaf..cbc50529469b32afbb9c0646a0cfd27627563f87 100644
--- a/tensorflow/docs_src/api_guides/python/train.md
+++ b/tensorflow/docs_src/api_guides/python/train.md
@@ -54,6 +54,7 @@ gradients.
 *   @{tf.global_norm}
 
 ## Decaying the learning rate
+
 *   @{tf.train.exponential_decay}
 *   @{tf.train.inverse_time_decay}
 *   @{tf.train.natural_exp_decay}
diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md
index d92f5775fafa394d795dd451077a721a2ecbb259..0b07d413da3c6dae03301b5dda95b6ef6443575d 100644
--- a/tensorflow/docs_src/community/groups.md
+++ b/tensorflow/docs_src/community/groups.md
@@ -1,17 +1,38 @@
 # User Groups
 
-TensorFlow has communities around the world.
+TensorFlow has communities around the world. [Submit your community!](https://docs.google.com/forms/d/e/1FAIpQLSc_RQIUYtVgLLihzATaO_WUXkEyBDE_OoRoOXYDPmBEvHuEBA/viewform)
 
 ## Asia
 
-* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_
-* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_
-* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_
+* [TensorFlow China community](https://www.tensorflowers.cn)
+* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/)
+* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/)
+* [Soleil Data Dojo](https://soleildatadojo.connpass.com/)
 * [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/)
+* [TensorFlow Philippines Community](https://www.facebook.com/groups/TensorFlowPH/)
+* [TensorFlow and Deep Learning Singapore](https://www.meetup.com/TensorFlow-and-Deep-Learning-Singapore/)
+* [TensorFlow India](https://www.facebook.com/tensorflowindia)
 
 
 ## Europe
 
 * [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/)
 * [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/)
+* [Tensorflow Belgium](https://www.meetup.com/TensorFlow-Belgium)
+* [TensorFlow x Rome Meetup](https://www.meetup.com/it-IT/TensorFlow-x-Rome-Meetup)
+* [TensorFlow London](https://www.meetup.com/TensorFlow-London/)
+* [TensorFlow Edinburgh](https://www.meetup.com/tensorflow-edinburgh/)
 
+
+## America
+
+* [TensorFlow Buenos Aires](https://www.meetup.com/TensorFlow-Buenos-Aires/)
+
+
+## Oceania
+* [Melbourne TensorFlow Meetup](https://www.meetup.com/Melbourne-TensorFlow-Meetup)
+
+
+## Africa
+
+* [TensorFlow Tunis Meetup](https://www.meetup.com/fr-FR/TensorFlow-Tunis-Meetup/)
diff --git a/tensorflow/docs_src/community/swift.md b/tensorflow/docs_src/community/swift.md
index e5a0f02a8c36339946980ded7a7e989d71de6a8a..d1625d3b93e2a95229afd897653c113bd45da2ea 100644
--- a/tensorflow/docs_src/community/swift.md
+++ b/tensorflow/docs_src/community/swift.md
@@ -8,7 +8,7 @@ Welcome to the Swift for TensorFlow development community!
 
 Swift for TensorFlow is a new way to develop machine learning models. It
 gives you the power of
-[TensorFlow](programmers_guide/eager) directly
+[TensorFlow](https://www.tensorflow.org) directly
 integrated into the [Swift programming language](https://swift.org/about).
 With Swift, you can write the following imperative code, and Swift
 automatically turns it into **a single TensorFlow Graph** and runs it
@@ -18,7 +18,7 @@ with the full performance of TensorFlow Sessions on CPU, GPU and
 ```swift
 import TensorFlow
 
-var x = Tensor([[1, 2], [3, 4]])
+var x = Tensor<Float>([[1, 2], [3, 4]])
 
 for i in 1...5 {
   x += x ⊗ x
@@ -28,8 +28,8 @@ print(x)
 ```
 
 Swift combines the flexibility of
-[Eager Execution](programmers_guide/eager) with the
-high performance of [Graphs and Sessions](programmers_guide/graphs).
+[Eager Execution](https://www.tensorflow.org/programmers_guide/eager) with the
+high performance of [Graphs and Sessions](https://www.tensorflow.org/programmers_guide/graphs).
 Behind the scenes, Swift analyzes your Tensor code and automatically builds
 graphs for you. Swift also catches type errors and shape mismatches before
 running your code, and has [Automatic Differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)
diff --git a/tensorflow/docs_src/deploy/index.md b/tensorflow/docs_src/deploy/index.md
index 61edba04b46b7af11e97dff4cc7240436f98b471..33220041895acdbb90781c1ee618c06e44f49bf9 100644
--- a/tensorflow/docs_src/deploy/index.md
+++ b/tensorflow/docs_src/deploy/index.md
@@ -15,3 +15,7 @@ the following documents:
     out-of-the-box integration with TensorFlow models.
     [Source code for TensorFlow Serving](https://github.com/tensorflow/serving)
     is available on GitHub.
+
+[TensorFlow Extended (TFX)](/tfx) is an end-to-end machine learning platform for
+TensorFlow. Implemented at Google, we've open sourced some TFX libraries with the
+rest of the system to come.
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index c3795492cef7d6f26de157ce40544280c8194e59..1b028be4ea16af89b8aac8a8a73e9ceca9e842c5 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -863,48 +863,53 @@ REGISTER_OP("ZeroOut")
 Instead of writing another `OpKernel` with redundant code as above, often you
 will be able to use a C++ template instead.  You will still have one kernel
 registration (`REGISTER_KERNEL_BUILDER` call) per overload.
-<pre class="prettyprint"><code class="lang-cpp">
-<b>template &lt;typename T&gt;</b>
+```c++
+template <typename T>
 class ZeroOutOp : public OpKernel {
  public:
-  explicit ZeroOutOp(OpKernelConstruction\* context) : OpKernel(context) {}<br/>
-  void Compute(OpKernelContext\* context) override {
+  explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) {}
+  
+  void Compute(OpKernelContext* context) override {
     // Grab the input tensor
-    const Tensor& input\_tensor = context-&gt;input(0);
-    auto input = input\_tensor.flat<b>&lt;T&gt;</b>();<br/>
+    const Tensor& input_tensor = context->input(0);
+    auto input = input_tensor.flat<T>();
+    
     // Create an output tensor
     Tensor* output = NULL;
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;allocate\_output(0, input_tensor.shape(), &output));
-    auto output\_flat = output-&gt;template flat<b>&lt;T&gt;</b>();<br/>
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input_tensor.shape(), &output));
+    auto output_flat = output->template flat<T>();
+    
     // Set all the elements of the output tensor to 0
     const int N = input.size();
-    for (int i = 0; i &lt; N; i++) {
-      output\_flat(i) = 0;
-    }<br/>
+    for (int i = 0; i < N; i++) {
+      output_flat(i) = 0;
+    }
+    
     // Preserve the first input value
-    if (N &gt; 0) output\_flat(0) = input(0);
+    if (N > 0) output_flat(0) = input(0);
   }
-};<br/>
-// Note that TypeConstraint&lt;int32&gt;("T") means that attr "T" (defined
+};
+
+// Note that TypeConstraint<int32>("T") means that attr "T" (defined
 // in the op registration above) must be "int32" to use this template
-// instantiation.</b>
-REGISTER\_KERNEL\_BUILDER(
+// instantiation.
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    .TypeConstraint&lt;int32&gt;("T"),
-    <b>ZeroOutOp&lt;int32&gt;</b>);
-REGISTER\_KERNEL\_BUILDER(
+    .Device(DEVICE_CPU)
+    .TypeConstraint<int32>("T"),
+    ZeroOutOp<int32>);
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    .TypeConstraint&lt;float&gt;("T"),
-    <b>ZeroOutOp&lt;float&gt;</b>);
-<b>REGISTER\_KERNEL\_BUILDER(
+    .Device(DEVICE_CPU)
+    .TypeConstraint<float>("T"),
+    ZeroOutOp<float>);
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    .TypeConstraint&lt;double&gt;("T"),
-    ZeroOutOp&lt;double&gt;);
-</b></code></pre>
+    .Device(DEVICE_CPU)
+    .TypeConstraint<double>("T"),
+    ZeroOutOp<double>);
+```
 
 If you have more than a couple overloads, you can put the registration in a
 macro.
diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md
index 79c26679793f2f2830ce772708bb72c143b1ec9a..f152813442903a4cbe2cca8b56c90b49d9718c3b 100644
--- a/tensorflow/docs_src/get_started/feature_columns.md
+++ b/tensorflow/docs_src/get_started/feature_columns.md
@@ -528,10 +528,10 @@ suggested by the following snippet:
 categorical_column = ... # Create any categorical column
 
 # Represent the categorical column as an embedding column.
-# This means creating a one-hot vector with one element for each category.
+# This means creating an embedding vector lookup table with one element for each category.
 embedding_column = tf.feature_column.embedding_column(
     categorical_column=categorical_column,
-    dimension=dimension_of_embedding_vector)
+    dimension=embedding_dimensions)
 ```
 
 @{$programmers_guide/embedding$Embeddings} is a significant topic within machine
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
index 4be7e508f94074f20d07e271259bf77074dd19e3..15853bc0abbf9bab0c96f34475890056c3291045 100644
--- a/tensorflow/docs_src/get_started/premade_estimators.md
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -177,7 +177,7 @@ other features so you can concentrate on your model. For more details see
 
 An Estimator is any class derived from @{tf.estimator.Estimator}. TensorFlow
 provides a collection of
-[pre-made Estimators](https://developers.google.com/machine-learning/glossary/#pre-made_Estimator)
+[pre-made Estimators](https://developers.google.com/machine-learning/glossary/#premade_Estimator)
 (for example, `LinearRegressor`) to implement common ML algorithms. Beyond
 those, you may write your own
 [custom Estimators](https://developers.google.com/machine-learning/glossary/#custom_Estimator).
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index e1948c71fdf45b853cfd8e27fe9a0a110c8d464b..3b9381625fd675ff30cad50e16737db1389b8cad 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -517,7 +517,7 @@ on your system:
   from source. To use the TensorFlow binaries, version 3.5 or higher is required.
   See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a
   list of supported GPU cards.
-* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA
+* [GPU drivers](http://nvidia.com/drivers) that support your version of the CUDA
   Toolkit.
 * The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This
   library provides advanced profiling support. To install this library,
@@ -533,12 +533,6 @@ Add this path to the `LD_LIBRARY_PATH` environmental variable:
   <code class="devsite-terminal">export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64</code>
 </pre>
 
-For CUDA Toolkit <= 7.5 use:
-
-<pre class="prettyprint lang-bsh">
-  <code class="devsite-terminal">sudo apt-get install libcupti-dev</code>
-</pre>
-
 * *OPTIONAL*:  For optimized performance during inference, install
   *NVIDIA&nbsp;TensorRT&nbsp;3.0*. To install the minimal amount of TensorRT
   runtime components required to use with the pre-built `tensorflow-gpu` package:
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index e2d16470420f0c74776baa6c31729ed67804fcb0..5ba522b436137bc5588382fd79f7559c6e9d11ed 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -133,7 +133,7 @@ The following NVIDIA <i>hardware</i> must be installed on your system:
 
 The following NVIDIA <i>software</i> must be installed on your system:
 
-  * [CUDA Toolkit](http://nvidia.com/cuda) (>= 7.0). We recommend version 9.0.
+  * [CUDA Toolkit](http://nvidia.com/cuda) (>= 8.0). We recommend version 9.0.
     For details, see
     [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
     Ensure that you append the relevant CUDA pathnames to the
@@ -141,7 +141,7 @@ The following NVIDIA <i>software</i> must be installed on your system:
     NVIDIA documentation.
   * [GPU drivers](http://nvidia.com/driver) supporting your version of the CUDA
     Toolkit.
-  * [cuDNN SDK](http://developer.nvidia.com/cudnn) (>= v3). We recommend version 7.0. For details, see
+  * [cuDNN SDK](http://developer.nvidia.com/cudnn) (>= 6.0). We recommend version 7.0. For details, see
     [NVIDIA's documentation](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/).
   * [CUPTI](http://docs.nvidia.com/cuda/cupti/) ships with the CUDA Toolkit, but
     you also need to append its path to the `LD_LIBRARY_PATH` environment
@@ -195,32 +195,6 @@ plan on executing tasks directly with `bazel` , without the pip installation,
 you may need to install additional python packages. For example, you should
 `pip install mock enum34` before running TensorFlow's tests with bazel.
 
-### Optional: install TensorFlow for GPU prerequisites
-
-If you do not have brew installed, install it by following
-[these instructions](http://brew.sh/).
-
-After installing brew, install GNU coreutils by issuing the following command:
-
-<pre>$ <b>brew install coreutils</b></pre>
-
-If you want to compile tensorflow and have XCode 7.3 and CUDA 7.5 installed,
-note that Xcode 7.3 is not yet compatible with CUDA 7.5.  To remedy this
-problem, do either of the following:
-
-  * Upgrade to CUDA 8.0.
-  * Download Xcode 7.2 and select it as your default by issuing the following
-    command:
-
-    <pre> $ <b>sudo xcode-select -s /Applications/Xcode-7.2/Xcode.app</b></pre>
-
-**NOTE:** Your system must fulfill the NVIDIA software requirements described
-in one of the following documents:
-
-  * @{$install_linux#NVIDIARequirements$Installing TensorFlow on Linux}
-  * @{$install_mac#NVIDIARequirements$Installing TensorFlow on Mac OS}
-
-
 <a name="ConfigureInstallation"></a>
 ## Configure the installation
 
@@ -280,7 +254,7 @@ Do you wish to build TensorFlow with CUDA support? [y/N] <b>Y</b>
 CUDA support will be enabled for TensorFlow
 Do you want to use clang as CUDA compiler? [y/N]
 nvcc will be used as CUDA compiler
-Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
+Please specify the CUDA SDK version you want to use. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
 Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
 Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
 Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7</b>
diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/docs_src/mobile/tflite/index.md
index 01881ccf3bb15b5df7f5659457650421343f8270..562203482763991c412b523bd261b3163d361134 100644
--- a/tensorflow/docs_src/mobile/tflite/index.md
+++ b/tensorflow/docs_src/mobile/tflite/index.md
@@ -155,7 +155,7 @@ retraining for both floating point and quantized inference.
 
 The following diagram shows the architectural design of TensorFlow Lite:
 
-<img src="/images/tflite-architecture.jpg"
+<img src="https://www.tensorflow.org/images/tflite-architecture.jpg"
      alt="TensorFlow Lite architecture diagram"
      style="max-width:600px;">
 
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index b1796cf9b2d0bf7459e70ab542b6e6fcb203667a..cb0f5ca9242098d06aa0a9898e4a3774fab527b8 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -78,7 +78,7 @@ training CIFAR-10 illustrates the use of the `tf.data` API along with
 The `tf.data` API utilizes C++ multi-threading and has a much lower overhead
 than the Python-based `queue_runner` that is limited by Python's multi-threading
 performance. A detailed performance guide for the `tf.data` API can be found
-[here](@{$datasets_performance}).
+@{$datasets_performance$here}.
 
 While feeding data using a `feed_dict` offers a high level of flexibility, in
 general `feed_dict` does not provide a scalable solution. If only a single GPU
diff --git a/tensorflow/docs_src/performance/xla/broadcasting.md b/tensorflow/docs_src/performance/xla/broadcasting.md
index ca3bddf758cf64e7c580f9babfe559ae23708705..eaa709c2f84245341044b93060f932a22fbe54c7 100644
--- a/tensorflow/docs_src/performance/xla/broadcasting.md
+++ b/tensorflow/docs_src/performance/xla/broadcasting.md
@@ -97,9 +97,9 @@ shape is broadcast into a larger rank shape. For example, given a 2x3x4 cuboid
 and a 3x4 matrix, a broadcasting tuple (1,2) means matching the matrix to
 dimensions 1 and 2 of the cuboid.
 
-This type of broadcast is used in the binary ops in `ComputationBuilder`, if the
+This type of broadcast is used in the binary ops in `XlaBuilder`, if the
 `broadcast_dimensions` argument is given. For example, see
-[ComputationBuilder::Add](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.cc).
+[XlaBuilder::Add](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.cc).
 In the XLA source code, this type of broadcasting is sometimes called "InDim"
 broadcasting.
 
diff --git a/tensorflow/docs_src/performance/xla/jit.md b/tensorflow/docs_src/performance/xla/jit.md
index d9a979ccbd31773b9d227ff946486706844a8f81..6724d1eaf8f85320b963eddc37947d69dcaa8471 100644
--- a/tensorflow/docs_src/performance/xla/jit.md
+++ b/tensorflow/docs_src/performance/xla/jit.md
@@ -137,12 +137,12 @@ TF_XLA_FLAGS=--xla_generate_hlo_graph=.* python mnist_softmax_xla.py
 ```
 
 Open the timeline file created (`timeline.ctf.json`).  The rendered timeline
-should look similar to the picture below with one long bar labeled `_XlaLaunch`.
+should look similar to the picture below with one long bar labeled `XlaLaunch`.
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
   <img style="width:100%" src="https://www.tensorflow.org/images/jit_timeline_gpu_xla.png">
 </div>
 
-To understand what is happening in `_XlaLaunch`, look at the console output for
+To understand what is happening in `XlaLaunch`, look at the console output for
 statements similar to the following:
 
 ```shell
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index f530fe1206c0e9e67816d2c9a2ba276858314737..5887c3d88bf8c7844349cc1cc0db224586e56719 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -1,7 +1,7 @@
 # Operation Semantics
 
 The following describes the semantics of operations defined in the
-[`ComputationBuilder`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h)
+[`XlaBuilder`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
 interface. Typically, these operations map one-to-one to operations defined in
 the RPC interface in
 [`xla_data.proto`](https://www.tensorflow.org/code/tensorflow/compiler/xla/xla_data.proto).
@@ -16,7 +16,7 @@ and familiar names; for example a *vector* is a 1-dimensional array and a
 ## BatchNormGrad
 
 See also
-[`ComputationBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h)
+[`XlaBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
 and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
 for a detailed description of the algorithm.
 
@@ -26,14 +26,14 @@ Calculates gradients of batch norm.
 
 | Arguments       | Type                    | Semantics                        |
 | --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
+| `operand`       | `XlaOp`                 | n dimensional array to be        |
 :                 :                         : normalized (x)                   :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
+| `scale`         | `XlaOp`                 | 1 dimensional array              |
 :                 :                         : (\\(\gamma\\))                   :
-| `mean`          | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))  |
-| `variance`      | `ComputationDataHandle` | 1 dimensional array              |
+| `mean`          | `XlaOp`                 | 1 dimensional array (\\(\mu\\))  |
+| `variance`      | `XlaOp`                 | 1 dimensional array              |
 :                 :                         : (\\(\sigma^2\\))                 :
-| `grad_output`   | `ComputationDataHandle` | Gradients passed to              |
+| `grad_output`   | `XlaOp`                 | Gradients passed to              |
 :                 :                         : `BatchNormTraining`              :
 :                 :                         : (\\( \nabla y\\))                :
 | `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
@@ -70,35 +70,33 @@ The output type is a tuple of three handles:
 
 | Outputs        | Type                    | Semantics                         |
 | -------------  | ----------------------- | --------------------------------- |
-| `grad_operand` | `ComputationDataHandle` | gradient with respect to input    |
+| `grad_operand` | `XlaOp`                 | gradient with respect to input    |
 :                :                         : `operand` (\\( \nabla x\\))       :
-| `grad_scale`   | `ComputationDataHandle` | gradient with respect to input    |
+| `grad_scale`   | `XlaOp`                 | gradient with respect to input    |
 :                :                         : `scale` (\\( \nabla \gamma\\))    :
-| `grad_offset`  | `ComputationDataHandle` | gradient with respect to input    |
+| `grad_offset`  | `XlaOp`                 | gradient with respect to input    |
 :                :                         : `offset`(\\( \nabla \beta\\))     :
 
 ## BatchNormInference
 
 See also
-[`ComputationBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h) and
-[the original batch normalization paper](https://arxiv.org/abs/1502.03167)
+[`XlaBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
 for a detailed description of the algorithm.
 
 Normalizes an array across batch and spatial dimensions.
 
 <b> `BatchNormInference(operand, scale, offset, mean, variance, epsilon, feature_index)` </b>
 
-| Arguments       | Type                    | Semantics                       |
-| --------------  | ----------------------- | ------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be       |
-:                 :                         : normalized                      :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array             |
-| `offset`        | `ComputationDataHandle` | 1 dimensional array             |
-| `mean`          | `ComputationDataHandle` | 1 dimensional array             |
-| `variance`      | `ComputationDataHandle` | 1 dimensional array             |
-| `epsilon`       | `float`                 | Epsilon value                   |
-| `feature_index` | `int64`                 | Index to feature dimension in   |
-:                 :                         : `operand`                       :
+Arguments       | Type    | Semantics
+--------------- | ------- | ---------------------------------------
+`operand`       | `XlaOp` | n dimensional array to be normalized
+`scale`         | `XlaOp` | 1 dimensional array
+`offset`        | `XlaOp` | 1 dimensional array
+`mean`          | `XlaOp` | 1 dimensional array
+`variance`      | `XlaOp` | 1 dimensional array
+`epsilon`       | `float` | Epsilon value
+`feature_index` | `int64` | Index to feature dimension in `operand`
 
 For each feature in the feature dimension (`feature_index` is the index for the
 feature dimension in `operand`), the operation calculates the mean and variance
@@ -117,25 +115,21 @@ The output is an n-dimensional, normalized array with the same shape as input
 ## BatchNormTraining
 
 See also
-[`ComputationBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h) and
-[`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
+[`XlaBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+and [`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
 for a detailed description of the algorithm.
 
 Normalizes an array across batch and spatial dimensions.
 
 <b> `BatchNormTraining(operand, scale, offset, epsilon, feature_index)` </b>
 
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
-:                 :                         : normalized (x)                   :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\gamma\\))                   :
-| `offset`        | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\beta\\))                    :
-| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
-| `feature_index` | `int64`                 | Index to feature dimension       |
-:                 :                         : in `operand`                     :
+Arguments       | Type    | Semantics
+--------------- | ------- | ----------------------------------------
+`operand`       | `XlaOp` | n dimensional array to be normalized (x)
+`scale`         | `XlaOp` | 1 dimensional array (\\(\gamma\\))
+`offset`        | `XlaOp` | 1 dimensional array (\\(\beta\\))
+`epsilon`       | `float` | Epsilon value (\\(\epsilon\\))
+`feature_index` | `int64` | Index to feature dimension in `operand`
 
 For each feature in the feature dimension (`feature_index` is the index for the
 feature dimension in `operand`), the operation calculates the mean and variance
@@ -158,14 +152,14 @@ contains `m` elements with `w` and `h` as the size of spatial dimensions
 
 The epsilon value, usually a small number, is added to avoid divide-by-zero errors.
 
-The output type is a tuple of three `ComputationDataHandle`s:
+The output type is a tuple of three `XlaOp`s:
 
 | Outputs      | Type                    | Semantics                            |
 | ------------ | ----------------------- | -------------------------------------|
-| `output`     | `ComputationDataHandle` | n dimensional array with the same    |
+| `output`     | `XlaOp`                 | n dimensional array with the same    |
 :              :                         : shape as input `operand` (y)         :
-| `batch_mean` | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))      |
-| `batch_var`  | `ComputationDataHandle` | 1 dimensional array (\\(\sigma^2\\)) |
+| `batch_mean` | `XlaOp`                 | 1 dimensional array (\\(\mu\\))      |
+| `batch_var`  | `XlaOp`                 | 1 dimensional array (\\(\sigma^2\\)) |
 
 The `batch_mean` and `batch_var` are moments calculated across the batch and
 spatial dimensions using the formulas above.
@@ -173,7 +167,7 @@ spatial dimensions using the formulas above.
 ## BitcastConvertType
 
 See also
-[`ComputationBuilder::BitcastConvertType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::BitcastConvertType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Similar to a `tf.bitcast` in TensorFlow, performs an element-wise bitcast
 operation from a data shape to a target shape. The dimensions must match, and
@@ -183,10 +177,10 @@ with different floating-point representations will give different results.
 
 <b> `BitcastConvertType(operand, new_element_type)` </b>
 
-Arguments          | Type                    | Semantics
------------------- | ----------------------- | ---------------------------
-`operand`          | `ComputationDataHandle` | array of type T with dims D
-`new_element_type` | `PrimitiveType`         | type U
+Arguments          | Type            | Semantics
+------------------ | --------------- | ---------------------------
+`operand`          | `XlaOp`         | array of type T with dims D
+`new_element_type` | `PrimitiveType` | type U
 
 The dimensions of the operand and the target shape must match. The bit-width of
 the source and destination element types must be equal. The source
@@ -195,16 +189,16 @@ and destination element types must not be tuples.
 ## Broadcast
 
 See also
-[`ComputationBuilder::Broadcast`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Broadcast`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Adds dimensions to an array by duplicating the data in the array.
 
 <b> `Broadcast(operand, broadcast_sizes)` </b>
 
-Arguments         | Type                    | Semantics
------------------ | ----------------------- | -------------------------------
-`operand`         | `ComputationDataHandle` | The array to duplicate
-`broadcast_sizes` | `ArraySlice<int64>`     | The sizes of the new dimensions
+Arguments         | Type                | Semantics
+----------------- | ------------------- | -------------------------------
+`operand`         | `XlaOp`             | The array to duplicate
+`broadcast_sizes` | `ArraySlice<int64>` | The sizes of the new dimensions
 
 The new dimensions are inserted on the left, i.e. if `broadcast_sizes` has
 values `{a0, ..., aN}` and the operand shape has dimensions `{b0, ..., bM}` then
@@ -223,19 +217,18 @@ For example, if `operand` is a scalar `f32` with value `2.0f`, and
 ## Call
 
 See also
-[`ComputationBuilder::Call`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Call`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Invokes a computation with the given arguments.
 
 <b> `Call(computation, args...)` </b>
 
-| Arguments     | Type                     | Semantics                        |
-| ------------- | ------------------------ | -------------------------------- |
-| `computation` | `Computation`            | computation of type `T_0, T_1,   |
-:               :                          : ..., T_N -> S` with N parameters :
-:               :                          : of arbitrary type                :
-| `args`        | sequence of N            | N arguments of arbitrary type    |
-:               : `ComputationDataHandle`s :                                  :
+| Arguments     | Type                   | Semantics                           |
+| ------------- | ---------------------- | ----------------------------------- |
+| `computation` | `XlaComputation`       | computation of type `T_0, T_1, ..., |
+:               :                        : T_N -> S` with N parameters of      :
+:               :                        : arbitrary type                      :
+| `args`        | sequence of N `XlaOp`s | N arguments of arbitrary type       |
 
 The arity and types of the `args` must match the parameters of the
 `computation`. It is allowed to have no `args`.
@@ -243,17 +236,17 @@ The arity and types of the `args` must match the parameters of the
 ## Clamp
 
 See also
-[`ComputationBuilder::Clamp`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Clamp`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Clamps an operand to within the range between a minimum and maximum value.
 
 <b> `Clamp(min, operand, max)` </b>
 
-| Arguments     | Type                    | Semantics                        |
-| ------------- | ----------------------- | -------------------------------- |
-| `min`         | `ComputationDataHandle` | array of type T                  |
-| `operand`     | `ComputationDataHandle` | array of type T                  |
-| `max`         | `ComputationDataHandle` | array of type T                  |
+Arguments | Type    | Semantics
+--------- | ------- | ---------------
+`min`     | `XlaOp` | array of type T
+`operand` | `XlaOp` | array of type T
+`max`     | `XlaOp` | array of type T
 
 Given an operand and minimum and maximum values, returns the operand if it is in
 the range between the minimum and maximum, else returns the minimum value if the
@@ -276,18 +269,17 @@ Clamp(min, operand, max) = s32[3]{0, 5, 6};
 ## Collapse
 
 See also
-[`ComputationBuilder::Collapse`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h)
+[`XlaBuilder::Collapse`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
 and the @{tf.reshape} operation.
 
 Collapses dimensions of an array into one dimension.
 
 <b> `Collapse(operand, dimensions)` </b>
 
-| Arguments    | Type                    | Semantics                           |
-| ------------ | ----------------------- | ----------------------------------- |
-| `operand`    | `ComputationDataHandle` | array of type T                     |
-| `dimensions` | `int64` vector          | in-order, consecutive subset of T's |
-:              :                         : dimensions.                         :
+Arguments    | Type           | Semantics
+------------ | -------------- | -----------------------------------------------
+`operand`    | `XlaOp`        | array of type T
+`dimensions` | `int64` vector | in-order, consecutive subset of T's dimensions.
 
 Collapse replaces the given subset of the operand's dimensions by a single
 dimension. The input arguments are an arbitrary array of type T and a
@@ -340,7 +332,7 @@ then v12 == f32[8x3] {{10, 11, 12},
 ## Concatenate
 
 See also
-[`ComputationBuilder::ConcatInDim`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::ConcatInDim`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Concatenate composes an array from multiple array operands. The array is of the
 same rank as each of the input array operands (which must be of the same rank as
@@ -348,13 +340,13 @@ each other) and contains the arguments in the order that they were specified.
 
 <b> `Concatenate(operands..., dimension)` </b>
 
-| Arguments   | Type                    | Semantics                            |
-| ----------- | ----------------------- | ------------------------------------ |
-| `operands`  | sequence of N           | N arrays of type T with dimensions   |
-:             : `ComputationDataHandle` : [L0, L1, ...]. Requires N >= 1.      :
-| `dimension` | `int64`                 | A value in the interval `[0, N)`     |
-:             :                         : that names the dimension to be       :
-:             :                         : concatenated between the `operands`. :
+| Arguments   | Type                  | Semantics                              |
+| ----------- | --------------------- | -------------------------------------- |
+| `operands`  | sequence of N `XlaOp` | N arrays of type T with dimensions     |
+:             :                       : [L0, L1, ...]. Requires N >= 1.        :
+| `dimension` | `int64`               | A value in the interval `[0, N)` that  |
+:             :                       : names the dimension to be concatenated :
+:             :                       : between the `operands`.                :
 
 With the exception of `dimension` all dimensions must be the same. This is
 because XLA does not support "ragged" arrays. Also note that rank-0 values
@@ -395,20 +387,19 @@ Diagram:
 
 ## Conditional
 
-See also [`ComputationBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+See also
+[`XlaBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 <b> `Conditional(pred, true_operand, true_computation, false_operand,
-    false_computation)` </b>
-
-| Arguments           | Type                    | Semantics                   |
-| ------------------- | ----------------------- | --------------------------- |
-| `pred`              | `ComputationDataHandle` | Scalar of type `PRED`       |
-| `true_operand`      | `ComputationDataHandle` | Argument of type `T_0`      |
-| `true_computation`  | `Computation`           | Computation of type `T_0 -> |
-:                     :                         : S`                          :
-| `false_operand`     | `ComputationDataHandle` | Argument of type `T_1`      |
-| `false_computation` | `Computation`           | Computation of type `T_1 -> |
-:                     :                         : S`                          :
+false_computation)` </b>
+
+Arguments           | Type             | Semantics
+------------------- | ---------------- | ---------------------------------
+`pred`              | `XlaOp`          | Scalar of type `PRED`
+`true_operand`      | `XlaOp`          | Argument of type `T_0`
+`true_computation`  | `XlaComputation` | XlaComputation of type `T_0 -> S`
+`false_operand`     | `XlaOp`          | Argument of type `T_1`
+`false_computation` | `XlaComputation` | XlaComputation of type `T_1 -> S`
 
 Executes `true_computation` if `pred` is `true`, `false_computation` if `pred`
 is `false`, and returns the result.
@@ -425,7 +416,7 @@ executed depending on the value of `pred`.
 ## Conv (convolution)
 
 See also
-[`ComputationBuilder::Conv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Conv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 As ConvWithGeneralPadding, but the padding is specified in a short-hand way as
 either SAME or VALID. SAME padding pads the input (`lhs`) with zeroes so that
@@ -435,7 +426,7 @@ account. VALID padding simply means no padding.
 ## ConvWithGeneralPadding (convolution)
 
 See also
-[`ComputationBuilder::ConvWithGeneralPadding`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::ConvWithGeneralPadding`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Computes a convolution of the kind used in neural networks. Here, a convolution
 can be thought of as a n-dimensional window moving across a n-dimensional base
@@ -443,8 +434,8 @@ area and a computation is performed for each possible position of the window.
 
 | Arguments        | Type                    | Semantics                     |
 | ---------------- | ----------------------- | ----------------------------- |
-| `lhs`            | `ComputationDataHandle` | rank n+2 array of inputs      |
-| `rhs`            | `ComputationDataHandle` | rank n+2 array of kernel      |
+| `lhs`            | `XlaOp`                 | rank n+2 array of inputs      |
+| `rhs`            | `XlaOp`                 | rank n+2 array of kernel      |
 :                  :                         : weights                       :
 | `window_strides` | `ArraySlice<int64>`     | n-d array of kernel strides   |
 | `padding`        | `ArraySlice<pair<int64, | n-d array of (low, high)      |
@@ -547,7 +538,7 @@ for (b, oz, oy, ox) {  // output coordinates
 ## ConvertElementType
 
 See also
-[`ComputationBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Similar to an element-wise `static_cast` in C++, performs an element-wise
 conversion operation from a data shape to a target shape. The dimensions must
@@ -556,10 +547,10 @@ match, and the conversion is an element-wise one; e.g. `s32` elements become
 
 <b> `ConvertElementType(operand, new_element_type)` </b>
 
-Arguments          | Type                    | Semantics
------------------- | ----------------------- | ---------------------------
-`operand`          | `ComputationDataHandle` | array of type T with dims D
-`new_element_type` | `PrimitiveType`         | type U
+Arguments          | Type            | Semantics
+------------------ | --------------- | ---------------------------
+`operand`          | `XlaOp`         | array of type T with dims D
+`new_element_type` | `PrimitiveType` | type U
 
 The dimensions of the operand and the target shape must match. The source and
 destination element types must not be tuples.
@@ -581,15 +572,15 @@ then b == f32[3]{0.0, 1.0, 2.0}
 ## CrossReplicaSum
 
 See also
-[`ComputationBuilder::CrossReplicaSum`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::CrossReplicaSum`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Computes a sum across replicas.
 
 <b> `CrossReplicaSum(operand)` </b>
 
-| Arguments    | Type                    | Semantics                          |
-| ------------ | ----------------------- | ---------------------------------- |
-| `operand`    | `ComputationDataHandle` | Array to sum across replicas.      |
+Arguments | Type    | Semantics
+--------- | ------- | -----------------------------
+`operand` | `XlaOp` | Array to sum across replicas.
 
 The output shape is the same as the input shape. For example, if there are two
 replicas and the operand has the value `(1.0, 2.5)` and `(3.0, 5.25)`
@@ -607,21 +598,21 @@ than another.
 ## CustomCall
 
 See also
-[`ComputationBuilder::CustomCall`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::CustomCall`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Call a user-provided function within a computation.
 
 <b> `CustomCall(target_name, args..., shape)` </b>
 
-| Arguments     | Type                     | Semantics                        |
-| ------------- | ------------------------ | -------------------------------- |
-| `target_name` | `string`                 | Name of the function. A call     |
-:               :                          : instruction will be emitted      :
-:               :                          : which targets this symbol name.  :
-| `args`        | sequence of N            | N arguments of arbitrary type,   |
-:               : `ComputationDataHandle`s : which will be passed to the      :
-:               :                          : function.                        :
-| `shape`       | `Shape`                  | Output shape of the function     |
+| Arguments     | Type                   | Semantics                         |
+| ------------- | ---------------------- | --------------------------------- |
+| `target_name` | `string`               | Name of the function. A call      |
+:               :                        : instruction will be emitted which :
+:               :                        : targets this symbol name.         :
+| `args`        | sequence of N `XlaOp`s | N arguments of arbitrary type,    |
+:               :                        : which will be passed to the       :
+:               :                        : function.                         :
+| `shape`       | `Shape`                | Output shape of the function      |
 
 The function signature is the same, regardless of the arity or type of args:
 
@@ -668,14 +659,14 @@ idempotent.
 ## Dot
 
 See also
-[`ComputationBuilder::Dot`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Dot`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 <b> `Dot(lhs, rhs)` </b>
 
-Arguments | Type                    | Semantics
---------- | ----------------------- | ---------------
-`lhs`     | `ComputationDataHandle` | array of type T
-`rhs`     | `ComputationDataHandle` | array of type T
+Arguments | Type    | Semantics
+--------- | ------- | ---------------
+`lhs`     | `XlaOp` | array of type T
+`rhs`     | `XlaOp` | array of type T
 
 The exact semantics of this operation depend on the ranks of the operands:
 
@@ -697,15 +688,15 @@ multiplications or matrix/matrix multiplications.
 ## DotGeneral
 
 See also
-[`ComputationBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 <b> `DotGeneral(lhs, rhs, dimension_numbers)` </b>
 
-| Arguments | Type                    | Semantics
-| --------- | ----------------------- | ---------------
-| `lhs`     | `ComputationDataHandle` | array of type T
-| `rhs`     | `ComputationDataHandle` | array of type T
-| `dimension_numbers` | `DotDimensionNumbers` | array of type T
+Arguments           | Type                  | Semantics
+------------------- | --------------------- | ---------------
+`lhs`               | `XlaOp`               | array of type T
+`rhs`               | `XlaOp`               | array of type T
+`dimension_numbers` | `DotDimensionNumbers` | array of type T
 
 As Dot, but allows contracting and batch dimension numbers to be specified for
 both the 'lhs' and 'rhs'.
@@ -784,7 +775,7 @@ non-contracting/non-batch dimension.
 ## DynamicSlice
 
 See also
-[`ComputationBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 DynamicSlice extracts a sub-array from the input array at dynamic
 `start_indices`. The size of the slice in each dimension is passed in
@@ -796,22 +787,21 @@ calculation of 'start_indices') is currently implementation-defined.
 
 <b> `DynamicSlice(operand, start_indices, size_indices)` </b>
 
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
-| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
-:                 :                         : containing the starting indices  :
-:                 :                         : of the slice for each dimension. :
-:                 :                         : Value must be greater than or    :
-:                 :                         : equal to zero.                   :
-| `size_indices`  | `ArraySlice<int64>`     | List of N integers containing    |
-:                 :                         : the slice size for each          :
-:                 :                         : dimension. Each value must be    :
-:                 :                         : strictly greater than zero, and  :
-:                 :                         : start + size must be less than   :
-:                 :                         : or equal to the size of the      :
-:                 :                         : dimension to avoid wrapping      :
-:                 :                         : modulo dimension size.           :
+| Arguments       | Type                | Semantics                           |
+| --------------- | ------------------- | ----------------------------------- |
+| `operand`       | `XlaOp`             | N dimensional array of type T       |
+| `start_indices` | `XlaOp`             | Rank 1 array of N integers          |
+:                 :                     : containing the starting indices of  :
+:                 :                     : the slice for each dimension. Value :
+:                 :                     : must be greater than or equal to    :
+:                 :                     : zero.                               :
+| `size_indices`  | `ArraySlice<int64>` | List of N integers containing the   |
+:                 :                     : slice size for each dimension. Each :
+:                 :                     : value must be strictly greater than :
+:                 :                     : zero, and start + size must be less :
+:                 :                     : than or equal to the size of the    :
+:                 :                     : dimension to avoid wrapping modulo  :
+:                 :                     : dimension size.                     :
 
 1-dimensional example:
 
@@ -840,7 +830,7 @@ DynamicSlice(b, s, {2, 2}) produces:
 ## DynamicUpdateSlice
 
 See also
-[`ComputationBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 DynamicUpdateSlice generates a result which is the value of the input array
 `operand`, with a slice `update` overwritten at `start_indices`.
@@ -853,23 +843,19 @@ calculation of 'start_indices') is currently implementation-defined.
 
 <b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
 
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
-| `update`        | `ComputationDataHandle` | N dimensional array of type T    |
-:                 :                         : containing the slice update.     :
-:                 :                         : Each dimension of update shape   :
-:                 :                         : must be strictly greater than    :
-:                 :                         : zero, and start + update must be :
-:                 :                         : less than or equal to the operand:
-:                 :                         : size for each dimension to avoid :
-:                 :                         : generating out-of-bounds update  :
-:                 :                         : indices.                         :
-| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
-:                 :                         : containing the starting indices  :
-:                 :                         : of the slice for each dimension. :
-:                 :                         : Value must be greater than or    :
-:                 :                         : equal to zero.                   :
+| Arguments       | Type    | Semantics                                        |
+| --------------- | ------- | ------------------------------------------------ |
+| `operand`       | `XlaOp` | N dimensional array of type T                    |
+| `update`        | `XlaOp` | N dimensional array of type T containing the     |
+:                 :         : slice update. Each dimension of update shape     :
+:                 :         : must be strictly greater than zero, and start +  :
+:                 :         : update must be less than or equal to the operand :
+:                 :         : size for each dimension to avoid generating      :
+:                 :         : out-of-bounds update indices.                    :
+| `start_indices` | `XlaOp` | Rank 1 array of N integers containing the        |
+:                 :         : starting indices of the slice for each           :
+:                 :         : dimension. Value must be greater than or equal   :
+:                 :         : to zero.                                         :
 
 1-dimensional example:
 
@@ -907,7 +893,7 @@ DynamicUpdateSlice(b, u, s) produces:
 ## Element-wise binary arithmetic operations
 
 See also
-[`ComputationBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 A set of element-wise binary arithmetic operations is supported.
 
@@ -917,10 +903,10 @@ Where `Op` is one of `Add` (addition), `Sub` (subtraction), `Mul`
 (multiplication), `Div` (division), `Rem` (remainder), `Max` (maximum), `Min`
 (minimum), `LogicalAnd` (logical AND), or `LogicalOr` (logical OR).
 
-Arguments | Type                    | Semantics
---------- | ----------------------- | ----------------------------------------
-`lhs`     | `ComputationDataHandle` | left-hand-side operand: array of type T
-`rhs`     | `ComputationDataHandle` | right-hand-side operand: array of type T
+Arguments | Type    | Semantics
+--------- | ------- | ----------------------------------------
+`lhs`     | `XlaOp` | left-hand-side operand: array of type T
+`rhs`     | `XlaOp` | right-hand-side operand: array of type T
 
 The arguments' shapes have to be either similar or compatible. See the
 @{$broadcasting$broadcasting} documentation about what it means for shapes to
@@ -952,7 +938,7 @@ shapes of both operands. The semantics are described in detail on the
 ## Element-wise comparison operations
 
 See also
-[`ComputationBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 A set of standard element-wise binary comparison operations is supported. Note
 that standard IEEE 754 floating-point comparison semantics apply when comparing
@@ -964,10 +950,10 @@ Where `Op` is one of `Eq` (equal-to), `Ne` (not equal-to), `Ge`
 (greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Lt`
 (less-than).
 
-Arguments | Type                    | Semantics
---------- | ----------------------- | ----------------------------------------
-`lhs`     | `ComputationDataHandle` | left-hand-side operand: array of type T
-`rhs`     | `ComputationDataHandle` | right-hand-side operand: array of type T
+Arguments | Type    | Semantics
+--------- | ------- | ----------------------------------------
+`lhs`     | `XlaOp` | left-hand-side operand: array of type T
+`rhs`     | `XlaOp` | right-hand-side operand: array of type T
 
 The arguments' shapes have to be either similar or compatible. See the
 @{$broadcasting$broadcasting} documentation about what it means for shapes to
@@ -991,7 +977,7 @@ in detail on the @{$broadcasting$broadcasting page}.
 
 ## Element-wise unary functions
 
-ComputationBuilder supports these element-wise unary functions:
+XlaBuilder supports these element-wise unary functions:
 
 <b>`Abs(operand)`</b> Element-wise abs `x -> |x|`.
 
@@ -1023,9 +1009,9 @@ using the comparison operator of the element type of `operand`.
 <b>`Tanh(operand)`</b> Element-wise hyperbolic tangent `x -> tanh(x)`.
 
 
-Arguments | Type                    | Semantics
---------- | ----------------------- | ---------------------------
-`operand` | `ComputationDataHandle` | The operand to the function
+Arguments | Type    | Semantics
+--------- | ------- | ---------------------------
+`operand` | `XlaOp` | The operand to the function
 
 The function is applied to each element in the `operand` array, resulting in an
 array with the same shape. It is allowed for `operand` to be a scalar (rank 0).
@@ -1038,19 +1024,19 @@ potentially different runtime offset) of an input tensor into an output tensor.
 ### General Semantics
 
 See also
-[`ComputationBuilder::Gather`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Gather`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 For a more intuitive description, see the "Informal Description" section below.
 
 <b> `gather(operand, gather_indices, output_window_dims, elided_window_dims, window_bounds, gather_dims_to_operand_dims)` </b>
 
 |Arguments         | Type                    | Semantics                       |
 |----------------- | ----------------------- | --------------------------------|
-|`operand`         | `ComputationDataHandle` | The tensor we’re gathering      |
+|`operand`         | `XlaOp`                 | The tensor we’re gathering      |
 :                  :                         : from.                           :
-|`gather_indices`  | `ComputationDataHandle` | Tensor containing the starting  |
+|`gather_indices`  | `XlaOp`                 | Tensor containing the starting  |
 :                  :                         : indices of the slices we're     :
-:                  :                         : we're stitching together into   :
-:                  :                         : the output tensor.              :
+:                  :                         : stitching together into the     :
+:                  :                         : output tensor.                  :
 |`index_vector_dim`  | `int64`               | The dimension in                |
 :                  :                         : `gather_indices` that contains  :
 :                  :                         : the starting indices.           :
@@ -1241,7 +1227,7 @@ concatenation of all these rows.
 ## GetTupleElement
 
 See also
-[`ComputationBuilder::GetTupleElement`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::GetTupleElement`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Indexes into a tuple with a compile-time-constant value.
 
@@ -1262,7 +1248,7 @@ See also @{tf.tuple}.
 ## Infeed
 
 See also
-[`ComputationBuilder::Infeed`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Infeed`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 <b> `Infeed(shape)` </b>
 
@@ -1275,7 +1261,7 @@ See also
 
 Reads a single data item from the implicit Infeed streaming interface of the
 device, interpreting the data as the given shape and its layout, and returns a
-`ComputationDataHandle` of the data. Multiple Infeed operations are allowed in a
+`XlaOp` of the data. Multiple Infeed operations are allowed in a
 computation, but there must be a total order among the Infeed operations. For
 example, two Infeeds in the code below have a total order since there is a
 dependency between the while loops.
@@ -1301,21 +1287,19 @@ Infeed of the device.
 ## Map
 
 See also
-[`ComputationBuilder::Map`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Map`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 <b> `Map(operands..., computation)` </b>
 
-| Arguments         | Type                     | Semantics                     |
-| ----------------- | ------------------------ | ----------------------------- |
-| `operands`        | sequence of N            | N arrays of types T_0..T_{N-1}|
-:                   : `ComputationDataHandle`s :                               :
-| `computation`     | `Computation`            | computation of type `T_0,     |
-:                   :                          : T_1, ..., T_{N + M -1} -> S`  :
-:                   :                          : with N parameters of type T   :
-:                   :                          : and M of arbitrary type       :
-| `dimensions`       | `int64` array           | array of map dimensions    |
-| `static_operands` | sequence of M            | M arrays of arbitrary type    |
-:                   : `ComputationDataHandle`s :                               :
+| Arguments         | Type                   | Semantics                      |
+| ----------------- | ---------------------- | ------------------------------ |
+| `operands`        | sequence of N `XlaOp`s | N arrays of types T_0..T_{N-1} |
+| `computation`     | `XlaComputation`        | computation of type `T_0, T_1, |
+:                   :                        : ..., T_{N + M -1} -> S` with N :
+:                   :                        : parameters of type T and M of  :
+:                   :                        : arbitrary type                 :
+| `dimensions`      | `int64` array          | array of map dimensions        |
+| `static_operands` | sequence of M `XlaOp`s | M arrays of arbitrary type     |
 
 Applies a scalar function over the given `operands` arrays, producing an array
 of the same dimensions where each element is the result of the mapped function
@@ -1334,18 +1318,18 @@ input arrays to produce the output array.
 ## Pad
 
 See also
-[`ComputationBuilder::Pad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Pad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 <b> `Pad(operand, padding_value, padding_config)` </b>
 
-| Arguments        | Type                    | Semantics                     |
-| ---------------- | ----------------------- | ----------------------------- |
-| `operand`        | `ComputationDataHandle` | array of type `T`             |
-| `padding_value`  | `ComputationDataHandle` | scalar of type `T` to fill in |
-:                  :                         : the added padding             :
-| `padding_config` | `PaddingConfig`         | padding amount on both edges  |
-:                  :                         : (low, high) and between the   :
-:                  :                         : elements of each dimension    :
+| Arguments        | Type            | Semantics                               |
+| ---------------- | --------------- | --------------------------------------- |
+| `operand`        | `XlaOp`         | array of type `T`                       |
+| `padding_value`  | `XlaOp`         | scalar of type `T` to fill in the added |
+:                  :                 : padding                                 :
+| `padding_config` | `PaddingConfig` | padding amount on both edges (low,      |
+:                  :                 : high) and between the elements of each  :
+:                  :                 : dimension                               :
 
 Expands the given `operand` array by padding around the array as well as between
 the elements of the array with the given `padding_value`. `padding_config`
@@ -1373,7 +1357,7 @@ are all 0. The figure below shows examples of different `edge_padding` and
 ## Recv
 
 See also
-[`ComputationBuilder::Recv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Recv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 <b> `Recv(shape, channel_handle)` </b>
 
@@ -1384,7 +1368,7 @@ See also
 
 Receives data of the given shape from a `Send` instruction in another
 computation that shares the same channel handle. Returns a
-ComputationDataHandle for the received data.
+XlaOp for the received data.
 
 The client API of `Recv` operation represents synchronous communication.
 However, the instruction is internally decomposed into 2 HLO instructions
@@ -1407,19 +1391,18 @@ complete and returns the received data.
 ## Reduce
 
 See also
-[`ComputationBuilder::Reduce`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Reduce`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Applies a reduction function to an array.
 
 <b> `Reduce(operand, init_value, computation, dimensions)` </b>
 
-| Arguments     | Type                    | Semantics                        |
-| ------------- | ----------------------- | -------------------------------- |
-| `operand`     | `ComputationDataHandle` | array of type `T`                |
-| `init_value`  | `ComputationDataHandle` | scalar of type `T`               |
-| `computation` | `Computation`           | computation of type `T, T -> T`  |
-| `dimensions`  | `int64` array           | unordered array of dimensions to |
-:               :                         : reduce                           :
+Arguments     | Type             | Semantics
+------------- | ---------------- | ---------------------------------------
+`operand`     | `XlaOp`          | array of type `T`
+`init_value`  | `XlaOp`          | scalar of type `T`
+`computation` | `XlaComputation` | computation of type `T, T -> T`
+`dimensions`  | `int64` array    | unordered array of dimensions to reduce
 
 This operation reduces one or more dimensions of the input array into scalars.
 The rank of the returned array is `rank(operand) - len(dimensions)`.
@@ -1525,7 +1508,7 @@ Reducing the 3D array over all its dimensions produces the scalar `84`.
 ## ReducePrecision
 
 See also
-[`ComputationBuilder::ReducePrecision`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::ReducePrecision`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Models the effect of converting floating-point values to a lower-precision
 format (such as IEEE-FP16) and back to the original format.  The number of
@@ -1535,14 +1518,11 @@ implementations.
 
 <b> `ReducePrecision(operand, mantissa_bits, exponent_bits)` </b>
 
-| Arguments           | Type                    | Semantics                    |
-| ------------------- | ----------------------- | ---------------------------- |
-| `operand`           | `ComputationDataHandle` | array of floating-point type |
-:                     :                         : `T`.                         :
-| `exponent_bits`     | `int32`                 | number of exponent bits in   |
-:                     :                         : lower-precision format       :
-| `mantissa_bits`     | `int32`                 | number of mantissa bits in   |
-:                     :                         : lower-precision format       :
+Arguments       | Type    | Semantics
+--------------- | ------- | -------------------------------------------------
+`operand`       | `XlaOp` | array of floating-point type `T`.
+`exponent_bits` | `int32` | number of exponent bits in lower-precision format
+`mantissa_bits` | `int32` | number of mantissa bits in lower-precision format
 
 The result is an array of type `T`.  The input values are rounded to the nearest
 value representable with the given number of mantissa bits (using "ties to even"
@@ -1559,7 +1539,7 @@ portion of the conversion is then simply a no-op.
 ## ReduceWindow
 
 See also
-[`ComputationBuilder::ReduceWindow`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::ReduceWindow`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Applies a reduction function to all elements in each window of the input
 multi-dimensional array, producing an output multi-dimensional array with the
@@ -1571,25 +1551,25 @@ on the left-hand side.
 <b> `ReduceWindow(operand, init_value, computation, window_dimensions,
 window_strides, padding)` </b>
 
-| Arguments           | Type                    | Semantics                    |
-| ------------------- | ----------------------- | ---------------------------- |
-| `operand`           | `ComputationDataHandle` | N dimensional array          |
-:                     :                         : containing elements of type  :
-:                     :                         : T. This is the base area on  :
-:                     :                         : which the window is placed.  :
-| `init_value`        | `ComputationDataHandle` | Starting value for the       |
-:                     :                         : reduction. See [Reduce]      :
-:                     :                         : (#reduce) for details.       :
-| `computation`       | `Computation`           | Reduction function of type   |
-:                     :                         : `T, T -> T`, to apply to all :
-:                     :                         : elements in each window      :
-| `window_dimensions` | `ArraySlice<int64>`     | array of integers for window |
-:                     :                         : dimension values             :
-| `window_strides`    | `ArraySlice<int64>`     | array of integers for window |
-:                     :                         : stride values                :
-| `padding`           | `Padding`               | padding type for window      |
-:                     :                         : (Padding\:\:kSame or         :
-:                     :                         : Padding\:\:kValid)           :
+| Arguments           | Type                | Semantics                        |
+| ------------------- | ------------------- | -------------------------------- |
+| `operand`           | `XlaOp`             | N dimensional array containing   |
+:                     :                     : elements of type T. This is the  :
+:                     :                     : base area on which the window is :
+:                     :                     : placed.                          :
+| `init_value`        | `XlaOp`             | Starting value for the           |
+:                     :                     : reduction. See [Reduce](#reduce) :
+:                     :                     : for details.                     :
+| `computation`       | `XlaComputation`    | Reduction function of type `T, T |
+:                     :                     : -> T`, to apply to all elements  :
+:                     :                     : in each window                   :
+| `window_dimensions` | `ArraySlice<int64>` | array of integers for window     |
+:                     :                     : dimension values                 :
+| `window_strides`    | `ArraySlice<int64>` | array of integers for window     |
+:                     :                     : stride values                    :
+| `padding`           | `Padding`           | padding type for window          |
+:                     :                     : (Padding\:\:kSame or             :
+:                     :                     : Padding\:\:kValid)               :
 
 Below code and figure shows an example of using `ReduceWindow`. Input is a
 matrix of size [4x6] and both window_dimensions and window_stride_dimensions are
@@ -1597,9 +1577,9 @@ matrix of size [4x6] and both window_dimensions and window_stride_dimensions are
 
 ```
 // Create a computation for the reduction (maximum).
-Computation max;
+XlaComputation max;
 {
-  ComputationBuilder builder(client_, "max");
+  XlaBuilder builder(client_, "max");
   auto y = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "y");
   auto x = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "x");
   builder.Max(y, x);
@@ -1607,7 +1587,7 @@ Computation max;
 }
 
 // Create a ReduceWindow computation with the max reduction computation.
-ComputationBuilder builder(client_, "reduce_window_2x3");
+XlaBuilder builder(client_, "reduce_window_2x3");
 auto shape = ShapeUtil::MakeShape(F32, {4, 6});
 auto input = builder.Parameter(0, shape, "input");
 builder.ReduceWindow(
@@ -1642,7 +1622,7 @@ context of [`Reduce`](#reduce) for more details.
 ## Reshape
 
 See also
-[`ComputationBuilder::Reshape`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h)
+[`XlaBuilder::Reshape`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
 and the [`Collapse`](#collapse) operation.
 
 Reshapes the dimensions of an array into a new configuration.
@@ -1650,11 +1630,11 @@ Reshapes the dimensions of an array into a new configuration.
 <b> `Reshape(operand, new_sizes)` </b>
 <b> `Reshape(operand, dimensions, new_sizes)` </b>
 
-Arguments    | Type                    | Semantics
------------- | ----------------------- | ---------------------------------------
-`operand`    | `ComputationDataHandle` | array of type T
-`dimensions` | `int64` vector          | order in which dimensions are collapsed
-`new_sizes`  | `int64` vector          | vector of sizes of new dimensions
+Arguments    | Type           | Semantics
+------------ | -------------- | ---------------------------------------
+`operand`    | `XlaOp`        | array of type T
+`dimensions` | `int64` vector | order in which dimensions are collapsed
+`new_sizes`  | `int64` vector | vector of sizes of new dimensions
 
 Conceptually, reshape first flattens an array into a one-dimensional vector of
 data values, and then refines this vector into a new shape. The input arguments
@@ -1723,14 +1703,14 @@ Reshape(5, {}, {1,1}) == f32[1x1] {{5}};
 ## Rev (reverse)
 
 See also
-[`ComputationBuilder::Rev`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Rev`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 <b>`Rev(operand, dimensions)`</b>
 
-Arguments    | Type                    | Semantics
------------- | ----------------------- | ---------------------
-`operand`    | `ComputationDataHandle` | array of type T
-`dimensions` | `ArraySlice<int64>`     | dimensions to reverse
+Arguments    | Type                | Semantics
+------------ | ------------------- | ---------------------
+`operand`    | `XlaOp`             | array of type T
+`dimensions` | `ArraySlice<int64>` | dimensions to reverse
 
 Reverses the order of elements in the `operand` array along the specified
 `dimensions`, generating an output array of the same shape. Each element of the
@@ -1745,7 +1725,7 @@ the two window dimensions during the gradient computation in neural networks.
 ## RngNormal
 
 See also
-[`ComputationBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Constructs an output of a given shape with random numbers generated following
 the $$N(\mu, \sigma)$$ normal distribution. The parameters `mu` and `sigma`, and
@@ -1754,18 +1734,18 @@ be scalar valued.
 
 <b>`RngNormal(mean, sigma, shape)`</b>
 
-| Arguments | Type                    | Semantics                              |
-| --------- | ----------------------- | -------------------------------------- |
-| `mu`      | `ComputationDataHandle` | Scalar of type F32 specifying mean of  |
-:           :                         : generated numbers                      :
-| `sigma`   | `ComputationDataHandle` | Scalar of type F32 specifying standard |
-:           :                         : deviation of generated numbers         :
-| `shape`   | `Shape`                 | Output shape of type F32               |
+| Arguments | Type    | Semantics                                           |
+| --------- | ------- | --------------------------------------------------- |
+| `mu`      | `XlaOp` | Scalar of type F32 specifying mean of generated     |
+:           :         : numbers                                             :
+| `sigma`   | `XlaOp` | Scalar of type F32 specifying standard deviation of |
+:           :         : generated numbers                                   :
+| `shape`   | `Shape` | Output shape of type F32                            |
 
 ## RngUniform
 
 See also
-[`ComputationBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Constructs an output of a given shape with random numbers generated following
 the uniform distribution over the interval $$[a,b)$$. The parameters and output
@@ -1777,27 +1757,27 @@ is implementation-defined.
 
 | Arguments | Type                    | Semantics                         |
 | --------- | ----------------------- | --------------------------------- |
-| `a`       | `ComputationDataHandle` | Scalar of type T specifying lower |
+| `a`       | `XlaOp`                 | Scalar of type T specifying lower |
 :           :                         : limit of interval                 :
-| `b`       | `ComputationDataHandle` | Scalar of type T specifying upper |
+| `b`       | `XlaOp`                 | Scalar of type T specifying upper |
 :           :                         : limit of interval                 :
 | `shape`   | `Shape`                 | Output shape of type T            |
 
 ## Select
 
 See also
-[`ComputationBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Constructs an output array from elements of two input arrays, based on the
 values of a predicate array.
 
 <b> `Select(pred, on_true, on_false)` </b>
 
-Arguments  | Type                    | Semantics
----------- | ----------------------- | ------------------
-`pred`     | `ComputationDataHandle` | array of type PRED
-`on_true`  | `ComputationDataHandle` | array of type T
-`on_false` | `ComputationDataHandle` | array of type T
+Arguments  | Type    | Semantics
+---------- | ------- | ------------------
+`pred`     | `XlaOp` | array of type PRED
+`on_true`  | `XlaOp` | array of type T
+`on_false` | `XlaOp` | array of type T
 
 The arrays `on_true` and `on_false` must have the same shape. This is also the
 shape of the output array. The array `pred` must have the same dimensionality as
@@ -1837,7 +1817,7 @@ the same shape!) then `pred` has to be a scalar of type `PRED`.
 ## SelectAndScatter
 
 See also
-[`ComputationBuilder::SelectAndScatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::SelectAndScatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 This operation can be considered as a composite operation that first computes
 `ReduceWindow` on the `operand` array to select an element from each window, and
@@ -1870,33 +1850,32 @@ backpropagate the gradient values for a pooling layer in a neural network.
 <b>`SelectAndScatter(operand, select, window_dimensions, window_strides,
 padding, source, init_value, scatter)`</b>
 
-| Arguments           | Type                    | Semantics                    |
-| ------------------- | ----------------------- | ---------------------------- |
-| `operand`           | `ComputationDataHandle` | array of type T over which   |
-:                     :                         : the windows slide            :
-| `select`            | `Computation`           | binary computation of type   |
-:                     :                         : `T, T -> PRED`, to apply to  :
-:                     :                         : all elements in each window; :
-:                     :                         : returns `true` if the first  :
-:                     :                         : parameter is selected and    :
-:                     :                         : returns `false` if the       :
-:                     :                         : second parameter is selected :
-| `window_dimensions` | `ArraySlice<int64>`     | array of integers for window |
-:                     :                         : dimension values             :
-| `window_strides`    | `ArraySlice<int64>`     | array of integers for window |
-:                     :                         : stride values                :
-| `padding`           | `Padding`               | padding type for window      |
-:                     :                         : (Padding\:\:kSame or         :
-:                     :                         : Padding\:\:kValid)           :
-| `source`            | `ComputationDataHandle` | array of type T with the     |
-:                     :                         : values to scatter            :
-| `init_value`        | `ComputationDataHandle` | scalar value of type T for   |
-:                     :                         : the initial value of the     :
-:                     :                         : output array                 :
-| `scatter`           | `Computation`           | binary computation of type   |
-:                     :                         : `T, T -> T`, to apply each   :
-:                     :                         : scatter source element with  :
-:                     :                         : its destination element      :
+| Arguments           | Type                | Semantics                        |
+| ------------------- | ------------------- | -------------------------------- |
+| `operand`           | `XlaOp`             | array of type T over which the   |
+:                     :                     : windows slide                    :
+| `select`            | `XlaComputation`    | binary computation of type `T, T |
+:                     :                     : -> PRED`, to apply to all        :
+:                     :                     : elements in each window; returns :
+:                     :                     : `true` if the first parameter is :
+:                     :                     : selected and returns `false` if  :
+:                     :                     : the second parameter is selected :
+| `window_dimensions` | `ArraySlice<int64>` | array of integers for window     |
+:                     :                     : dimension values                 :
+| `window_strides`    | `ArraySlice<int64>` | array of integers for window     |
+:                     :                     : stride values                    :
+| `padding`           | `Padding`           | padding type for window          |
+:                     :                     : (Padding\:\:kSame or             :
+:                     :                     : Padding\:\:kValid)               :
+| `source`            | `XlaOp`             | array of type T with the values  |
+:                     :                     : to scatter                       :
+| `init_value`        | `XlaOp`             | scalar value of type T for the   |
+:                     :                     : initial value of the output      :
+:                     :                     : array                            :
+| `scatter`           | `XlaComputation`    | binary computation of type `T, T |
+:                     :                     : -> T`, to apply each scatter     :
+:                     :                     : source element with its          :
+:                     :                     : destination element              :
 
 The figure below shows examples of using `SelectAndScatter`, with the `select`
 function computing the maximal value among its parameters. Note that when the
@@ -1918,14 +1897,14 @@ context of [`Reduce`](#reduce) for more details.
 ## Send
 
 See also
-[`ComputationBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 <b> `Send(operand, channel_handle)` </b>
 
-| Arguments        | Type                    | Semantics                        |
-| ---------------- | ----------------------- | -------------------------------- |
-| `operand`        | `ComputationDataHandle` | data to send (array of type T)   |
-| `channel_handle` | `ChannelHandle`         | unique identifier for each send/recv pair |
+Arguments        | Type            | Semantics
+---------------- | --------------- | -----------------------------------------
+`operand`        | `XlaOp`         | data to send (array of type T)
+`channel_handle` | `ChannelHandle` | unique identifier for each send/recv pair
 
 Sends the given operand data to a `Recv` instruction in another computation
 that shares the same channel handle. Does not return any data.
@@ -1973,7 +1952,7 @@ computations. For example, below schedules lead to deadlocks.
 ## Slice
 
 See also
-[`ComputationBuilder::Slice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Slice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Slicing extracts a sub-array from the input array. The sub-array is of the same
 rank as the input and contains the values inside a bounding box within the input
@@ -1982,23 +1961,20 @@ arguments to the slice operation.
 
 <b> `Slice(operand, start_indices, limit_indices)` </b>
 
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
-| `start_indices` | `ArraySlice<int64>`     | List of N integers containing    |
-:                 :                         : the starting indices of the      :
-:                 :                         : slice for each dimension. Values :
-:                 :                         : must be greater than or equal to :
-:                 :                         : zero.                            :
-| `limit_indices` | `ArraySlice<int64>`     | List of N integers containing    |
-:                 :                         : the ending indices (exclusive)   :
-:                 :                         : for the slice for each           :
-:                 :                         : dimension. Each value must be    :
-:                 :                         : strictly greater than the        :
-:                 :                         : respective `start_indices` value :
-:                 :                         : for the dimension and less than  :
-:                 :                         : or equal to the size of the      :
-:                 :                         : dimension.                       :
+| Arguments       | Type                | Semantics                            |
+| --------------- | ------------------- | ------------------------------------ |
+| `operand`       | `XlaOp`             | N dimensional array of type T        |
+| `start_indices` | `ArraySlice<int64>` | List of N integers containing the    |
+:                 :                     : starting indices of the slice for    :
+:                 :                     : each dimension. Values must be       :
+:                 :                     : greater than or equal to zero.       :
+| `limit_indices` | `ArraySlice<int64>` | List of N integers containing the    |
+:                 :                     : ending indices (exclusive) for the   :
+:                 :                     : slice for each dimension. Each value :
+:                 :                     : must be strictly greater than the    :
+:                 :                     : respective `start_indices` value for :
+:                 :                     : the dimension and less than or equal :
+:                 :                     : to the size of the dimension.        :
 
 1-dimensional example:
 
@@ -2025,15 +2001,15 @@ Slice(b, {2, 1}, {4, 3}) produces:
 ## Sort
 
 See also
-[`ComputationBuilder::Sort`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Sort`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 Sorts the elements in the operand.
 
 <b>`Sort(operand)`</b>
 
-Arguments | Type                    | Semantics
---------- | ----------------------- | -------------------
-`operand` | `ComputationDataHandle` | The operand to sort
+Arguments | Type    | Semantics
+--------- | ------- | -------------------
+`operand` | `XlaOp` | The operand to sort
 
 ## Transpose
 
@@ -2041,10 +2017,10 @@ See also the @{tf.reshape} operation.
 
 <b>`Transpose(operand)`</b>
 
-Arguments     | Type                    | Semantics
----------     | ----------------------- | -------------------------
-`operand`     | `ComputationDataHandle` | The operand to transpose.
-`permutation` | `ArraySlice<int64>`     | How to permute the dimensions.
+Arguments     | Type                | Semantics
+------------- | ------------------- | ------------------------------
+`operand`     | `XlaOp`             | The operand to transpose.
+`permutation` | `ArraySlice<int64>` | How to permute the dimensions.
 
 
 Permutes the operand dimensions with the given permutation, so
@@ -2056,7 +2032,7 @@ This is the same as Reshape(operand, permutation,
 ## Tuple
 
 See also
-[`ComputationBuilder::Tuple`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::Tuple`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 A tuple containing a variable number of data handles, each of which has its own
 shape.
@@ -2075,18 +2051,19 @@ Tuples can be deconstructed (accessed) via the [`GetTupleElement`]
 ## While
 
 See also
-[`ComputationBuilder::While`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`XlaBuilder::While`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
 
 <b> `While(condition, body, init)` </b>
 
-| Arguments   | Type          | Semantics                                      |
-| ----------- | ------------- | ---------------------------------------------- |
-| `condition` | `Computation` | Computation of type `T -> PRED` which defines  |
-:             :               : the termination condition of the loop.         :
-| `body`      | `Computation` | Computation of type `T -> T` which defines the |
-:             :               : body of the loop.                              :
-| `init`      | `T`           | Initial value for the parameter of `condition` |
-:             :               : and `body`.                                    :
+| Arguments   | Type             | Semantics                                |
+| ----------- | ---------------- | ---------------------------------------- |
+| `condition` | `XlaComputation` | XlaComputation of type `T -> PRED` which |
+:             :                  : defines the termination condition of the :
+:             :                  : loop.                                    :
+| `body`      | `XlaComputation` | XlaComputation of type `T -> T` which    |
+:             :                  : defines the body of the loop.            :
+| `init`      | `T`              | Initial value for the parameter of       |
+:             :                  : `condition` and `body`.                  :
 
 Sequentially executes the `body` until the `condition` fails. This is similar to
 a typical while loop in many other languages except for the differences and
diff --git a/tensorflow/docs_src/programmers_guide/datasets.md b/tensorflow/docs_src/programmers_guide/datasets.md
index 67be41b1a68bc72913d859f83ccd74b529350a4c..8b69860a68461e849a445f5c01c2e9b71d614a46 100644
--- a/tensorflow/docs_src/programmers_guide/datasets.md
+++ b/tensorflow/docs_src/programmers_guide/datasets.md
@@ -485,6 +485,46 @@ dataset = dataset.flat_map(
         .filter(lambda line: tf.not_equal(tf.substr(line, 0, 1), "#"))))
 ```
 
+### Consuming CSV data
+
+The CSV file format is a popular format for storing tabular data in plain text.
+The @{tf.contrib.data.CsvDataset} class provides a way to extract records from
+one or more CSV files that comply with [RFC 4180](https://tools.ietf.org/html/rfc4180).
+Given one or more filenames and a list of defaults, a `CsvDataset` will produce
+a tuple of elements whose types correspond to the types of the defaults
+provided, per CSV record. Like `TFRecordDataset` and `TextLineDataset`,
+`CsvDataset` accepts `filenames` as a `tf.Tensor`, so you can parameterize it
+by passing a  `tf.placeholder(tf.string)`.
+
+```
+# Creates a dataset that reads all of the records from two CSV files, each with
+# eight float columns
+filenames = ["/var/data/file1.csv", "/var/data/file2.csv"]
+record_defaults = [tf.float32] * 8   # Eight required float columns
+dataset = tf.contrib.data.CsvDataset(filenames, record_defaults)
+```
+
+If some columns are empty, you can provide defaults instead of types.
+
+```
+# Creates a dataset that reads all of the records from two CSV files, each with
+# four float columns which may have missing values
+record_defaults = [[0.0]] * 8
+dataset = tf.contrib.data.CsvDataset(filenames, record_defaults)
+```
+
+By default, a `CsvDataset` yields *every* column of *every* line of the file,
+which may not be desirable, for example if the file starts with a header line
+that should be ignored, or if some columns are not required in the input.
+These lines and fields can be removed with the `header` and `select_cols`
+arguments respectively.
+
+```
+# Creates a dataset that reads all of the records from two CSV files with
+# headers, extracting float data from columns 2 and 4.
+record_defaults = [[0.0]] * 2  # Only provide defaults for the selected columns
+dataset = tf.contrib.data.CsvDataset(filenames, record_defaults, header=True, select_cols=[2,4])
+```
 <!--
 TODO(mrry): Add these sections.
 
diff --git a/tensorflow/docs_src/programmers_guide/eager.md b/tensorflow/docs_src/programmers_guide/eager.md
index 5926e9f7f4cef96b248f9e1fd926e8712c3cc32b..00d02b44558d023d4cb3d8e2bce01c4e1911a302 100644
--- a/tensorflow/docs_src/programmers_guide/eager.md
+++ b/tensorflow/docs_src/programmers_guide/eager.md
@@ -118,13 +118,14 @@ it is easy to write [fizzbuzz](https://en.wikipedia.org/wiki/Fizz_buzz):
 ```py
 def fizzbuzz(max_num):
   counter = tf.constant(0)
-  for num in range(max_num):
+  max_num = tf.convert_to_tensor(max_num)
+  for num in range(max_num.numpy()):
     num = tf.constant(num)
-    if num % 3 == 0 and num % 5 == 0:
+    if int(num % 3) == 0 and int(num % 5) == 0:
       print('FizzBuzz')
-    elif num % 3 == 0:
+    elif int(num % 3) == 0:
       print('Fizz')
-    elif num % 5 == 0:
+    elif int(num % 5) == 0:
       print('Buzz')
     else:
       print(num)
diff --git a/tensorflow/docs_src/programmers_guide/embedding.md b/tensorflow/docs_src/programmers_guide/embedding.md
index d5703e07375b1f68f4e22476288f1ed57d340c5b..8a98367dfbb97e923824dd86e67ba26e95a3565f 100644
--- a/tensorflow/docs_src/programmers_guide/embedding.md
+++ b/tensorflow/docs_src/programmers_guide/embedding.md
@@ -238,7 +238,7 @@ row doesn't have to be filled, as shown below.
 </tr>
 </table>
 
-Follow [this link]("https://www.tensorflow.org/images/embedding-mnist.mp4" )
+Follow [this link](https://www.tensorflow.org/images/embedding-mnist.mp4)
 to see a fun example of thumbnail images in the Embedding Projector.
 
 
diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md
index ffadf29ad7710de860a56253f279204d17cc318a..de830112e001ee4b5204454725317e5d661af5ec 100644
--- a/tensorflow/docs_src/programmers_guide/estimators.md
+++ b/tensorflow/docs_src/programmers_guide/estimators.md
@@ -21,18 +21,17 @@ Note: TensorFlow also includes a deprecated `Estimator` class at
 
 Estimators provide the following benefits:
 
-*   You can run Estimators-based models on a local host or on a
+*   You can run Estimator-based models on a local host or on a
     distributed multi-server environment without changing your model.
-    Furthermore, you can run Estimators-based models on CPUs, GPUs,
+    Furthermore, you can run Estimator-based models on CPUs, GPUs,
     or TPUs without recoding your model.
 *   Estimators simplify sharing implementations between model developers.
-*   You can develop a state of the art model with high-level intuitive code,
+*   You can develop a state of the art model with high-level intuitive code.
     In short, it is generally much easier to create models with Estimators
     than with the low-level TensorFlow APIs.
-*   Estimators are themselves built on tf.layers, which
+*   Estimators are themselves built on @{tf.layers}, which
     simplifies customization.
-*   Estimators build the graph for you.  In other words, you don't have to
-    build the graph.
+*   Estimators build the graph for you.
 *   Estimators provide a safe distributed training loop that controls how and
     when to:
     *   build the graph
@@ -57,7 +56,7 @@ the "plumbing" for you.  That is, pre-made Estimators create and manage
 pre-made Estimators let you experiment with different model architectures by
 making only minimal code changes.  @{tf.estimator.DNNClassifier$`DNNClassifier`},
 for example, is a pre-made Estimator class that trains classification models
-through dense, feed-forward neural networks.
+based on dense, feed-forward neural networks.
 
 
 ### Structure of a pre-made Estimators program
@@ -79,7 +78,7 @@ of the following four steps:
     an input function:
 
         def input_fn(dataset):
-           ...  # manipulate dataset, extracting feature names and the label
+           ...  # manipulate dataset, extracting the feature dict and the label
            return feature_dict, label
 
     (See @{$programmers_guide/datasets} for full details.)
@@ -96,13 +95,13 @@ of the following four steps:
         population = tf.feature_column.numeric_column('population')
         crime_rate = tf.feature_column.numeric_column('crime_rate')
         median_education = tf.feature_column.numeric_column('median_education',
-                            normalizer_fn='lambda x: x - global_education_mean')
+                            normalizer_fn=lambda x: x - global_education_mean)
 
 3.  **Instantiate the relevant pre-made Estimator.**  For example, here's
     a sample instantiation of a pre-made Estimator named `LinearClassifier`:
 
         # Instantiate an estimator, passing the feature columns.
-        estimator = tf.estimator.Estimator.LinearClassifier(
+        estimator = tf.estimator.LinearClassifier(
             feature_columns=[population, crime_rate, median_education],
             )
 
diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md
index 51c1a1e032baae7eff334da785fc5ffa2438e0ca..b6291a9fface404406829d8d7ce5cc36980661a3 100644
--- a/tensorflow/docs_src/programmers_guide/faq.md
+++ b/tensorflow/docs_src/programmers_guide/faq.md
@@ -72,7 +72,7 @@ tensors in the execution of a step.
 
 If `t` is a @{tf.Tensor} object,
 @{tf.Tensor.eval} is shorthand for
-@{tf.Session.run} (where `sess` is the
+@{tf.Session.run}, where `sess` is the
 current @{tf.get_default_session}. The
 two following snippets of code are equivalent:
 
@@ -101,9 +101,8 @@ sessions, it may be more straightforward to make explicit calls to
 Sessions can own resources, such as
 @{tf.Variable},
 @{tf.QueueBase}, and
-@{tf.ReaderBase}; and these resources can use
-a significant amount of memory. These resources (and the associated memory) are
-released when the session is closed, by calling
+@{tf.ReaderBase}. These resources can sometimes use
+a significant amount of memory, and can be released when the session is closed by calling
 @{tf.Session.close}.
 
 The intermediate tensors that are created as part of a call to
@@ -137,7 +136,7 @@ TensorFlow also has a
 to help build support for more client languages.  We invite contributions of new
 language bindings.
 
-Bindings for various other languages (such as [C#](https://github.com/migueldeicaza/TensorFlowSharp), [Julia](https://github.com/malmaud/TensorFlow.jl), [Ruby](https://github.com/somaticio/tensorflow.rb) and [Scala](https://github.com/eaplatanios/tensorflow_scala)) created and supported by the opensource community build on top of the C API supported by the TensorFlow maintainers.
+Bindings for various other languages (such as [C#](https://github.com/migueldeicaza/TensorFlowSharp), [Julia](https://github.com/malmaud/TensorFlow.jl), [Ruby](https://github.com/somaticio/tensorflow.rb) and [Scala](https://github.com/eaplatanios/tensorflow_scala)) created and supported by the open source community build on top of the C API supported by the TensorFlow maintainers.
 
 #### Does TensorFlow make use of all the devices (GPUs and CPUs) available on my machine?
 
@@ -210,8 +209,8 @@ a new tensor with a different dynamic shape.
 
 #### How do I build a graph that works with variable batch sizes?
 
-It is often useful to build a graph that works with variable batch sizes, for
-example so that the same code can be used for (mini-)batch training, and
+It is often useful to build a graph that works with variable batch sizes 
+so that the same code can be used for (mini-)batch training, and
 single-instance inference. The resulting graph can be
 @{tf.Graph.as_graph_def$saved as a protocol buffer}
 and
@@ -260,7 +259,7 @@ See the how-to documentation for
 There are three main options for dealing with data in a custom format.
 
 The easiest option is to write parsing code in Python that transforms the data
-into a numpy array. Then use @{tf.data.Dataset.from_tensor_slices} to
+into a numpy array. Then, use @{tf.data.Dataset.from_tensor_slices} to
 create an input pipeline from the in-memory data.
 
 If your data doesn't fit in memory, try doing the parsing in the Dataset
@@ -274,7 +273,7 @@ If your data is not easily parsable with the built-in TensorFlow operations,
 consider converting it, offline, to a format that is easily parsable, such
 as @{tf.python_io.TFRecordWriter$`TFRecord`} format.
 
-The more efficient method to customize the parsing behavior is to
+The most efficient method to customize the parsing behavior is to
 @{$adding_an_op$add a new op written in C++} that parses your
 data format. The @{$new_data_formats$guide to handling new data formats} has
 more information about the steps for doing this.
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index e8cf7711552f4c83ed1e03e0753b580cc7505ddc..cd8c4b5b9a026f01af4957ade0e132477b0066a5 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -237,7 +237,7 @@ TensorFlow supports two ways of sharing variables:
 While code which explicitly passes variables around is very clear, it is
 sometimes convenient to write TensorFlow functions that implicitly use
 variables in their implementations. Most of the functional layers from
-`tf.layer` use this approach, as well as all `tf.metrics`, and a few other
+`tf.layers` use this approach, as well as all `tf.metrics`, and a few other
 library utilities.
 
 Variable scopes allow you to control variable reuse when calling functions which
diff --git a/tensorflow/docs_src/tutorials/audio_recognition.md b/tensorflow/docs_src/tutorials/audio_recognition.md
index 372ab47df7df309ab926836ca19f34d2d0d38915..d7a8da6f96194ae4e35441224411145d200aa687 100644
--- a/tensorflow/docs_src/tutorials/audio_recognition.md
+++ b/tensorflow/docs_src/tutorials/audio_recognition.md
@@ -25,13 +25,15 @@ python tensorflow/examples/speech_commands/train.py
 ```
 
 The script will start off by downloading the [Speech Commands
-dataset](https://storage.cloud.google.com/download.tensorflow.org/data/speech_commands_v0.01.tar.gz),
-which consists of 65,000 WAVE audio files of people saying thirty different
-words. This data was collected by Google and released under a CC BY license, and
-you can help improve it by [contributing five minutes of your own
+dataset](https://storage.cloud.google.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz),
+which consists of over 105,000 WAVE audio files of people saying thirty
+different words. This data was collected by Google and released under a CC BY
+license, and you can help improve it by [contributing five minutes of your own
 voice](https://aiyprojects.withgoogle.com/open_speech_recording). The archive is
-over 1GB, so this part may take a while, but you should see progress logs, and
-once it's been downloaded once you won't need to do this step again.
+over 2GB, so this part may take a while, but you should see progress logs, and
+once it's been downloaded once you won't need to do this step again. You can
+find more information about this dataset in this
+[Speech Commands paper](https://arxiv.org/abs/1804.03209).
 
 Once the downloading has completed, you'll see logging information that looks
 like this:
@@ -229,7 +231,7 @@ You can also build this application yourself, since it's open source and
 [available as part of the TensorFlow repository on
 github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#building-in-android-studio-using-the-tensorflow-aar-from-jcenter).
 By default it downloads [a pretrained model from
-tensorflow.org](http://download.tensorflow.org/models/speech_commands_v0.01.zip),
+tensorflow.org](http://download.tensorflow.org/models/speech_commands_v0.02.zip),
 but you can easily [replace it with a model you've trained
 yourself](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-model-files-optional).
 If you do this, you'll need to make sure that the constants in [the main
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index 37cd2bb1397deaddec9f7194d8c5093145e08644..496b1e4da9d3b85d88be4dd86086e04fda51b8be 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -209,7 +209,6 @@ for two-dimensional image data expect input tensors to have a shape of
 *   _`channels`_. Number of color channels in the example images. For color
     images, the number of channels is 3 (red, green, blue). For monochrome
     images, there is just 1 channel (black).
-*   _`image_height`_. Height of the example images.
 *   _`data_format`_. A string, one of `channels_last` (default) or `channels_first`.
       `channels_last` corresponds to inputs with shape
       `(batch, ..., channels)` while `channels_first` corresponds to
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index aa594a63c6ad5ab7129e452e7a6345114b994231..07f096418f53219c9ec7000a4560d78a3ff609e1 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -36,7 +36,7 @@ cc_binary(
         "-z defs",
         "-s",
         "-Wl,--version-script",  # This line must be directly followed by LINKER_SCRIPT.
-        LINKER_SCRIPT,
+        "$(location {})".format(LINKER_SCRIPT),
     ],
     linkshared = 1,
     linkstatic = 1,
diff --git a/tensorflow/examples/learn/text_classification_cnn.py b/tensorflow/examples/learn/text_classification_cnn.py
index 9e21aee87f629835222ab367dc3ed55863f553e4..a40a9eaecbd9bbaea38f49cd10463c5ba9782e40 100644
--- a/tensorflow/examples/learn/text_classification_cnn.py
+++ b/tensorflow/examples/learn/text_classification_cnn.py
@@ -73,7 +73,7 @@ def cnn_model(features, labels, mode):
         kernel_size=FILTER_SHAPE2,
         padding='VALID')
     # Max across each filter to get useful features for classification.
-    pool2 = tf.squeeze(tf.reduce_max(conv2, 1), squeeze_dims=[1])
+    pool2 = tf.squeeze(tf.reduce_max(conv2, 1), axis=[1])
 
   # Apply regular WX + B and classification.
   logits = tf.layers.dense(pool2, MAX_LABEL, activation=None)
diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index f084931215261f183f1ecfc5517ea9a5126db039..fc28eb0631dc5e1947c2a31a6acdb02ed8d28f3a 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -288,7 +288,7 @@ if __name__ == '__main__':
       '--data_url',
       type=str,
       # pylint: disable=line-too-long
-      default='http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz',
+      default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
       # pylint: enable=line-too-long
       help='Location of speech training data archive on the web.')
   parser.add_argument(
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 2f1be51ada8665df3dbb4d69a8ebdadaf72d3a4b..e08f38969cf8392c90725d7bc45db154c37a0e39 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2544,6 +2544,136 @@ func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values
 	return op.Output(0)
 }
 
+// Reverses specific dimensions of a tensor.
+//
+// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
+// of `tensor`, this operation reverses each dimension i of `tensor` where
+// `dims[i]` is `True`.
+//
+// `tensor` can have up to 8 dimensions. The number of dimensions
+// of `tensor` must equal the number of elements in `dims`. In other words:
+//
+// `rank(tensor) = size(dims)`
+//
+// For example:
+//
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
+//
+// # 'dims' is [False, False, False, True]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
+//
+// # 'dims' is [False, True, False, False]
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is [False, False, True, False]
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
+//
+// Arguments:
+//	tensor: Up to 8-D.
+//	dims: 1-D. The dimensions to reverse.
+//
+// Returns The same shape as `tensor`.
+func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Reverse",
+		Input: []tf.Input{
+			tensor, dims,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Copy a tensor setting everything outside a central band in each innermost matrix
+//
+// to zero.
+//
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
+//
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+//
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
+//
+// For example:
+//
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
+//
+// Arguments:
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
+//
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixBandPart",
+		Input: []tf.Input{
+			input, num_lower, num_upper,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Clips tensor values to a specified min and max.
 //
 // Given a tensor `t`, this operation returns a tensor of the same type and
@@ -2796,71 +2926,6 @@ func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
-
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
-//
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
-//
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
-//
-// Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
-//
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseToDense",
-		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes the sum along sparse segments of a tensor.
 //
 // Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
@@ -6469,72 +6534,6 @@ func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dens
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
-// of `tensor`, this operation reverses each dimension i of `tensor` where
-// `dims[i]` is `True`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions
-// of `tensor` must equal the number of elements in `dims`. In other words:
-//
-// `rank(tensor) = size(dims)`
-//
-// For example:
-//
-// ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
-//
-// # 'dims' is [False, False, False, True]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
-//
-// # 'dims' is [False, True, False, False]
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
-//
-// # 'dims' is [False, False, True, False]
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
-//
-// Arguments:
-//	tensor: Up to 8-D.
-//	dims: 1-D. The dimensions to reverse.
-//
-// Returns The same shape as `tensor`.
-func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Reverse",
-		Input: []tf.Input{
-			tensor, dims,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // BiasAddGradAttr is an optional argument to BiasAddGrad.
 type BiasAddGradAttr func(optionalAttr)
 
@@ -7311,70 +7310,6 @@ func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
-//
-// Useful special cases:
-//
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
-//
-// Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
-//
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
-		Input: []tf.Input{
-			input, num_lower, num_upper,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
 type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
@@ -10687,7 +10622,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted
 // SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
 //
 // value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
+// supplied image within this range.
 // If not specified, defaults to <f:0.05 f:1 >
 func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
@@ -23656,7 +23591,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort
 // SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
 //
 // value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
+// supplied image within this range.
 // If not specified, defaults to <f:0.05 f:1 >
 func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
@@ -24884,6 +24819,71 @@ func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples t
 	return op.Output(0)
 }
 
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
+
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+//
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Converts a sparse representation into a dense tensor.
+//
+// Builds an array `dense` with shape `output_shape` such that
+//
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
+//
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
+//
+// Arguments:
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseToDense",
+		Input: []tf.Input{
+			sparse_indices, output_shape, sparse_values, default_value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
 //
 // The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 0cc8e7c3e2c16654852cfa558bbc7e408d9ed1ad..50d5dcef548758dcab4109f8d61a30290892bbf7 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -330,7 +330,7 @@ tf_cc_binary(
         "//tensorflow:debug": [],  # Disable all custom linker options in debug mode
         "//tensorflow:darwin": [
             "-Wl,-exported_symbols_list",  # This line must be directly followed by LINKER_EXPORTED_SYMBOLS
-            LINKER_EXPORTED_SYMBOLS,
+            "$(location {})".format(LINKER_EXPORTED_SYMBOLS),
         ],
         "//tensorflow:windows": [],
         "//tensorflow:windows_msvc": [],
@@ -338,7 +338,7 @@ tf_cc_binary(
             "-z defs",
             "-s",
             "-Wl,--version-script",  #  This line must be directly followed by LINKER_VERSION_SCRIPT
-            LINKER_VERSION_SCRIPT,
+            "$(location {})".format(LINKER_VERSION_SCRIPT),
         ],
     }),
     linkshared = 1,
diff --git a/tensorflow/java/src/gen/cc/op_specs.cc b/tensorflow/java/src/gen/cc/op_specs.cc
index 081062ceaf2d0b7881f4ab90f60f38fb8bb37bda..4bcfc7fe011423df71a899d18815d3558e01b35f 100644
--- a/tensorflow/java/src/gen/cc/op_specs.cc
+++ b/tensorflow/java/src/gen/cc/op_specs.cc
@@ -382,8 +382,7 @@ EndpointSpec CreateEndpoint(const OpDef& op_def, const ApiDef& api_def,
   return EndpointSpec(package,
       name,
       Javadoc::Create(ParseDocumentation(api_def.summary()))
-          .details(ParseDocumentation(api_def.description())),
-      endpoint_def.deprecation_version() > 0);
+          .details(ParseDocumentation(api_def.description())));
 }
 
 }  // namespace
diff --git a/tensorflow/java/src/gen/cc/op_specs.h b/tensorflow/java/src/gen/cc/op_specs.h
index 81582ea207fef9608ccbaebefcd8499a9457bc41..034cf636ed071a9dccac643d0f89988b070a1efc 100644
--- a/tensorflow/java/src/gen/cc/op_specs.h
+++ b/tensorflow/java/src/gen/cc/op_specs.h
@@ -34,11 +34,11 @@ class EndpointSpec {
   // package: package of this endpoint (from which also derives its package)
   // name: name of this endpoint class
   // javadoc: the endpoint class documentation
-  // deprecated: true if this endpoint is now deprecated
+  // TODO(annarev): hardcode depcreated to false until deprecated is possible
   EndpointSpec(const string& package, const string& name,
-      const Javadoc& javadoc, bool deprecated)
+      const Javadoc& javadoc)
     : package_(package), name_(name), javadoc_(javadoc),
-      deprecated_(deprecated) {}
+      deprecated_(false) {}
 
   const string& package() const { return package_; }
   const string& name() const { return name_; }
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 087b89b125037644802fb8beeb40137092254357..7201e12c50c5f6e1f7c84be1986b93349a7aa01e 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -313,8 +313,8 @@ cc_library(
     hdrs = ["util/util.h"],
     deps = [
         ":safe_ptr",
-        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//util/python:python_headers",
     ],
 )
@@ -502,7 +502,10 @@ py_test(
 
 cc_library(
     name = "python_op_gen",
-    srcs = ["framework/python_op_gen.cc"],
+    srcs = [
+        "framework/python_op_gen.cc",
+        "framework/python_op_gen_internal.cc",
+    ],
     hdrs = [
         "framework/python_op_gen.h",
         "framework/python_op_gen_internal.h",
@@ -524,12 +527,12 @@ cc_library(
     srcs = ["framework/python_op_gen_main.cc"],
     visibility = ["//visibility:public"],
     deps = [
+        ":python_op_gen",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:op_gen_lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/python/eager:python_eager_op_gen",
     ],
 )
 
@@ -624,6 +627,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":pywrap_tensorflow",
+        "//tensorflow/core:protos_all_py",
     ],
 )
 
@@ -1474,6 +1478,7 @@ tf_gen_op_wrapper_private_py(
     visibility = [
         "//learning/brain/python/ops:__pkg__",
         "//tensorflow/python/kernel_tests:__pkg__",
+        "//tensorflow/python/training/checkpointable:__pkg__",
     ],
 )
 
@@ -1762,6 +1767,7 @@ py_library(
         ":logging_ops_gen",
         ":math_ops",
         ":platform",
+        ":resource_variable_ops_gen",
         ":sparse_tensor",
         ":tensor_array_ops",
         ":tf_should_use",
@@ -2671,7 +2677,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
-        ":checkpointable",
         ":control_flow_ops",
         ":dtypes",
         ":framework_ops",
@@ -2682,6 +2687,7 @@ py_library(
         ":util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/training/checkpointable:base",
     ],
 )
 
@@ -2968,9 +2974,9 @@ py_library(
         ["training/**/*.py"],
         exclude = [
             "**/*test*",
+            "training/checkpointable/**/*.py",
             # The following targets have their own build rules (same name as the
             # file):
-            "training/checkpointable.py",
             "training/saveable_object.py",
             "training/training_util.py",
         ],
@@ -2980,7 +2986,6 @@ py_library(
         ":array_ops",
         ":array_ops_gen",
         ":checkpoint_ops_gen",
-        ":checkpointable",
         ":client",
         ":control_flow_ops",
         ":data_flow_ops",
@@ -3020,33 +3025,11 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/ops/losses",
         # `layers` dependency only exists due to the use of a small utility.
         "//tensorflow/python/keras:layers",
-    ],
-)
-
-py_library(
-    name = "checkpointable",
-    srcs = ["training/checkpointable.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":array_ops",
-        ":dtypes",
-        ":io_ops_gen",
-        ":ops",
-        ":util",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
-py_test(
-    name = "checkpointable_test",
-    srcs = ["training/checkpointable_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":checkpointable",
-        ":client_testlib",
+        "//tensorflow/python/ops/losses",
+        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/checkpointable:util",
     ],
 )
 
@@ -3086,39 +3069,6 @@ py_library(
     ],
 )
 
-py_test(
-    name = "checkpointable_utils_test",
-    srcs = ["training/checkpointable_utils_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",  # TODO: needs investigation on Windows
-        "notsan",  # b/74395663
-    ],
-    deps = [
-        ":checkpointable",
-        ":constant_op",
-        ":control_flow_ops",
-        ":dtypes",
-        ":framework_ops",
-        ":framework_test_lib",
-        ":init_ops",
-        ":resource_variable_ops",
-        ":session",
-        ":state_ops",
-        ":template",
-        ":training",
-        ":training_util",
-        ":variable_scope",
-        "//tensorflow/python/eager:backprop",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:function",
-        "//tensorflow/python/eager:test",
-        "//tensorflow/python/keras:engine",
-        "//tensorflow/python/keras:layers",
-        "@six_archive//:six",
-    ],
-)
-
 py_test(
     name = "distribute_test",
     size = "small",
@@ -3219,6 +3169,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "util_serialization_test",
+    size = "small",
+    srcs = ["util/serialization_test.py"],
+    main = "util/serialization_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":util",
+    ],
+)
+
 py_test(
     name = "future_api_test",
     size = "small",
@@ -3230,6 +3192,16 @@ py_test(
     ],
 )
 
+py_test(
+    name = "function_utils_test",
+    srcs = ["util/function_utils_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":util",
+    ],
+)
+
 py_test(
     name = "tf_contextlib_test",
     size = "small",
@@ -3525,7 +3497,6 @@ tf_py_wrap_cc(
         "//tensorflow/core/profiler/internal:print_model_analysis",
         "//tensorflow/tools/graph_transforms:transform_graph_lib",
         "//tensorflow/python/eager:pywrap_tfe_lib",
-        "//tensorflow/python/eager:python_eager_op_gen",
         "//util/python:python_headers",
     ] + (tf_additional_lib_deps() +
          tf_additional_plugin_deps() +
@@ -3940,7 +3911,18 @@ cuda_py_test(
         ":math_ops",
         "//tensorflow/core:protos_all_py",
     ],
-    tags = ["noguitar"],
+)
+
+py_test(
+    name = "c_api_util_test",
+    size = "small",
+    srcs = ["framework/c_api_util_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":c_api_util",
+        ":framework_test_lib",
+        ":platform_test",
+    ],
 )
 
 py_test(
@@ -4134,7 +4116,7 @@ cuda_py_test(
 
 py_test(
     name = "saver_large_variable_test",
-    size = "small",
+    size = "medium",
     srcs = ["training/saver_large_variable_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -4216,7 +4198,7 @@ tf_py_test(
 
 py_test(
     name = "basic_session_run_hooks_test",
-    size = "small",
+    size = "medium",
     srcs = ["training/basic_session_run_hooks_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -4472,7 +4454,6 @@ py_library(
         ":variable_scope",
         ":variables",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/estimator:util",
         "//tensorflow/python/keras:engine",
         "//third_party/py/numpy",
     ],
@@ -4509,7 +4490,6 @@ py_library(
         ":variable_scope",
         ":variables",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/estimator:util",
         "//tensorflow/python/keras:layers",
         "//third_party/py/numpy",
         "@six_archive//:six",
diff --git a/tensorflow/python/client/session_clusterspec_prop_test.py b/tensorflow/python/client/session_clusterspec_prop_test.py
index f1934241334e049c1d02e095d371927bec71be14..df020f88a88687ac9616d40618aebb8f7eef2858 100644
--- a/tensorflow/python/client/session_clusterspec_prop_test.py
+++ b/tensorflow/python/client/session_clusterspec_prop_test.py
@@ -42,7 +42,6 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
-ops._USE_C_API = True
 
 # NOTE(mrry): Dummy shape registration for ops used in the tests, since they
 # don't have C++ op registrations on which to attach C++ shape fns.
@@ -77,7 +76,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     config = config_pb2.ConfigProto(cluster_def=cluster_def)
 
     with ops.Graph().as_default() as g, ops.device('/job:worker/task:1'):
-      with ops.device('/cpu:0'):	 
+      with ops.device('/cpu:0'):
         const = constant_op.constant(17)
     sess = session.Session(server1.target, config=config, graph=g)
     run_options = config_pb2.RunOptions(
@@ -459,7 +458,6 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     with self.assertRaises(errors.FailedPreconditionError):
       sess3.run(v)
 
-  @test_util.disable_c_api  # Partial runs don't work with C API
   def testClusterSpecPropagationPartialRun(self):
     """Test successful partial run with ClusterSpec propagation."""
     server1 = server_lib.Server.create_local_server()
diff --git a/tensorflow/python/client/session_list_devices_test.py b/tensorflow/python/client/session_list_devices_test.py
index 38a3acb2dc304968915e84c8054621e441294e61..c5d82c213ac890ac4c968eba506695c3a2ce93c4 100644
--- a/tensorflow/python/client/session_list_devices_test.py
+++ b/tensorflow/python/client/session_list_devices_test.py
@@ -30,8 +30,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import server_lib
 
 
-class SessionListDevicesTestMethods(object):
-  """Mixin with test methods."""
+class SessionListDevicesTest(test_util.TensorFlowTestCase):
 
   def testListDevices(self):
     with session.Session() as sess:
@@ -75,33 +74,5 @@ class SessionListDevicesTestMethods(object):
           '/job:worker/replica:0/task:1/device:CPU:0' in device_names)
 
 
-class SessionListDevicesTest(SessionListDevicesTestMethods,
-                             test_util.TensorFlowTestCase):
-  """Test case that invokes test methods with _USE_C_API=False."""
-
-  def setUp(self):
-    self.prev_use_c_api = ops._USE_C_API
-    ops._USE_C_API = False
-    super(SessionListDevicesTest, self).setUp()
-
-  def tearDown(self):
-    ops._USE_C_API = self.prev_use_c_api
-    super(SessionListDevicesTest, self).tearDown()
-
-
-class SessionListDevicesWithCApiTest(SessionListDevicesTestMethods,
-                                     test_util.TensorFlowTestCase):
-  """Test case that invokes test methods with _USE_C_API=True."""
-
-  def setUp(self):
-    self.prev_use_c_api = ops._USE_C_API
-    ops._USE_C_API = True
-    super(SessionListDevicesWithCApiTest, self).setUp()
-
-  def tearDown(self):
-    ops._USE_C_API = self.prev_use_c_api
-    super(SessionListDevicesWithCApiTest, self).tearDown()
-
-
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/client/session_partial_run_test.py b/tensorflow/python/client/session_partial_run_test.py
index 6a389b078a54adea18bedb1e0412835c0e997a7f..92ca47efa9348f4ac77f2b22e684080eccb38617 100644
--- a/tensorflow/python/client/session_partial_run_test.py
+++ b/tensorflow/python/client/session_partial_run_test.py
@@ -39,7 +39,7 @@ from tensorflow.python.training import server_lib
 ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
 
 
-class PartialRunTestMethods(object):
+class PartialRunTest(test_util.TensorFlowTestCase):
 
   def RunTestPartialRun(self, sess):
     a = array_ops.placeholder(dtypes.float32, shape=[])
@@ -283,32 +283,5 @@ class PartialRunTestMethods(object):
     self.RunTestPartialRunEmptyFetches(session.Session(server.target))
 
 
-class PartialRunTest(PartialRunTestMethods, test_util.TensorFlowTestCase):
-  """Test case that invokes test methods with _USE_C_API=False."""
-
-  def setUp(self):
-    self.prev_use_c_api = ops._USE_C_API
-    ops._USE_C_API = False
-    super(PartialRunTest, self).setUp()
-
-  def tearDown(self):
-    ops._USE_C_API = self.prev_use_c_api
-    super(PartialRunTest, self).tearDown()
-
-
-class PartialRunWithCApiTest(PartialRunTestMethods,
-                             test_util.TensorFlowTestCase):
-  """Test case that invokes test methods with _USE_C_API=True."""
-
-  def setUp(self):
-    self.prev_use_c_api = ops._USE_C_API
-    ops._USE_C_API = True
-    super(PartialRunWithCApiTest, self).setUp()
-
-  def tearDown(self):
-    ops._USE_C_API = self.prev_use_c_api
-    super(PartialRunWithCApiTest, self).tearDown()
-
-
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 92497272c66b5c3be36aba75b9e3b7f3d99b062d..482497078cd3e0544b7465fc7c0be0dc81b5ff6a 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -62,7 +62,6 @@ from tensorflow.python.util import compat
 ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
 
 
-@test_util.with_c_api
 class SessionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -173,21 +172,6 @@ class SessionTest(test_util.TensorFlowTestCase):
         # Run with a bogus handle.
         s.partial_run('foo', r1, feed_dict={a: 1, b: 2})
 
-  def testOpConstructionErrorPayload(self):
-    if ops._USE_C_API:
-      return  # No shape registration for 'ConstructionFails'
-
-    with session.Session():
-      failing_op = ops.get_default_graph().create_op(
-          'ConstructionFails', [], [], name='f')
-
-      def exc_predicate(e):
-        return (e.op == failing_op and
-                e.error_code == error_codes_pb2.INVALID_ARGUMENT)
-
-      with self.assertRaisesOpError(exc_predicate):
-        failing_op.run()
-
   def testErrorBasedOn(self):
     with session.Session() as sess:
       a = constant_op.constant(0.0, shape=[2, 3])
@@ -1084,10 +1068,7 @@ class SessionTest(test_util.TensorFlowTestCase):
           if gdef is None:
             gdef = graph.as_graph_def()
           else:
-            # NOTE(skyewm): import_graph_def breaks the running threads without
-            # the C API enabled. This is not a regression so I didn't fix it.
-            if ops._USE_C_API:
-              importer.import_graph_def(gdef, name='import')
+            importer.import_graph_def(gdef, name='import')
 
       stop.set()
       for t in threads:
@@ -1584,10 +1565,6 @@ class SessionTest(test_util.TensorFlowTestCase):
         self.assertEquals(len(run_metadata.step_stats.dev_stats), 1)
 
   def testFeedShapeCompatibility(self):
-    # TODO(nolivia): C API doesn't yet handle marking nodes as not feedable.
-    if ops._USE_C_API:
-      return
-
     with session.Session() as sess:
       some_tensor = constant_op.constant([2.0, 2.0, 2.0, 2.0])
       new_shape = constant_op.constant([2, 2])
@@ -1596,7 +1573,10 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, 'Cannot feed value of shape'):
         sess.run(reshaped_tensor, feed_dict={some_tensor: [1.0, 2.0, 3.0]})
 
-      with self.assertRaisesRegexp(ValueError, 'may not be fed'):
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          'Input to reshape is a tensor with 4 values, '
+          'but the requested shape has 21'):
         sess.run(reshaped_tensor, feed_dict={new_shape: [3, 7]})
 
   def testInferShapesFalse(self):
diff --git a/tensorflow/python/client/virtual_gpu_test.py b/tensorflow/python/client/virtual_gpu_test.py
index addf63474c9ba213cf0c1eeffa9d31e94f15eac1..52e1b56886f83c3e15bcf918fbc5e1f88ba26ed6 100644
--- a/tensorflow/python/client/virtual_gpu_test.py
+++ b/tensorflow/python/client/virtual_gpu_test.py
@@ -192,7 +192,6 @@ class VirtualGpuTestUtil(object):
     return True
 
 
-@test_util.with_c_api
 class VirtualGpuTest(test_util.TensorFlowTestCase):
 
   def __init__(self, method_name):
@@ -236,7 +235,7 @@ class VirtualGpuTest(test_util.TensorFlowTestCase):
     with self.test_session(config=self._util.config) as sess:
       if not test.is_gpu_available(cuda_only=True):
         self.skipTest('No GPU available')
-      for _ in range(10):
+      for _ in range(5):
         if not self._util.TestRandomGraph(sess):
           return
 
diff --git a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
index 6aabad2f574551cbdc152fe378eb9dc0f5f71995..296a76ec887ae7c31cb9d0bd2afd6d1fe827d95c 100644
--- a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
@@ -32,9 +32,12 @@ from tensorflow.python.platform import test
 
 class DatasetConstructorTest(test.TestCase):
 
-  def _testFromGenerator(self, generator, elem_sequence, num_repeats):
+  def _testFromGenerator(self, generator, elem_sequence, num_repeats,
+                         output_types=None):
+    if output_types is None:
+      output_types = dtypes.int64
     iterator = (
-        dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64)
+        dataset_ops.Dataset.from_generator(generator, output_types=output_types)
         .repeat(num_repeats)
         .prefetch(5)
         .make_initializable_iterator())
@@ -84,8 +87,8 @@ class DatasetConstructorTest(test.TestCase):
   def testFromGeneratorUsingNdarray(self):
     generator = lambda: np.arange(100, dtype=np.int64)
     elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
+    self._testFromGenerator(generator, elem_sequence, 1, output_types=np.int64)
+    self._testFromGenerator(generator, elem_sequence, 5, output_types=np.int64)
 
   def testFromGeneratorUsingGeneratorExpression(self):
     # NOTE(mrry): Generator *expressions* are not repeatable (or in
@@ -357,6 +360,65 @@ class DatasetConstructorTest(test.TestCase):
       # iterator terminates (and the generator iterator is deleted).
       self.assertTrue(event.is_set())
 
+  def testFromGeneratorWithArgs(self):
+
+    def flat_map_fn(elem):
+
+      def generator_with_arg(n):
+        for _ in range(n):
+          yield np.array(n, dtype=np.int64)
+
+      return dataset_ops.Dataset.from_generator(
+          generator_with_arg, output_types=dtypes.int64, output_shapes=(),
+          args=(elem,))
+
+    iterator = (dataset_ops.Dataset
+                .range(5)
+                .flat_map(flat_map_fn)
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      expected = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
+      for x in expected:
+        self.assertEqual(x, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromGeneratorWithTwoArgs(self):
+
+    def flat_map_fn(elem, message):
+
+      def generator_with_arg(n, msg):
+        for i in range(n):
+          yield i, msg
+
+      return dataset_ops.Dataset.from_generator(
+          generator_with_arg, output_types=(dtypes.int64, dtypes.string),
+          output_shapes=((), ()), args=(elem, message))
+
+    iterator = (
+        dataset_ops.Dataset.zip(
+            (dataset_ops.Dataset.range(5),
+             dataset_ops.Dataset.from_tensors("Hi!").repeat(None)))
+        .flat_map(flat_map_fn)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      expected = [(0, b"Hi!"),
+                  (0, b"Hi!"), (1, b"Hi!"),
+                  (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"),
+                  (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"), (3, b"Hi!")]
+      for x in expected:
+        self.assertEqual(x, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
   def testGeneratorDatasetFinalizeFunctionCalled(self):
     # NOTE(mrry): This test tests the internal `_GeneratorDataset`,
     # which affords more control over what the finalize function can do than
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index bd9686f692102df4ef64f0e81c33d2dec1b2222c..6f9b12b12323fa5bcd0ebc7d383cdcec8aed72cc 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -18,7 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
-import collections
 import threading
 
 import numpy as np
@@ -259,25 +258,32 @@ class Dataset(object):
       self._generator = generator
       self._lock = threading.Lock()
       self._next_id = 0  # GUARDED_BY(self._lock)
-      self._iterators = collections.defaultdict(lambda: iter(generator()))
+      self._args = {}
+      self._iterators = {}
 
-    def get_next_id(self):
+    def get_next_id(self, *args):
       with self._lock:
         ret = self._next_id
         self._next_id += 1
+      self._args[ret] = args
       # NOTE(mrry): Explicitly create an array of `np.int64` because implicit
       # casting in `py_func()` will create an array of `np.int32` on Windows,
       # leading to a runtime error.
       return np.array(ret, dtype=np.int64)
 
     def get_iterator(self, iterator_id):
-      return self._iterators[iterator_id]
+      try:
+        return self._iterators[iterator_id]
+      except KeyError:
+        iterator = iter(self._generator(*self._args.pop(iterator_id)))
+        self._iterators[iterator_id] = iterator
+        return iterator
 
     def iterator_completed(self, iterator_id):
       del self._iterators[iterator_id]
 
   @staticmethod
-  def from_generator(generator, output_types, output_shapes=None):
+  def from_generator(generator, output_types, output_shapes=None, args=None):
     """Creates a `Dataset` whose elements are generated by `generator`.
 
     The `generator` argument must be a callable object that returns
@@ -320,13 +326,17 @@ class Dataset(object):
     `Dataset.from_generator()`.
 
     Args:
-      generator: A callable object that takes no arguments and returns an
-        object that supports the `iter()` protocol.
+      generator: A callable object that returns an object that supports the
+        `iter()` protocol. If `args` is not specified, `generator` must take
+        no arguments; otherwise it must take as many arguments as there are
+        values in `args`.
       output_types: A nested structure of `tf.DType` objects corresponding to
         each component of an element yielded by `generator`.
       output_shapes: (Optional.) A nested structure of `tf.TensorShape`
         objects corresponding to each component of an element yielded by
         `generator`.
+      args: (Optional.) A tuple of `tf.Tensor` objects that will be evaluated
+        and passed to `generator` as NumPy-array arguments.
 
     Returns:
       Dataset: A `Dataset`.
@@ -339,8 +349,12 @@ class Dataset(object):
     else:
       output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
+    if args is None:
+      args = ()
+    else:
+      args = tuple(ops.convert_n_to_tensor(args, name="args"))
 
-    flattened_types = nest.flatten(output_types)
+    flattened_types = [dtypes.as_dtype(dt) for dt in nest.flatten(output_types)]
     flattened_shapes = nest.flatten(output_shapes)
 
     generator_state = Dataset._GeneratorState(generator)
@@ -359,7 +373,7 @@ class Dataset(object):
         `generator_state`.
       """
       return script_ops.py_func(
-          generator_state.get_next_id, [], dtypes.int64, stateful=True)
+          generator_state.get_next_id, args, dtypes.int64, stateful=True)
 
     def generator_next_fn(iterator_id_t):
       """Generates the next element from iterator with ID `iterator_id_t`.
@@ -726,7 +740,6 @@ class Dataset(object):
     d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
     d = d.repeat(FLAGS.num_epochs)
     d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.repeat()
     d = d.interleave(tf.data.TFRecordDataset,
                      cycle_length=FLAGS.num_readers, block_length=1)
     d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 0c76afd29d4626be9120c059d60218daab5cc0ac..fd164277b6fd7509403d84c50a62638df7968a03 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -52,6 +52,9 @@ GET_NEXT_CALL_WARNING_MESSAGE = (
     "`next_element` as the input to some computation that is invoked inside "
     "the loop.")
 
+# Collection of all IteratorResources in the `Graph`.
+GLOBAL_ITERATORS = "iterators"
+
 
 @tf_export("data.Iterator")
 class Iterator(object):
@@ -75,8 +78,7 @@ class Iterator(object):
       output_shapes: A nested structure of `tf.TensorShape` objects
         corresponding to each component of an element of this dataset.
       output_classes: A nested structure of Python `type` object corresponding
-        to each
-        component of an element of this iterator.
+        to each component of an element of this iterator.
     """
     self._iterator_resource = iterator_resource
     self._initializer = initializer
@@ -86,6 +88,7 @@ class Iterator(object):
     self._string_handle = gen_dataset_ops.iterator_to_string_handle(
         self._iterator_resource)
     self._get_next_call_count = 0
+    ops.add_to_collection(GLOBAL_ITERATORS, self._iterator_resource)
 
   @staticmethod
   def from_structure(output_types,
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index 7ee3d92cadd5d7081f05f9e8c6cb7a70c8c661dc..32e08021dc80d11baaead68ea062b6dab7a8dfdd 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -17,19 +17,16 @@
 """## Functions for working with arbitrarily nested sequences of elements.
 
 NOTE(mrry): This fork of the `tensorflow.python.util.nest` module
-makes three changes:
+makes two changes:
 
-1. It adds support for dictionaries as a level of nesting in nested structures.
-2. It removes support for lists as a level of nesting in nested structures.
-3. It adds support for `SparseTensorValue` as an atomic element.
+1. It removes support for lists as a level of nesting in nested structures.
+2. It adds support for `SparseTensorValue` as an atomic element.
 
-The motivation for this change is threefold:
+The motivation for this change is twofold:
 
-1. Many input-processing functions (e.g. `tf.parse_example()`) return
-   dictionaries, and we would like to support them natively in datasets.
-2. It seems more natural for lists to be treated (e.g. in Dataset constructors)
+1. It seems more natural for lists to be treated (e.g. in Dataset constructors)
    as tensors, rather than lists of (lists of...) tensors.
-3. This is needed because `SparseTensorValue` is implemented as a `namedtuple`
+2. This is needed because `SparseTensorValue` is implemented as a `namedtuple`
    that would normally be flattened and we want to be able to create sparse
    tensor from `SparseTensorValue's similarly to creating tensors from numpy
    arrays.
@@ -43,6 +40,7 @@ import collections as _collections
 
 import six as _six
 
+from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow
 from tensorflow.python.framework import sparse_tensor as _sparse_tensor
 
 
@@ -99,15 +97,6 @@ def _yield_value(iterable):
       yield value
 
 
-def _yield_flat_nest(nest):
-  for n in _yield_value(nest):
-    if is_sequence(n):
-      for ni in _yield_flat_nest(n):
-        yield ni
-    else:
-      yield n
-
-
 def is_sequence(seq):
   """Returns a true if `seq` is a Sequence or dict (except strings/lists).
 
@@ -123,9 +112,7 @@ def is_sequence(seq):
     True if the sequence is a not a string or list and is a
     collections.Sequence.
   """
-  return (isinstance(seq, (_collections.Sequence, dict)) and
-          not isinstance(seq, _sparse_tensor.SparseTensorValue) and
-          not isinstance(seq, (list, _six.string_types)))
+  return _pywrap_tensorflow.IsSequenceForData(seq)
 
 
 def flatten(nest):
@@ -140,7 +127,7 @@ def flatten(nest):
   Returns:
     A Python list, the flattened version of the input.
   """
-  return list(_yield_flat_nest(nest)) if is_sequence(nest) else [nest]
+  return _pywrap_tensorflow.FlattenForData(nest)
 
 
 def _recursive_assert_same_structure(nest1, nest2, check_types):
@@ -536,4 +523,3 @@ def map_structure_up_to(shallow_tree, func, *inputs):
 
   results = [func(*tensors) for tensors in zip(*all_flattened_up_to)]
   return pack_sequence_as(structure=shallow_tree, flat_sequence=results)
-
diff --git a/tensorflow/python/debug/examples/debug_tflearn_iris.py b/tensorflow/python/debug/examples/debug_tflearn_iris.py
index 00090b21fe35acb57d121ad4d46f00373bf92883..7cbaae46b4f60f51e95fd8a1109bd100fc42aa21 100644
--- a/tensorflow/python/debug/examples/debug_tflearn_iris.py
+++ b/tensorflow/python/debug/examples/debug_tflearn_iris.py
@@ -140,7 +140,7 @@ def main(_):
 
   # Make predictions, using tfdbg hook.
   predict_results = classifier.predict(test_input_fn, hooks=hooks)
-  print("A prediction result: %s" % predict_results.next())
+  print("A prediction result: %s" % next(predict_results))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index b3268c9047e264b8264ae37b404b51be6a88962f..5530193d4e1dd8f351b60c06502b2406c2c75d33 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -25,6 +25,7 @@ cc_library(
         "//tensorflow/c/eager:c_api_internal",
         "//tensorflow/c/eager:tape",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/python:ndarray_tensor",
         "//tensorflow/python:ndarray_tensor_bridge",
         "//tensorflow/python:numpy_lib",
@@ -191,22 +192,6 @@ py_library(
     ],
 )
 
-cc_library(
-    name = "python_eager_op_gen",
-    srcs = ["python_eager_op_gen.cc"],
-    hdrs = ["python_eager_op_gen.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:op_gen_lib",
-        "//tensorflow/core:proto_text",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/python:python_op_gen",
-    ],
-)
-
 py_library(
     name = "graph_only_ops",
     srcs = ["graph_only_ops.py"],
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index d04b004451223abd07b50c384d0cd83dce4dcb2d..dcfd03b4583efab06beca712d6cbabb635967136 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -37,7 +37,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
@@ -358,6 +360,8 @@ def gradients_function(f, params=None):
   assert y_grad.numpy() == (2 ** 3) - 2 * 2 * 3
   ```
 
+  Note that only tensors with real or complex dtypes are differentiable.
+
   Args:
    f: function to be differentiated. If `f` returns a scalar, this scalar will
      be differentiated. If `f` returns a tensor or list of tensors, by default
@@ -666,19 +670,19 @@ class GradientTape(object):
   be computed as:
 
   ```python
-  x = tf.constant(3.)
-  with tfe.GradientTape() as g:
+  x = tf.constant(3.0)
+  with tf.GradientTape() as g:
     g.watch(x)
     y = x * x
-  grad = g.gradient(y, [x])[0] # Will compute to 6.0
+  dy_dx = g.gradient(y, x) # Will compute to 6.0
   ```
 
   GradientTapes can be nested to compute higher-order derivatives. For example,
 
   ```python
   x = tf.constant(3.0)
-  with tfe.GradientTape() as g:
-    with tfe.GradientTape() as gg:
+  with tf.GradientTape() as g:
+    with tf.GradientTape() as gg:
       gg.watch(x)
       y = x * x
     dy_dx = gg.gradient(y, x)     # Will compute to 6.0
@@ -693,13 +697,16 @@ class GradientTape(object):
 
   ```python
   x = tf.constant(3.0)
-  with tfe.GradientTape(persistent=True) as g:
+  with tf.GradientTape(persistent=True) as g:
     g.watch(x)
     y = x * x
     z = y * y
   dz_dx = g.gradient(z, x)  # 108.0 (4*x^3 at x = 3)
   dy_dx = g.gradient(y, x)  # 6.0
   del g  # Drop the reference to the tape
+  ```
+
+  Note that only tensors with real or complex dtypes are differentiable.
   """
 
   def __init__(self, persistent=False):
@@ -712,13 +719,29 @@ class GradientTape(object):
     """
     self._tape = None
     self._persistent = persistent
+    self._recording = False
 
   def __enter__(self):
-    self._tape = tape.push_new_tape(persistent=self._persistent)
+    """Enters a context inside which operations are recorded on this tape."""
+    self._push_tape()
     return self
 
   def __exit__(self, typ, value, traceback):
+    """Exits the recording context, no further operations are traced."""
+    if self._recording:
+      self._pop_tape()
+
+  def _push_tape(self):
+    if self._recording:
+      raise ValueError("Tape is already recording.")
+    self._tape = tape.push_new_tape(persistent=self._persistent)
+    self._recording = True
+
+  def _pop_tape(self):
+    if not self._recording:
+      raise ValueError("Tape is not recording.")
     tape.pop_tape(self._tape)
+    self._recording = False
 
   def watch(self, tensor):
     """Ensures that `tensor` is being traced by this tape.
@@ -729,6 +752,72 @@ class GradientTape(object):
     for t in nest.flatten(tensor):
       tape.watch(_handle_or_self(t))
 
+  @tf_contextlib.contextmanager
+  def stop_recording(self):
+    """Temporarily stops recording operations on this tape.
+
+    Operations executed while this context manager is active will not be
+    recorded on the tape. This is useful for reducing the memory used by tracing
+    all computations.
+
+    For example:
+
+    ```
+      with tf.GradientTape(persistent=True) as t:
+        loss = compute_loss(model)
+        with t.stop_recording():
+          # The gradient computation below is not traced, saving memory.
+          grads = t.gradient(loss, model.variables)
+    ```
+
+    Yields:
+      None
+    Raises:
+      RuntimeError: if the tape is not currently recording.
+    """
+    if self._tape is None:
+      raise RuntimeError(
+          "Trying to stop recording a tape which is not recording.")
+    self._pop_tape()
+    try:
+      yield
+    finally:
+      self._push_tape()
+
+  def reset(self):
+    """Clears all information stored in this tape.
+
+    Equivalent to exiting and reentering the tape context manager with a new
+    tape. For example, the two following code blocks are equivalent:
+    ```
+    with tf.GradientTape() as t:
+      loss = loss_fn()
+    with tf.GradientTape() as t:
+      loss += other_loss_fn()
+    t.gradient(loss, ...)  # Only differentiates other_loss_fn, not loss_fn
+
+
+    # The following is equivalent to the above
+    with tf.GradientTape() as t:
+      loss = loss_fn()
+      t.reset()
+      loss += other_loss_fn()
+    t.gradient(loss, ...)  # Only differentiates other_loss_fn, not loss_fn
+    ```
+
+    This is useful if you don't want to exit the context manager for the tape,
+    or can't because the desired reset point is inside a control flow construct:
+
+    ```
+    with tf.GradientTape() as t:
+      loss = ...
+      if loss > k:
+        t.reset()
+    ```
+    """
+    self._pop_tape()
+    self._push_tape()
+
   def watched_variables(self):
     # Sorting variables by id, which is monotonically increasing in construction
     # order. This ensures unique order across executions.
@@ -756,9 +845,23 @@ class GradientTape(object):
        than once on a non-persistent tape.
     """
     if self._tape is None:
-      raise RuntimeError("GradientTape.gradient can only be called once "
-                         "on non-persistent tapes, and "
-                         "only when the context manager has exited.")
+      raise RuntimeError("GradientTape.gradient can only be called once on "
+                         "non-persistent tapes.")
+    if self._recording:
+      if not self._persistent:
+        self._pop_tape()
+      else:
+        logging.log_first_n(logging.WARN,
+                            "Calling GradientTape.gradient on a persistent "
+                            "tape inside it's context is significantly less "
+                            "efficient than calling it outside the context (it "
+                            "causes the gradient ops to be recorded on the "
+                            "tape, leading to increased CPU and memory usage). "
+                            "Only call GradientTape.gradient inside the "
+                            "context if you actually want to trace the "
+                            "gradient in order to compute higher order "
+                            "derrivatives.", 1)
+
     flat_sources = nest.flatten(sources)
     flat_sources = [_handle_or_self(x) for x in flat_sources]
 
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 8d9959fe20768c9d9b7e961e375e9a8f094d5f5b..826c6683b9668ab892883119a533ee8d497d7b58 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -96,6 +96,18 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(grads_and_vars[0][0], 1.0)
     self.assertAllEqual(id(grads_and_vars[0][1]), id(x))
 
+  def testWhereGradient(self):
+    # Note: where is special because only some of its arguments are of
+    # differentiable dtypes.
+
+    def f(x):
+      return array_ops.where(x < 10, x, x * x)
+
+    g = backprop.gradients_function(f)
+
+    self.assertAllEqual(g(5.)[0], 1.0)
+    self.assertAllEqual(g(50.)[0], 100.0)
+
   def testTwoTargets(self):
     with backprop.GradientTape() as t:
       x = constant_op.constant(3.0)
@@ -124,6 +136,14 @@ class BackpropTest(test.TestCase):
     grad_fn = backprop.gradients_function(f)
     self.assertAllEqual(2., grad_fn(1., dy=2.)[0])
 
+  def testGradientInteger(self):
+
+    def f(x):
+      return x + x
+
+    int_tensor = constant_op.constant(1)
+    self.assertEqual(backprop.gradients_function(f)(int_tensor)[0], None)
+
   def testErrors(self):
 
     @custom_gradient.custom_gradient
@@ -201,6 +221,21 @@ class BackpropTest(test.TestCase):
     self.assertTrue(ordered_variables[0] is v0)
     self.assertTrue(ordered_variables[1] is v1)
 
+  def testTapeStopRecording(self):
+    with backprop.GradientTape() as t:
+      x = constant_op.constant(1.0)
+      with t.stop_recording():
+        y = x * x
+    self.assertEqual(t.gradient(y, x), None)
+
+  def testTapeReset(self):
+    with backprop.GradientTape() as t:
+      v = resource_variable_ops.ResourceVariable(1.0)
+      loss = v * v
+      t.reset()
+      loss += v * v
+    self.assertAllEqual(t.gradient(loss, v), 2.0)
+
   @test_util.assert_no_new_tensors
   def testGradientNone(self):
 
@@ -222,12 +257,22 @@ class BackpropTest(test.TestCase):
     self.evaluate(v1.initializer)
     with backprop.GradientTape() as t:
       loss = 2 * v1
-      with self.assertRaises(RuntimeError):
-        t.gradient(loss, [v1])
+      grad = t.gradient(loss, v1)
+    self.assertAllEqual(self.evaluate(grad), 2.0)
+
     with backprop.GradientTape(persistent=True) as t:
       loss = 2 * v1
-      grad = t.gradient(loss, [v1])
-    self.assertAllEqual(self.evaluate(grad[0]), 2.0)
+      grad = t.gradient(loss, v1)
+    self.assertAllEqual(self.evaluate(grad), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNestedSelfContexts(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    self.evaluate(v1.initializer)
+    with backprop.GradientTape() as t:
+      with self.assertRaises(ValueError):
+        with t:
+          pass
 
   @test_util.assert_no_new_tensors
   def testSecondGrad(self):
@@ -521,6 +566,23 @@ class BackpropTest(test.TestCase):
     self.assertEqual(self.evaluate(dy_dx), 2 * 3)
     del g
 
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
+  def testHigherOrderGradient(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant(3.0)
+      g.watch(x)
+      y = x ** 3                      # y       := x^3
+      dy_dx = g.gradient(y, x)        # dy/dx   := 3x^2
+      d2y_dx2 = g.gradient(dy_dx, x)  # d2y/dx2 := 6x
+    d3y_dx3 = g.gradient(d2y_dx2, x)  # d3y/dx3 := 6
+    x = 3
+    self.assertEqual(self.evaluate(y), x ** 3)
+    self.assertEqual(self.evaluate(dy_dx), 3 * x ** 2)
+    self.assertEqual(self.evaluate(d2y_dx2), 6 * x)
+    self.assertEqual(self.evaluate(d3y_dx3), 6)
+    del g
+
   @test_util.assert_no_new_tensors
   @test_util.run_in_graph_and_eager_modes()
   def testPersistentNestedTape(self):
@@ -552,6 +614,18 @@ class BackpropTest(test.TestCase):
     grad = g.gradient(y, [v])[0]
     self.assertAllEqual(self.evaluate(grad), 2.0)
 
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
+  def testNestedGradients(self):
+    x = constant_op.constant(3.0)
+    with backprop.GradientTape() as g:
+      g.watch(x)
+      y = x * x
+      z = y * y
+    dz_dx, dz_dy = g.gradient(z, [x, y])
+    self.assertEqual(self.evaluate(dz_dx), 108.0)
+    self.assertEqual(self.evaluate(dz_dy), 18.0)
+
   @test_util.assert_no_new_tensors
   def testEmptyParamsForValueAndGradFunction(self):
     def fn(a, b):
@@ -753,7 +827,7 @@ class BackpropTest(test.TestCase):
       return result, grad
 
     x = resource_variable_ops.ResourceVariable(
-        initial_value=3, name='X.' + self.id())
+        initial_value=3., name='X.' + self.id())
 
     def f():
       return my_square(x)
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 741bd2ac9c911f846a04bd54440d321973b9f544..120b298171b6272fdd63f76a538fa2c92a58616d 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -23,6 +23,7 @@ import collections
 
 import numpy as np
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import function_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
@@ -102,13 +103,15 @@ class CapturingGraph(ops.Graph):
   def clear_resource_control_flow_state(self):
     self._last_op_using_resource_tensor = {}
 
-  def maybe_capture_tensor(self, tensor):
+  def capture(self, tensor, name=None):
     if isinstance(tensor, ops.EagerTensor):
-      return capture_value(
-          self.captures, tensor, tensor.dtype, str(ops.uid()))
+      if name is None:
+        name = str(ops.uid())
+      return capture_value(self.captures, tensor, tensor.dtype, name)
     if tensor.graph is not self:
-      return capture_value(
-          self.captures, tensor, tensor.dtype, tensor.op.name)
+      if name is None:
+        name = tensor.op.name
+      return capture_value(self.captures, tensor, tensor.dtype, name)
     return tensor
 
   def create_op(
@@ -126,7 +129,7 @@ class CapturingGraph(ops.Graph):
     # forward the resources such as Identity and Switch can cause serialization
     # to fail.
     for i, inp in enumerate(inputs):
-      inputs[i] = self.maybe_capture_tensor(inp)
+      inputs[i] = self.capture(inp)
     return super(CapturingGraph, self).create_op(
         op_type, inputs, dtypes, input_types, name, attrs, op_def,
         compute_shapes, compute_device)
@@ -225,7 +228,7 @@ def _inference_name(n):
 class _EagerDefinedFunction(object):
   """Function object with the interface of tf _DefinedFunction."""
 
-  def __init__(self, name, graph, operations, inputs, outputs):
+  def __init__(self, name, graph, operations, inputs, outputs, attrs):
     """Initializes an eager defined function.
 
     Args:
@@ -235,6 +238,7 @@ class _EagerDefinedFunction(object):
         which will be in the function
       inputs: the tensors in the graph to be used as inputs to the function
       outputs: the tensors in the graph which will be outputs to the function
+      attrs: dict mapping names of attributes to their AttrValue values
     """
     fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
         graph._c_graph,  # pylint: disable=protected-access
@@ -246,6 +250,14 @@ class _EagerDefinedFunction(object):
         [],
         None,
         compat.as_str(""))
+
+    for name, attr_value in attrs.items():
+      serialized = attr_value.SerializeToString()
+      # TODO(iga): this creates and deletes a new TF_Status for every attr.
+      # It might be worth creating a convenient way to re-use status.
+      pywrap_tensorflow.TF_FunctionSetAttrValueProto(
+          fn, compat.as_str(name), serialized)
+
     # TODO(apassos) avoid creating a FunctionDef (specially to grab the
     # signature, but also in general it's nice not to depend on it.
     with c_api_util.tf_buffer() as buffer_:
@@ -287,25 +299,6 @@ def _flatten(sequence):
 
 class GraphModeFunction(object):
   """Callable object representing a graph-mode function.
-
-  Args:
-    name: str the name of the created function
-    input_placeholders: list of placeholder values (tensors) to feed when
-      calling the wrapped function.
-    extra_inputs: Tensor inputs this function definition closed over which
-      are passed as arguments. Need to track so gradients are supported
-      correctly.
-    graph: the Graph from which the operations will be pulled. Used as
-      a context when computing gradients.
-    operations: the subset of Operations in the graph used in the function
-      definition.
-    outputs: a flat list of the Tensors in the graph used as outputs to the
-      function
-    func_outputs: a possibly nested python object which will be returned by
-      this function. The Tensors in this structure will be replaced by their
-      corresponding values in outputs.
-    output_shapes: List of shapes of all tensors in outputs
-    variables: (optional) List of variables to watch during function execution.
   """
 
   def __init__(self,
@@ -317,9 +310,36 @@ class GraphModeFunction(object):
                outputs,
                func_outputs,
                output_shapes,
-               variables=None):
+               variables=None,
+               attrs=None):
+    """Initialize a GraphModeFunction.
+
+    Args:
+      name: str the name of the created function
+      input_placeholders: list of placeholder values (tensors) to feed when
+        calling the wrapped function.
+      extra_inputs: Tensor inputs this function definition closed over which
+        are passed as arguments. Need to track so gradients are supported
+        correctly.
+      graph: the Graph from which the operations will be pulled. Used as
+        a context when computing gradients.
+      operations: the subset of Operations in the graph used in the function
+        definition.
+      outputs: a flat list of the Tensors in the graph used as outputs to the
+        function
+      func_outputs: a possibly nested python object which will be returned by
+        this function. The Tensors in this structure will be replaced by their
+        corresponding values in outputs.
+      output_shapes: List of shapes of all tensors in outputs
+      variables: (optional) List of variables to watch during function
+        execution.
+      attrs: (optional) dict mapping names of attributes to their AttrValue
+        values. Attributes in `attrs` will be included in this function's
+        definition.
+    """
+    self._attrs = attrs or {}
     defined_function = _EagerDefinedFunction(
-        name, graph, operations, input_placeholders, outputs)
+        name, graph, operations, input_placeholders, outputs, self._attrs)
     if len(input_placeholders) != len(defined_function.signature.input_arg):
       raise ValueError("Internal error: invalid lengths. %s %s" % (
           len(input_placeholders), len(defined_function.signature.input_arg)))
@@ -372,7 +392,7 @@ class GraphModeFunction(object):
     forward_name = _forward_name(self._func_name)
     self._forward_fdef = _EagerDefinedFunction(
         forward_name, self._graph, self._ops, self._input_placeholders,
-        filtered_outputs + captures)
+        filtered_outputs + captures, self._attrs)
     all_inputs = self._out_grad_placeholders + captures
     # Excluding input ops from the body as we do not intend to execute these
     # operations when the function is executed.
@@ -386,7 +406,7 @@ class GraphModeFunction(object):
     bname = _backward_name(self._func_name)
     self._backward_function = GraphModeFunction(
         bname, all_inputs, [], self._graph, function_def_ops,
-        backward_outputs, in_gradients, output_shapes)
+        backward_outputs, in_gradients, output_shapes, attrs=self._attrs)
 
   def _backprop_call(self, args):
     """Calls the wrapped function and records the result on a tape."""
@@ -560,7 +580,7 @@ def _get_defun_inputs(args):
   return nest.pack_sequence_as(args, ret)
 
 
-def _defun_internal(name, func, args, kwds):
+def _defun_internal(name, func, compiled, args, kwds):
   """Defines and returns graph-mode version of func."""
   graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
   with context.graph_mode():
@@ -598,7 +618,7 @@ def _defun_internal(name, func, args, kwds):
       # call to convert_to_tensor, so we manually capture all such tensors.
       outputs_list = _flatten(func_outputs)
       func_def_outputs = [
-          tmp_graph.maybe_capture_tensor(x) for x in outputs_list
+          tmp_graph.capture(x) for x in outputs_list
           if x is not None
       ]
 
@@ -625,9 +645,14 @@ def _defun_internal(name, func, args, kwds):
     for f in tmp_graph._functions.values():  # pylint: disable=protected-access
       # TODO(ashankar): What about the gradient registry?
       _register(f._c_func.func)  # pylint: disable=protected-access
+
+  attrs = {}
+  if compiled:
+    attrs["_XlaCompile"] = attr_value_pb2.AttrValue(b=True)
+
   return GraphModeFunction(
       fname, all_inputs, extra_inputs, tmp_graph, operations, func_def_outputs,
-      func_outputs, output_shapes, variables)
+      func_outputs, output_shapes, variables, attrs)
 
 
 # Defun uses this instead of Tensor as a cache key. Using dtype because
@@ -669,15 +694,16 @@ def _register(fn):
 
 
 # TODO(apassos): better error messages for non-hashable arguments.
-def named_defun(func, name):
+def named_defun(func, name, compiled=False):
   """Defines a function with a given name.
 
-  See the documentation for `defun` for more information on the semantics of the
-  function.
+  See the documentation for `defun` for more information on the semantics of
+  this function.
 
   Args:
     func: the function to be wrapped.
     name: the name given to it.
+    compiled: if true, the framework will attempt to compile func with XLA.
 
   Returns:
     the wrapped function.
@@ -694,67 +720,272 @@ def named_defun(func, name):
 
     if cache_key not in arguments_to_functions:
       arguments_to_functions[cache_key] = _defun_internal(
-          name, func, args, kwds)
+          name, func, compiled, args, kwds)
     return arguments_to_functions[cache_key](*args)
 
   return decorated
 
 
-def defun(func):
-  """Decorator to compile func into graph_mode.
-
-  `defun` converts a function that constructs a TensorFlow graph into a function
-  that executes the graph. TensorFlow graphs typically execute faster and with a
-  lower memory-footprint than executing each of the operations that make up the
-  function individually as the TensorFlow runtime can optimize the graph and
-  execute sub-operations in parallel.
-
-  func must be a Python function that constructs a TensorFlow graph,
-  typically using functions in the tensorflow module.
-
-  Arguments to func can be either Tensor objects or Python
-  objects. Non-Tensor python objects are treated as constants, and new function
-  definitions are created internally based on their values.
+# TODO(akshayka): Remove the `compiled` flag and create a separate
+# API for xla compilation (`defun` is already complicated enough
+# as it is, and the keyword argument makes 'compiled' an overloaded concept)
+def defun(func=None, compiled=False):
+  """Compiles a Python function into a callable TensorFlow graph.
+
+  `defun` (short for "define function") trace-compiles a Python function
+  composed of TensorFlow operations into a callable that executes a @{tf.Graph}
+  containing those operations. When eager execution is enabled, the ability to
+  create graphs from Python functions makes it possible to incrementally trade
+  off debugability and interactivity for performance.  Functions compiled with
+  `defun` cannot be inspected with `pdb` and `print` statements; however,
+  executing a graph generated by `defun` sometimes takes less time and memory
+  than eagerly executing the corresponding Python function, since specifying
+  computations as graphs allows for optimizations like automatic buffer reuse
+  and parallelization among ops. Note that executing a `defun`-compiled function
+  incurs a small constant overhead, so eagerly executing sufficiently small
+  Python functions might take less time than executing their corresponding
+  `defun`-generated graphs.
+
+  For a Python function to be compatible with `defun`, the values of its keyword
+  arguments cannot be Tensors and all of its arguments, including its keyword
+  arguments, must be hashable Python objects or lists thereof. Additionally, it
+  must return zero or more @{tf.Tensor} objects.
+
+  _Example Usage_
 
-  func must return zero or more `tf.Tensor`.
+  ```python
+  import tensorflow as tf
 
-  Control flow constructs (e.g., `if`, `while`) are not yet compatible with
-  `defun`.
+  tf.enable_eager_execution()
 
-  Example:
-  ```python
+  # A simple example.
   def f(x, y):
     return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)
 
-  @tfe.defun
-  def g(x, y):
-    return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)
+  g = tf.contrib.eager.defun(f)
 
   x = tf.constant([[2.0, 3.0]])
   y = tf.constant([[3.0, -2.0]])
-  # The plain function and defun-compiled function should return the same value.
+
+  # `f` and `g` will return the same value, but `g` will be executed as a
+  # TensorFlow graph.
   assert f(x, y).numpy() == g(x, y).numpy()
 
-  # After the first invocation, the defun-compiled (graph) function runs faster
-  # than the plain function because the defun-compiled function does not involve
-  # Python interpreter overhead during the execution.
-  %time print(f(x, y))
-  %time print(g(x, y))
+  # `defun` is capable of compiling Python functions that close over Python
+  # objects, including Tensors and Variables.
+  @tf.contrib.eager.defun
+  def h():
+    return f(x, y)
+
+  assert h().numpy() == f(x, y)
+
+  # `defun` automatically lifts variables out of the graphs it creates,
+  # allowing you to compile the `call` methods of `tf.keras.layers.Layer` and
+  # `tf.keras.Model` objects.
+  class MyModel(tf.keras.Model):
+
+    def __init__(self, keep_probability=0.2):
+      self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu)
+      self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax)
+      self.keep_probability = keep_probability
+
+    def call(self, inputs, training=True):
+      x = self.dense2(self.dense1(inputs))
+      if training:
+        return tf.nn.dropout(x, self.keep_probability)
+      else:
+        return x
+
+  model = MyModel()
+  model.call = tf.contrib.eager.defun(model.call)
+  model(x, training=True)  # executes a graph, with dropout
+  model(x, training=False) # executes a graph, without dropout
+
+  # `defun`-compiled functions are differentiable.
+  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+  with tf.GradientTape() as tape:
+    outputs = model(inputs)
+  gradient = tape.gradient(outputs, model.trainable_variables)
+  optimizer.apply_gradients((grad, var) for grad, var in zip(gradient,
+                            model.trainable_variables))
   ```
 
+  When using `defun`, there are subtleties regarding inputs, Python control
+  flow, and variable creation that one should be aware of. For concreteness, let
+  `f` be a Python function that returns zero or more @{tf.Tensor} objects and
+  let `F = defun(f)`. `F` builds a graph for each unique input signature it
+  sees, Python control flow is baked into graphs, and operations related to
+  variable initialization are automatically lifted out of the graphs that `F`
+  generates and placed in the eager context if executing eagerly or into an
+  outer graph otherwise.
+
+  _Tracing and Input Signatures_.
+  The signature of inputs supplied to `F` is defined to be a tuple of the shapes
+  and dtypes of Tensor-typed arguments and the values of non-Tensor arguments
+  and keyword arguments. Every time `F` is invoked, the signature of its inputs
+  are inferred. The first time `F(*args, **kwargs)` is invoked with a particular
+  signature, `f(*args, **kwargs)` is executed and all the TensorFlow operations
+  that `f` executes, along with the Tensors that flow between them, are recorded
+  in a TensorFlow graph. `F` caches this graph and binds it to the inputs'
+  signature; every subsequent invocation of `F` with inputs conforming to this
+  signature will immediately retrieve the cached graph and pass it to the
+  TensorFlow runtime for execution.
+
+  Be aware that because `F` only logs TensorFlow operations, all non-TensorFlow
+  operations that `f` executes will only shape the _construction_ of the graphs
+  that `F` executes: They won't be executed when the graphs themselves are
+  executed. For example, whereas the Python function
+
+  ```python
+  import tensorflow as tf
+  import numpy as np
+
+  matrix = tf.eye(5)
+  # `matrix` is assumed to be a Tensor
+  def add_noise():
+    return matrix + np.random.randn(matrix.shape[0], matrix.shape[1])
+  ```
+
+  will return a different output everytime it is invoked, the compiled function
+  `compiled = tf.contrib.eager.defun(add_noise)` will return the same value
+  every time it is called, since a particular random offset generated by NumPy
+  will be inserted into the graph as a TensorFlow constant. The solution is to
+  replace the call to `np.random.randn` with `tf.random_normal(matrix.shape)`.
+
+  _Python Control Flow_.
+  The structure of many machine learning computations depend upon whether one is
+  training or validating, and it is common to nest specialized logic under `if
+  training:` blocks. By mapping each input signature to a unique graph, `defun`
+  lets users transparently compile such code, as the following code snippet
+  demonstrates:
+
+  ```python
+  import tensorflow as tf
+
+  @tf.contrib.eager.defun
+  def lossy_matmul(W, x, training=True):
+    outputs = tf.matmul(W, x)
+    if training:
+      outputs = tf.nn.dropout(outputs, keep_probability=0.2)
+    return outputs
+
+  # Executes a graph that applies dropout.
+  lossy_outputs = lossy_matmul(W, x, training=True)
+
+  # Executes a graph that does not apply dropout.
+  exact_outputs = lossy_matmul(W, x, training=False)
+  ```
+
+  On the other hand, because `defun` generates graphs by tracing and not by
+  source code analysis, it fully unrolls Python `for` and `while` loops,
+  potentially creating large graphs. If your Python function has native loops
+  that run for many iterations, consider replacing them with @{tf.while_loop}
+  operations.
+
+  When constructing graphs, @{tf.Tensor} objects cannot be used as Python
+  `bool` objects. This means, for example, that you should replace code in `f`
+  resembling
+
+  ```python
+
+  if tensor < 10:
+    true_fn()
+  else:
+    false_fn()
+  ```
+
+  with `tf.cond(tensor < 10, true_fn, false_fn)`.
+
+  _Variables_
+  TensorFlow operations related to variable creation and initialization are
+  automatically lifted out of the graphs generated by `defun`. In practice, this
+  implies that variable creation and initialization only happen the first time
+  `F` is called, and that variables are reused every time thereafter. Many
+  TensorFlow APIs, like @{tf.keras.layers.Layer} objects, create variables the
+  first time they are called and reuse them thereafter. Automatic variable
+  lifting makes it possible to compile these APIs without extra effort, at the
+  cost of introducing a discrepancy between the semantics of executing Python
+  functions and their corresponding compiled functions. For example:
+
+  ```python
+  import tensorflow as tf
+
+  tf.enable_eager_execution()
+
+  def fn():
+    x = tf.contrib.eager.Variable(0.0)
+    x.assign_add(1.0)
+    return x.read_value()
+
+  # `fn` is a Python function, so x is created, initialized, and destroyed upon
+  # every invocation
+  assert(fn().numpy() == fn().numpy() == 1.0)
+
+  compiled = tf.contrib.eager.defun(fn)
+
+  # Compiling `fn` with `defun` hoists all variables outside of the generated
+  # graph, so initialization happens exactly once.
+  assert(compiled().numpy() == 1.0)
+  assert(compiled().numpy() == 2.0)
+  ```
+
+  Finally, because each input signature is bound to a unique graph, if your
+  Python function constructs `tf.contrib.eager.Variable` objects, then each
+  graph constructed for that Python function will reference a unique set of
+  variables. To circumvent this problem, we recommend against compiling Python
+  functions that create `tf.contrib.eager.Variable` objects. Instead, Python
+  functions should either lexically close over `tf.contrib.eager.Variable`
+  objects or accept them as arguments, preferably encapsulated in an
+  object-oriented container. If you must create variables inside your Python
+  function and you want each graph generated for it to reference the same set of
+  variables, add logic to your Python function that ensures that variables are
+  only created the first time it is called and are reused for every subsequent
+  invocation; note that this is precisely what @{tf.keras.layers.Layer} objects
+  do, so we recommend using them to represent variable-bearing computations
+  whenever possible.
+
   Args:
-    func: function to be compiled.
+    func: function to be compiled. If `func` is None, returns a
+      decorator that can be invoked with a single argument - `func`. The
+      end result is equivalent to providing all the arguments up front.
+      In other words, defun(compiled=True)(func) is equivalent to
+      defun(func, compiled=True). The former allows the following use case:
+        @tf.contrib.eager.defun(compiled=True)
+        def foo(...):
+          ...
+
+    compiled: If True, an attempt to compile `func` with XLA will be made.
+      If it fails, function will be run normally. Experimental.  Currently
+      supported only for execution on TPUs. For the vast majority of users,
+      this argument should be False.
 
   Returns:
-     A callable that will execute the compiled function (and return zero
-     or more `tf.Tensor` objects).
+     If `func` is not None, returns a callable that will execute the compiled
+     function (and return zero or more `tf.Tensor` objects).
+     If `func` is None, returns a decorator that, when invoked with a single
+     `func` argument, returns a callable equivalent to the case above.
   """
   # TODO(apassos): deal with captured global state. Deal with control flow.
-  try:
-    name = func.__name__
-  except AttributeError:
-    name = "function"
-  return tf_decorator.make_decorator(func, named_defun(func, name))
+  def decorated(function):
+    try:
+      name = function.__name__
+    except AttributeError:
+      name = "function"
+    return tf_decorator.make_decorator(
+        function, named_defun(function, name, compiled=compiled))
+
+  # This code path is for the `foo = tfe.defun(foo, ...)` use case
+  if func is not None:
+    return decorated(func)
+
+  # This code path is for the
+  #
+  # @tfe.defun(...)
+  # def foo(...):
+  #    ...
+  #
+  # use case, which is equivalent to `foo = tfe.defun(...)(foo)`
+  return decorated
 
 
 def make_defun_op(func, *args, **kwds):
@@ -806,7 +1037,7 @@ def make_defun_op(func, *args, **kwds):
   name = func.__name__
   if any(isinstance(x, ops.EagerTensor) for x in kwds.values()):
     raise ValueError("Tensor keyword arguments are not supported.")
-  return _defun_internal(name, func, args, kwds)
+  return _defun_internal(name, func, False, args, kwds)
 
 
 class AutomaticControlDependencies(object):
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 185f6d981cb36a277c9e63f1195501c1f86409d0..f53d6c26083cad8efd291a064393561c4bebfcfb 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -771,6 +771,21 @@ class AutomaticControlDependenciesTest(test.TestCase):
       self.assertAllEqual(val.eval(feed_dict={p: False}), 10.0)
       self.assertAllEqual(val.eval(feed_dict={p: True}), 20.0)
 
+  def testDefunWhileLoopWithCapturedLoopVars(self):
+    n = 3
+    x = constant_op.constant(list(range(n)))
+
+    @function.defun
+    def loop():
+      c = lambda i, x: i < n
+      b = lambda i, x: (i + 1, x + 1)
+      i, out = control_flow_ops.while_loop(c, b, (0, x))
+      return i, out
+
+    i, out = loop()
+    self.assertEqual(int(i), 3)
+    self.assertAllEqual(out, [3, 4, 5])
+
   def testDecorator(self):
     with context.graph_mode(), self.test_session():
       v = resource_variable_ops.ResourceVariable(1.0)
diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
deleted file mode 100644
index 9afab0077b666b36d77abea5a7d8c444b6400812..0000000000000000000000000000000000000000
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ /dev/null
@@ -1,1047 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/python/eager/python_eager_op_gen.h"
-
-#include <stdio.h>
-#include <sstream>
-#include <unordered_map>
-#include "tensorflow/core/framework/api_def.pb.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_def.pb_text.h"
-#include "tensorflow/core/framework/op_def.pb.h"
-#include "tensorflow/core/framework/op_def_util.h"
-#include "tensorflow/core/framework/op_gen_lib.h"
-#include "tensorflow/core/framework/tensor.pb_text.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/gtl/stl_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/python/framework/python_op_gen_internal.h"
-
-namespace tensorflow {
-namespace {
-
-const int kRightMargin = 78;
-
-constexpr char kEagerFallbackSuffix[] = "_eager_fallback";
-
-string AttrVarName(const string& attr_name,
-                   std::unordered_map<string, string>* attr_expressions) {
-  const string var = strings::StrCat("_attr_", attr_name);
-  if (attr_expressions != nullptr) (*attr_expressions)[attr_name] = var;
-  return var;
-}
-
-void AddInferredAttr(const string& indentation, const string& attr_name,
-                     const string& value_expression, string* result,
-                     std::unordered_map<string, string>* attr_expressions) {
-  strings::StrAppend(result, indentation,
-                     AttrVarName(attr_name, attr_expressions), " = ",
-                     value_expression, "\n");
-}
-
-string VectorToTuple(const std::vector<string>& l) {
-  if (l.size() == 1) return strings::StrCat("(", l.front(), ",)");
-  string ret = "(";
-  for (int i = 0; i < l.size(); ++i) {
-    if (i > 0) {
-      strings::StrAppend(&ret, ", ");
-    }
-    strings::StrAppend(&ret, l[i]);
-  }
-  strings::StrAppend(&ret, ")");
-  return ret;
-}
-
-void Unflatten(const string& prefix, const std::vector<string>& output_sizes,
-               const string& var, string* result) {
-  for (int i = 0; i < output_sizes.size(); ++i) {
-    if (!output_sizes[i].empty()) {
-      strings::StrAppend(result, prefix, var, " = ");
-      if (i > 0) strings::StrAppend(result, var, "[:", i, "] + ");
-      if (i + 1 < output_sizes.size()) {
-        // Special case i == 0 to avoid "0 +" in the generated code.
-        if (i == 0) {
-          strings::StrAppend(result, "[", var, "[:", output_sizes[i], "]] + ",
-                             var, "[", output_sizes[i], ":]");
-        } else {
-          strings::StrAppend(result, "[", var, "[", i, ":", i, " + ",
-                             output_sizes[i], "]] + ", var, "[", i, " + ",
-                             output_sizes[i], ":]");
-        }
-      } else {
-        strings::StrAppend(result, "[", var, "[", i, ":]]");
-      }
-      strings::StrAppend(result, "\n");
-    }
-  }
-}
-
-string TensorPBString(const TensorProto& pb) {
-  // Note: This gets used in the argument list, and so must survive naive
-  // word wrapping.
-  return strings::StrCat("\"\"\"", ProtoShortDebugString(pb), "\"\"\"");
-}
-
-const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
-  for (int i = 0; i < api_def.in_arg_size(); ++i) {
-    if (api_def.in_arg(i).name() == name) {
-      return &api_def.in_arg(i);
-    }
-  }
-  return nullptr;
-}
-
-class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
- public:
-  GenEagerPythonOp(const OpDef& op_def, const ApiDef& api_def,
-                   const string& function_name)
-      : python_op_gen_internal::GenPythonOp(op_def, api_def, function_name) {
-    op_name_ = function_name_;
-    str_util::ConsumePrefix(&op_name_, "_");
-  }
-  ~GenEagerPythonOp() override {}
-
-  string Code() override;
-
- protected:
-  void HandleGraphMode(const string& function_setup);
-
-  string GetEagerNotAllowedError();
-  void ExpectListArg(const string& indentation, const string& arg_name,
-                     string* output);
-  bool GetEagerFunctionSetup(const string& indentation, string* function_setup);
-  void GetOutputSizesAndNumOutputsExpr(std::vector<string>* output_sizes,
-                                       string* num_outputs_expr);
-
-  void AddEagerFunctionTeardown(const string& indentation,
-                                const std::vector<string>& output_sizes,
-                                bool execute_record_gradient);
-
-  bool AddEagerFastPathAndGraphCode(const string& parameters,
-                                    const std::vector<string>& output_sizes,
-                                    const string& eager_not_allowed_error);
-  bool AddEagerFallbackCode(const string& parameters,
-                            const std::vector<string>& output_sizes,
-                            const string& num_outputs_expr,
-                            const string& eager_not_allowed_error);
-  void AddEagerFastPathExecute();
-
-  void AddEagerInferredAttrs(const string& indentation);
-  void AddEagerInputCasts(const string& indentation);
-  void AddEagerAttrs(const string& indentation);
-  void AddEagerExecute(const string& indentation,
-                       const string& num_outputs_expr);
-
-  void AddAttrForArg(const string& attr, int arg_index) {
-    gtl::InsertIfNotPresent(&inferred_attrs_, attr,
-                            op_def_.input_arg(arg_index).name());
-    auto iter = attr_to_args_.find(attr);
-    if (iter == attr_to_args_.end()) {
-      attr_to_args_.insert(AttrToArgMap::value_type(attr, {arg_index}));
-    } else {
-      iter->second.push_back(arg_index);
-    }
-  }
-
-  // Returns a string expression representing a flattened list of all
-  // the inputs given by `*input_indices` (or all inputs if
-  // `input_indices` is nullptr).  `*output_sizes` can be used to unflatten.
-  string FlattenInputs(const std::vector<int>* input_indices,
-                       std::vector<string>* output_sizes) const;
-
-  StringPiece op_name_;
-  typedef std::unordered_map<string, std::vector<int>> AttrToArgMap;
-  AttrToArgMap attr_to_args_;
-  std::unordered_map<string, string> attr_expressions_;
-  // This has all the input args followed by those attrs that don't have
-  // defaults.
-  std::vector<python_op_gen_internal::ParamNames> params_no_default_;
-  // The parameters with defaults (these have to be listed after those without).
-  // No input args are included, just attrs.
-  std::vector<std::pair<python_op_gen_internal::ParamNames, string>>
-      params_with_default_;
-};
-
-string GetEagerPythonOp(const OpDef& op_def, const ApiDef& api_def,
-                        const string& function_name) {
-  return GenEagerPythonOp(op_def, api_def, function_name).Code();
-}
-
-string GenEagerPythonOp::FlattenInputs(
-    const std::vector<int>* input_indices,
-    std::vector<string>* output_sizes) const {
-  string inputs;
-  enum { STARTING, WAS_LIST_INPUT, WAS_SOLO_INPUT } inputs_state = STARTING;
-  const int n = input_indices != nullptr ? input_indices->size()
-                                         : op_def_.input_arg_size();
-  for (int j = 0; j < n; ++j) {
-    const int i = input_indices ? (*input_indices)[j] : j;
-    const auto& arg(op_def_.input_arg(i));
-    const bool is_list =
-        !arg.type_list_attr().empty() || !arg.number_attr().empty();
-    if (is_list) {
-      if (inputs_state == WAS_SOLO_INPUT) {
-        strings::StrAppend(&inputs, "] + ");
-      } else if (inputs_state == WAS_LIST_INPUT) {
-        strings::StrAppend(&inputs, " + ");
-      }
-      strings::StrAppend(&inputs, "list(", param_names_[i].GetRenameTo(), ")");
-      inputs_state = WAS_LIST_INPUT;
-      if (output_sizes != nullptr) {
-        if (!arg.number_attr().empty()) {
-          output_sizes->emplace_back(AttrVarName(arg.number_attr(), nullptr));
-        } else {
-          output_sizes->emplace_back(
-              strings::StrCat("len(", param_names_[i].GetRenameTo(), ")"));
-        }
-      }
-    } else {
-      if (inputs_state == WAS_SOLO_INPUT) {
-        strings::StrAppend(&inputs, ", ");
-      } else if (inputs_state == WAS_LIST_INPUT) {
-        strings::StrAppend(&inputs, " + [");
-      } else {
-        strings::StrAppend(&inputs, "[");
-      }
-      strings::StrAppend(&inputs, param_names_[i].GetRenameTo());
-      inputs_state = WAS_SOLO_INPUT;
-      if (output_sizes != nullptr) output_sizes->emplace_back();
-    }
-  }
-  if (inputs_state == STARTING) return "[]";
-  if (inputs_state == WAS_SOLO_INPUT) {
-    strings::StrAppend(&inputs, "]");
-  }
-  return inputs;
-}
-
-string GenEagerPythonOp::Code() {
-  if (api_def_.visibility() == ApiDef::SKIP) {
-    return "";
-  }
-
-  for (int i = 0; i < api_def_.arg_order_size(); ++i) {
-    const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
-    const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
-    params_no_default_.emplace_back(api_def_arg.name(),
-                                    api_def_arg.rename_to());
-    if (!arg.type_attr().empty()) {
-      AddAttrForArg(arg.type_attr(), i);
-    } else if (!arg.type_list_attr().empty()) {
-      AddAttrForArg(arg.type_list_attr(), i);
-    }
-    if (!arg.number_attr().empty()) {
-      AddAttrForArg(arg.number_attr(), i);
-    }
-  }
-  for (int i = 0; i < op_def_.attr_size(); ++i) {
-    const auto& attr(op_def_.attr(i));
-    const auto& api_def_attr(api_def_.attr(i));
-    // Do not add inferred attrs to the Python function signature.
-    if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
-      if (api_def_attr.has_default_value()) {
-        if (attr.type() == "tensor") {
-          params_with_default_.emplace_back(
-              python_op_gen_internal::ParamNames(api_def_attr.name(),
-                                                 api_def_attr.rename_to()),
-              strings::StrCat(
-                  "_execute.make_tensor(",
-                  TensorPBString(api_def_attr.default_value().tensor()), ", \"",
-                  api_def_attr.rename_to(), "\")"));
-        } else if (attr.type() == "list(tensor)") {
-          std::vector<string> pbtxt;
-          for (const auto& pb : api_def_attr.default_value().list().tensor()) {
-            pbtxt.emplace_back(TensorPBString(pb));
-          }
-          params_with_default_.emplace_back(
-              python_op_gen_internal::ParamNames(api_def_attr.name(),
-                                                 api_def_attr.rename_to()),
-              strings::StrCat("[_execute.make_tensor(_pb, \"",
-                              api_def_attr.rename_to(), "\") for _pb in ",
-                              VectorToTuple(pbtxt), "]"));
-        } else {
-          params_with_default_.emplace_back(
-              python_op_gen_internal::ParamNames(api_def_attr.name(),
-                                                 api_def_attr.rename_to()),
-              python_op_gen_internal::AttrValueToPython(
-                  attr.type(), api_def_attr.default_value(), "_dtypes."));
-        }
-      } else {
-        params_no_default_.emplace_back(api_def_attr.name(),
-                                        api_def_attr.rename_to());
-      }
-    }
-  }
-
-  // Save the list of attr parameters (attrs that won't be inferred),
-  // those with defaults go at the end.
-  // Get the attrs in the order we want by taking the attrs without defaults
-  // from the end of params_no_default_, and adding params_no_default_.
-  attrs_.reserve(params_no_default_.size() - op_def_.input_arg_size() +
-                 params_with_default_.size());
-  for (int i = op_def_.input_arg_size(); i < params_no_default_.size(); ++i) {
-    attrs_.push_back(params_no_default_[i].GetName());
-  }
-  for (const auto& p : params_with_default_) {
-    attrs_.push_back(p.first.GetName());
-  }
-
-  param_names_.reserve(params_no_default_.size() + params_with_default_.size());
-  param_names_.insert(param_names_.begin(), params_no_default_.begin(),
-                      params_no_default_.end());
-  for (const auto& param_and_default : params_with_default_) {
-    param_names_.push_back(param_and_default.first);
-  }
-
-  string parameters;
-  for (const auto& param : params_no_default_) {
-    if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
-    strings::StrAppend(&parameters, param.GetRenameTo());
-  }
-  for (const auto& param_and_default : params_with_default_) {
-    if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
-    strings::StrAppend(&parameters, param_and_default.first.GetRenameTo(), "=",
-                       param_and_default.second);
-  }
-  if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
-  strings::StrAppend(&parameters, "name=None");
-
-  // Add attr_expressions_ for attrs that are params.
-  for (int i = 0; i < attrs_.size(); ++i) {
-    const string& attr_name = attrs_[i];
-    const string& attr_api_name =
-        param_names_[i + op_def_.input_arg_size()].GetRenameTo();
-    attr_expressions_[attr_name] = attr_api_name;
-  }
-  // Add attr_expressions_ for attrs that are inferred.
-  for (int i = 0; i < op_def_.attr_size(); ++i) {
-    const auto& attr(op_def_.attr(i));
-    if (attr.type() == "int") {
-      auto arg_list = attr_to_args_.find(attr.name());
-      if (arg_list != attr_to_args_.end()) {
-        AttrVarName(attr.name(), &attr_expressions_);
-      }
-    }
-  }
-
-  string num_outputs_expr;
-  std::vector<string> output_sizes(num_outs_);
-  GetOutputSizesAndNumOutputsExpr(&output_sizes, &num_outputs_expr);
-
-  string eager_not_allowed_error = GetEagerNotAllowedError();
-
-  if (!AddEagerFastPathAndGraphCode(parameters, output_sizes,
-                                    eager_not_allowed_error)) {
-    return result_;
-  }
-
-  if (!AddEagerFallbackCode(parameters, output_sizes, num_outputs_expr,
-                            eager_not_allowed_error)) {
-    return result_;
-  }
-
-  return prelude_ + result_;
-}
-
-void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
-  // Handle graph-mode case
-  strings::StrAppend(&result_,
-                     "  _ctx = _context._context\n"
-                     "  if _ctx is None or not _ctx._eager_context.is_eager:\n",
-                     function_setup,
-                     "    _, _, _op = _op_def_lib._apply_op_helper(\n");
-  AddBodyNoReturn("        ");
-  if (num_outs_ > 0) {
-    strings::StrAppend(&result_, "    _result = _op.outputs[:]\n");
-    // Special case handling for stateful op with single list output
-    // that might be empty.
-    if (num_outs_ == 1 && op_def_.is_stateful() &&
-        (!op_def_.output_arg(0).number_attr().empty() ||
-         !op_def_.output_arg(0).type_list_attr().empty())) {
-      // TODO(josh11b): Can skip this if the number_attr/type_list_attr has
-      // a constraint indicating that this can never be empty.
-      strings::StrAppend(&result_,
-                         "    if not _result:\n"
-                         "      return _op\n");
-    }
-    strings::StrAppend(&result_, "    _inputs_flat = _op.inputs\n");
-
-    // Compute graph-mode attrs.
-    if (op_def_.attr_size() > 0) {
-      string attr_values;
-      for (int i = 0; i < op_def_.attr_size(); ++i) {
-        if (i > 0) strings::StrAppend(&attr_values, ", ");
-        const auto& attr_name(op_def_.attr(i).name());
-        strings::StrAppend(&attr_values, "\"", attr_name, "\", _op.get_attr(\"",
-                           attr_name, "\")");
-      }
-      strings::StrAppend(&attr_values, ")");
-      strings::StrAppend(&result_,
-                         WordWrap("    _attrs = (", attr_values, kRightMargin),
-                         "\n");
-    } else {
-      strings::StrAppend(&result_, "    _attrs = None\n");
-    }
-  } else {
-    strings::StrAppend(&result_, "    return _op\n");
-  }
-}
-
-string GenEagerPythonOp::GetEagerNotAllowedError() {
-  bool eager_allowed = true;
-  string ref_arg;
-  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
-    const auto& arg = op_def_.input_arg(i);
-    if (arg.is_ref()) {
-      eager_allowed = false;
-      DCHECK_EQ(op_def_.input_arg(i).name(), api_def_.in_arg(i).name());
-      ref_arg = api_def_.in_arg(i).rename_to();
-    }
-  }
-  for (int i = 0; i < op_def_.output_arg_size(); ++i) {
-    const auto& arg = op_def_.output_arg(i);
-    if (arg.is_ref()) {
-      eager_allowed = false;
-      DCHECK_EQ(op_def_.output_arg(i).name(), api_def_.out_arg(i).name());
-      ref_arg = api_def_.out_arg(i).rename_to();
-    }
-  }
-
-  if (eager_allowed) return "";
-
-  return strings::StrCat("raise RuntimeError(\"", op_name_,
-                         " op does not support eager execution. ", "Arg '",
-                         ref_arg, "' is a ref.\")\n");
-}
-
-void GenEagerPythonOp::ExpectListArg(const string& indentation,
-                                     const string& arg_name, string* output) {
-  strings::StrAppend(output, indentation, "if not isinstance(", arg_name,
-                     ", (list, tuple)):\n", indentation, "  raise TypeError(\n",
-                     indentation, "      \"Expected list for '", arg_name,
-                     "' argument to \"\n", indentation, "      \"'", op_name_,
-                     "' Op, not %r.\" % ", arg_name, ")\n");
-}
-
-bool GenEagerPythonOp::GetEagerFunctionSetup(const string& indentation,
-                                             string* function_setup) {
-  // Validate list inputs, infer length attrs.
-  for (int i = 0; i < op_def_.attr_size(); ++i) {
-    const auto& attr(op_def_.attr(i));
-    if (attr.type() == "int") {
-      auto arg_list = attr_to_args_.find(attr.name());
-      if (arg_list != attr_to_args_.end()) {
-        // Inferred int attrs are the lengths of inputs. Validate those
-        // inputs are lists and have the same length.
-        for (auto iter = arg_list->second.begin();
-             iter != arg_list->second.end(); ++iter) {
-          const string& arg_api_name = param_names_[*iter].GetRenameTo();
-          ExpectListArg(indentation, arg_api_name, function_setup);
-          if (iter == arg_list->second.begin()) {
-            AddInferredAttr(indentation, attr.name(),
-                            strings::StrCat("len(", arg_api_name, ")"),
-                            function_setup, &attr_expressions_);
-          } else {
-            const auto& attr_var = attr_expressions_[attr.name()];
-            strings::StrAppend(
-                function_setup, indentation, "if len(", arg_api_name,
-                ") != ", attr_var, ":\n", indentation, "  raise ValueError(\n",
-                indentation, "      \"List argument '", arg_api_name, "' to '",
-                op_name_, "' Op with length %d \"\n", indentation,
-                "      \"must match length %d of argument '",
-                inferred_attrs_[attr.name()], "'.\" %\n", indentation,
-                "      (len(", arg_api_name, "), ", attr_var, "))\n");
-          }
-        }
-      }
-    }
-  }
-
-  for (int i = 0; i < attrs_.size(); ++i) {
-    const string& attr_name = attrs_[i];
-    const auto& param = param_names_[i + op_def_.input_arg_size()];
-    const auto& attr = *FindAttr(attr_name, op_def_);
-    const string& attr_api_name = param.GetRenameTo();
-    StringPiece attr_type = attr.type();
-    attr_expressions_[attr_name] = attr_api_name;
-    const int default_index = i - (attrs_.size() - params_with_default_.size());
-    if (default_index >= 0) {
-      const string& default_value = params_with_default_[default_index].second;
-      strings::StrAppend(function_setup, indentation, "if ", attr_api_name,
-                         " is None:\n");
-      strings::StrAppend(function_setup, indentation, "  ", attr_api_name,
-                         " = ", default_value, "\n");
-    }
-    if (str_util::StartsWith(attr_type, "list(")) {
-      ExpectListArg(indentation, attr_api_name, function_setup);
-    }
-
-    if (attr_type == "string") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = _execute.make_str(", attr_api_name, ", \"",
-                         attr_api_name, "\")\n");
-    } else if (attr_type == "list(string)") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = [_execute.make_str(_s, \"", attr_api_name,
-                         "\") for _s in ", attr_api_name, "]\n");
-    } else if (attr_type == "int") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = _execute.make_int(", attr_api_name, ", \"",
-                         attr_api_name, "\")\n");
-    } else if (attr_type == "list(int)") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = [_execute.make_int(_i, \"", attr_api_name,
-                         "\") for _i in ", attr_api_name, "]\n");
-    } else if (attr_type == "float") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = _execute.make_float(", attr_api_name, ", \"",
-                         attr_api_name, "\")\n");
-    } else if (attr_type == "list(float)") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = [_execute.make_float(_f, \"", attr_api_name,
-                         "\") for _f in ", attr_api_name, "]\n");
-    } else if (attr_type == "bool") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = _execute.make_bool(", attr_api_name, ", \"",
-                         attr_api_name, "\")\n");
-    } else if (attr_type == "list(bool)") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = [_execute.make_bool(_b, \"", attr_api_name,
-                         "\") for _b in ", attr_api_name, "]\n");
-    } else if (attr_type == "type") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = _execute.make_type(", attr_api_name, ", \"",
-                         attr_api_name, "\")\n");
-    } else if (attr_type == "list(type)") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = [_execute.make_type(_t, \"", attr_api_name,
-                         "\") for _t in ", attr_api_name, "]\n");
-    } else if (attr_type == "shape") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = _execute.make_shape(", attr_api_name, ", \"",
-                         attr_api_name, "\")\n");
-    } else if (attr_type == "list(shape)") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = [_execute.make_shape(_s, \"", attr_api_name,
-                         "\") for _s in ", attr_api_name, "]\n");
-    } else if (attr_type == "tensor") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = _execute.make_tensor(", attr_api_name, ", \"",
-                         attr_api_name, "\")\n");
-    } else if (attr_type == "list(tensor)") {
-      strings::StrAppend(function_setup, indentation, attr_api_name,
-                         " = [_execute.make_tensor(_t, \"", attr_api_name,
-                         "\") for _t in ", attr_api_name, "]\n");
-    } else if (attr_type != "func") {
-      *function_setup =
-          strings::StrCat("# No definition for ", function_name_,
-                          " since we don't support attrs with type\n"
-                          "# '",
-                          attr_type, "' right now.\n\n");
-      return false;
-    }
-  }
-  return true;
-}
-
-// If output i is list output, output_sizes[i] will be set to a
-// string with the python expression that will evaluate to its
-// length. output_sizes[i] is empty for non-list outputs.
-void GenEagerPythonOp::GetOutputSizesAndNumOutputsExpr(
-    std::vector<string>* output_sizes, string* num_outputs_expr) {
-  // Expression representing the number of outputs.
-  int num_fixed_outputs = 0;
-  for (int i = 0; i < num_outs_; ++i) {
-    const auto& arg(op_def_.output_arg(i));
-    if (!arg.number_attr().empty()) {
-      if (!num_outputs_expr->empty()) {
-        strings::StrAppend(num_outputs_expr, " + ");
-      }
-      (*output_sizes)[i] = attr_expressions_[arg.number_attr()];
-      strings::StrAppend(num_outputs_expr, (*output_sizes)[i]);
-    } else if (!arg.type_list_attr().empty()) {
-      if (!num_outputs_expr->empty()) {
-        strings::StrAppend(num_outputs_expr, " + ");
-      }
-      // Have to be careful to use an expression that works in both
-      // graph and eager paths here.
-      const auto iter = inferred_attrs_.find(arg.type_list_attr());
-      if (iter == inferred_attrs_.end()) {
-        (*output_sizes)[i] = strings::StrCat(
-            "len(", attr_expressions_[arg.type_list_attr()], ")");
-      } else {
-        (*output_sizes)[i] = strings::StrCat("len(", iter->second, ")");
-      }
-      strings::StrAppend(num_outputs_expr, (*output_sizes)[i]);
-    } else {
-      ++num_fixed_outputs;
-    }
-  }
-  if (num_fixed_outputs > 0) {
-    if (!num_outputs_expr->empty()) {
-      strings::StrAppend(num_outputs_expr, " + ");
-    }
-    strings::StrAppend(num_outputs_expr, num_fixed_outputs);
-  } else if (num_outputs_expr->empty()) {
-    *num_outputs_expr = "0";
-  }
-}
-
-void GenEagerPythonOp::AddEagerFunctionTeardown(
-    const string& indentation, const std::vector<string>& output_sizes,
-    bool execute_record_gradient) {
-  if (num_outs_ > 0) {
-    if (execute_record_gradient) {
-      strings::StrAppend(&result_, indentation, "_execute.record_gradient(\n",
-                         "      \"", op_def_.name(),
-                         "\", _inputs_flat, _attrs, _result, name)\n");
-    }
-    if (num_outs_ == 1 && !output_sizes[0].empty()) {
-      // Single list result.
-    } else if (num_outs_ == 1) {
-      // Execute returns a single-element list which we need to destructure.
-      strings::StrAppend(&result_, indentation, "_result, = _result\n");
-    } else {
-      // Have multiple outputs, so we will need to reformat the return
-      // value of execute() to be a list with one entry per op output
-      // (that entry will be a list of tensors if that output is of list
-      // type).
-      // For list outputs, convert the right subrange of _result into a list.
-      Unflatten(indentation, output_sizes, "_result", &result_);
-      // Convert to a named tuple.
-      strings::StrAppend(&result_, indentation, "_result = _", op_def_.name(),
-                         "Output._make(_result)\n");
-    }
-  } else {
-    strings::StrAppend(&result_, indentation, "_result = None\n");
-  }
-  strings::StrAppend(&result_, indentation, "return _result\n\n");
-}
-
-bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
-    const string& parameters, const std::vector<string>& output_sizes,
-    const string& eager_not_allowed_error) {
-  AddExport();
-  AddDefLine(function_name_, parameters);
-  AddDocStringDescription();
-  AddDocStringArgs();
-  AddDocStringInputs();
-  AddDocStringAttrs();
-  AddDocStringNameArg();
-  AddOutputGlobals();  // Added to prelude_
-  AddDocStringOutputs();
-  strings::StrAppend(&result_, "  \"\"\"\n");
-
-  // Handle graph-mode case
-  string function_setup;
-  if (!GetEagerFunctionSetup("    ", &function_setup)) {
-    result_ = function_setup;
-    return false;
-  }
-  HandleGraphMode(function_setup);
-  AddEagerFunctionTeardown("    ", output_sizes,
-                           true /* execute_record_gradient */);
-
-  // Handle eager-mode case
-  strings::StrAppend(&result_, "  else:\n");
-
-  if (eager_not_allowed_error.empty()) {
-    AddEagerFastPathExecute();
-  } else {
-    strings::StrAppend(&result_, "    ", eager_not_allowed_error);
-  }
-
-  strings::StrAppend(&result_, "\n\n");
-  return true;
-}
-
-bool GenEagerPythonOp::AddEagerFallbackCode(
-    const string& parameters, const std::vector<string>& output_sizes,
-    const string& num_outputs_expr, const string& eager_not_allowed_error) {
-  if (!eager_not_allowed_error.empty()) {
-    strings::StrAppend(&result_, "  ", eager_not_allowed_error);
-    return true;
-  }
-
-  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix),
-             strings::StrCat(parameters, ", ctx=None"));
-  strings::StrAppend(
-      &result_, "  r\"\"\"This is the slowpath function for Eager mode.\n");
-  strings::StrAppend(&result_, "  This is for function ", function_name_,
-                     "\n  \"\"\"\n");
-
-  strings::StrAppend(&result_, "  _ctx = ctx if ctx else _context.context()\n");
-
-  string function_setup;
-  if (!GetEagerFunctionSetup("  ", &function_setup)) {
-    result_ = function_setup;
-    return false;
-  }
-  strings::StrAppend(&result_, function_setup);
-
-  AddEagerInferredAttrs("  ");
-  AddEagerInputCasts("  ");
-  strings::StrAppend(
-      &result_, "  _inputs_flat = ", FlattenInputs(nullptr, nullptr), "\n");
-  AddEagerAttrs("  ");
-  AddEagerExecute("  ", num_outputs_expr);
-
-  AddEagerFunctionTeardown("  ", output_sizes,
-                           true /* execute_record_gradient */);
-
-  return true;
-}
-
-void GenEagerPythonOp::AddEagerFastPathExecute() {
-  string fastpath_execute_params = strings::StrCat(
-      "_ctx._context_handle, _ctx._eager_context.device_name, \"",
-      op_def_.name(), "\", ", "name, _ctx._post_execution_callbacks");
-  string fallback_params;
-
-  for (int i = 0; i < api_def_.in_arg_size(); i++) {
-    const string param_name = param_names_[i].GetRenameTo();
-    strings::StrAppend(&fastpath_execute_params, ", ", param_name);
-    if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
-    strings::StrAppend(&fallback_params, param_name);
-  }
-
-  for (const auto& attr : api_def_.attr()) {
-    if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
-      strings::StrAppend(&fastpath_execute_params, ", \"", attr.name(), "\", ",
-                         attr.rename_to());
-
-      if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
-      strings::StrAppend(&fallback_params, attr.rename_to(), "=",
-                         attr.rename_to());
-    }
-  }
-
-  if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
-  strings::StrAppend(&fallback_params, "name=name");
-
-  strings::StrAppend(&result_, "    try:\n");
-  strings::StrAppend(
-      &result_, "      ",
-      "_result = _pywrap_tensorflow.TFE_Py_FastPathExecute(\n",
-      WordWrap(strings::StrCat("        "),
-               strings::StrCat(fastpath_execute_params, ")"), kRightMargin),
-      "\n");
-
-  if (op_def_.output_arg_size() > 1) {
-    const string output_tuple_name =
-        strings::StrCat("_", op_def_.name(), "Output");
-    strings::StrAppend(&result_, "      ", "_result = ", output_tuple_name,
-                       "._make(_result)\n");
-  }
-  strings::StrAppend(&result_, "      ", "return _result\n");
-
-  // Handle fallback.
-  if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
-  strings::StrAppend(&fallback_params, "ctx=_ctx");
-  strings::StrAppend(&result_, "    ", "except _core._FallbackException:\n");
-  strings::StrAppend(
-      &result_, "      ", "return ", function_name_, kEagerFallbackSuffix,
-      "(\n",
-      WordWrap(strings::StrCat("          "),
-               strings::StrCat(fallback_params, ")"), kRightMargin),
-      "\n");
-
-  // Any errors thrown from execute need to be unwrapped from
-  // _NotOkStatusException.
-  strings::StrAppend(&result_, "    ",
-                     "except _core._NotOkStatusException as e:\n");
-  strings::StrAppend(&result_, "      ", "if name is not None:\n");
-  strings::StrAppend(&result_, "        ",
-                     "message = e.message + \" name: \" + name\n");
-  strings::StrAppend(&result_, "      ", "else:\n");
-  strings::StrAppend(&result_, "        ", "message = e.message\n");
-  strings::StrAppend(
-      &result_, "      ",
-      "_six.raise_from(_core._status_to_exception(e.code, message), None)\n");
-}
-
-void GenEagerPythonOp::AddEagerInferredAttrs(const string& indentation) {
-  // Figure out values for inferred attrs, and cast to eager tensors.
-  for (int i = 0; i < op_def_.attr_size(); ++i) {
-    const auto& attr(op_def_.attr(i));
-    const auto& api_def_attr(api_def_.attr(i));
-    auto arg_list = attr_to_args_.find(attr.name());
-    if (arg_list != attr_to_args_.end()) {
-      if (attr.type() == "type") {
-        std::vector<string> output_sizes;
-        const string flattened =
-            FlattenInputs(&arg_list->second, &output_sizes);
-        string conversion = strings::StrCat("_execute.args_to_matching_eager(",
-                                            flattened, ", _ctx");
-        if (attr.has_default_value()) {
-          strings::StrAppend(
-              &conversion, ", ",
-              python_op_gen_internal::AttrValueToPython(
-                  attr.type(), api_def_attr.default_value(), "_dtypes."));
-        }
-        strings::StrAppend(&conversion, ")");
-        const string var_name = AttrVarName(attr.name(), &attr_expressions_);
-        if (output_sizes.size() == 1) {
-          // Avoid creating a temporary variable in the case where
-          // we can easily assign to the right value directly.
-          const string inputs_var =
-              param_names_[arg_list->second.front()].GetRenameTo();
-          if (output_sizes.front().empty()) {
-            strings::StrAppend(&result_, indentation, var_name, ", (",
-                               inputs_var, ",) = ", conversion, "\n");
-          } else {
-            strings::StrAppend(&result_, indentation, var_name, ", ",
-                               inputs_var, " = ", conversion, "\n");
-          }
-        } else {
-          const string inputs_var = strings::StrCat("_inputs_", attr.name());
-          strings::StrAppend(&result_, indentation, var_name, ", ", inputs_var,
-                             " = ", conversion, "\n");
-          // Convert from a flat list of eager tensors back to the
-          // parameter variables.
-          Unflatten(indentation, output_sizes, inputs_var, &result_);
-          std::vector<string> p;
-          for (int j : arg_list->second) {
-            p.emplace_back(param_names_[j].GetRenameTo());
-          }
-          strings::StrAppend(&result_, indentation, VectorToTuple(p), " = ",
-                             inputs_var, "\n");
-        }
-      } else if (attr.type() == "list(type)") {
-        // NOTE: We ignore default values for these attrs, since it is
-        // unclear how you would use it, and the one use case is
-        // parse_single_sequence_example which only needs it for
-        // backwards compatibility.
-        const string var_name = AttrVarName(attr.name(), &attr_expressions_);
-        string inputs_var;
-        string conversion;
-        if (arg_list->second.size() > 1) {
-          // If you have more than one list(tensor) argument, their types
-          // have to match.
-          std::vector<string> lists;
-          for (auto iter = arg_list->second.begin();
-               iter != arg_list->second.end(); ++iter) {
-            lists.push_back(param_names_[*iter].GetRenameTo());
-          }
-          inputs_var = VectorToTuple(lists);
-          conversion = "_execute.args_to_mixed_eager_tensors";
-        } else {
-          // For one list(tensor) argument, we just convert every
-          // element of the list to an eager tensor.
-          inputs_var = param_names_[arg_list->second.front()].GetRenameTo();
-          conversion = "_execute.convert_to_mixed_eager_tensors";
-        }
-        strings::StrAppend(&result_, indentation, var_name, ", ", inputs_var,
-                           " = ", conversion, "(", inputs_var, ", _ctx)\n");
-      }
-    }
-  }
-}
-
-void GenEagerPythonOp::AddEagerInputCasts(const string& indentation) {
-  // Cast remaining args to eager tensors
-  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
-    const auto& arg(op_def_.input_arg(i));
-    if (!arg.type_attr().empty() || !arg.type_list_attr().empty()) continue;
-    const string& param = param_names_[i].GetRenameTo();
-    const string fn = arg.number_attr().empty() ? "" : "n_";
-    const string dtype =
-        python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes.");
-    strings::StrAppend(&result_, indentation, param, " = _ops.convert_", fn,
-                       "to_tensor(", param, ", ", dtype, ")\n");
-  }
-}
-
-void GenEagerPythonOp::AddEagerAttrs(const string& indentation) {
-  // Compute eager attrs
-  if (op_def_.attr_size() > 0) {
-    string attr_values;
-    for (int i = 0; i < op_def_.attr_size(); ++i) {
-      if (i > 0) strings::StrAppend(&attr_values, ", ");
-      const auto& attr_name(op_def_.attr(i).name());
-      strings::StrAppend(&attr_values, "\"", attr_name, "\", ",
-                         attr_expressions_[attr_name]);
-    }
-    strings::StrAppend(&attr_values, ")");
-    strings::StrAppend(
-        &result_,
-        WordWrap(indentation, strings::StrCat("_attrs = (", attr_values),
-                 kRightMargin),
-        "\n");
-  } else {
-    strings::StrAppend(&result_, indentation, "_attrs = None\n");
-  }
-}
-
-void GenEagerPythonOp::AddEagerExecute(const string& indentation,
-                                       const string& num_outputs_expr) {
-  const string return_prefix =
-      strings::StrCat(indentation, "_result = _execute.execute(");
-  const string return_args = strings::StrCat(
-      "b\"", op_def_.name(), "\", ", num_outputs_expr,
-      ", inputs=_inputs_flat, attrs=_attrs, ctx=_ctx, name=name)");
-  strings::StrAppend(&result_,
-                     // Wrap the arguments, and indent to the (.
-                     WordWrap(return_prefix, return_args, kRightMargin), "\n");
-}
-
-string GetEagerPythonOps(const OpList& ops, const ApiDefMap& api_defs,
-                         const std::vector<string>& hidden_ops,
-                         bool require_shapes,
-                         const string& source_file_name = "") {
-  string result;
-  // Header
-  // TODO(josh11b): Mention the library for which wrappers are being generated.
-  strings::StrAppend(&result, R"("""Python wrappers around TensorFlow ops.
-
-This file is MACHINE GENERATED! Do not edit.
-)");
-
-  // Mention the original source file so someone tracing back through
-  // generated Python code will know where to look next.
-  if (!source_file_name.empty()) {
-    strings::StrAppend(&result, "Original C++ source file: ");
-    strings::StrAppend(&result, source_file_name);
-    strings::StrAppend(&result, "\n");
-  }
-
-  strings::StrAppend(&result, R"("""
-
-import collections as _collections
-import six as _six
-
-from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow
-from tensorflow.python.eager import context as _context
-from tensorflow.python.eager import core as _core
-from tensorflow.python.eager import execute as _execute
-from tensorflow.python.framework import dtypes as _dtypes
-from tensorflow.python.framework import errors as _errors
-from tensorflow.python.framework import tensor_shape as _tensor_shape
-
-from tensorflow.core.framework import op_def_pb2 as _op_def_pb2
-# Needed to trigger the call to _set_call_cpp_shape_fn.
-from tensorflow.python.framework import common_shapes as _common_shapes
-from tensorflow.python.framework import op_def_registry as _op_def_registry
-from tensorflow.python.framework import ops as _ops
-from tensorflow.python.framework import op_def_library as _op_def_library
-from tensorflow.python.util.tf_export import tf_export
-
-)");
-
-  // We'll make a copy of ops that filters out descriptions.
-  OpList cleaned_ops;
-  auto out = cleaned_ops.mutable_op();
-  out->Reserve(ops.op_size());
-  for (const auto& op_def : ops.op()) {
-    const auto* api_def = api_defs.GetApiDef(op_def.name());
-
-    if (api_def->visibility() == ApiDef::SKIP) {
-      continue;
-    }
-    // An op is hidden if either its ApiDef visibility is HIDDEN
-    // or it is in the hidden_ops list.
-    bool is_hidden = api_def->visibility() == ApiDef::HIDDEN;
-    bool hidden_by_api_def = is_hidden;
-    if (!is_hidden) {
-      for (const string& hidden : hidden_ops) {
-        if (op_def.name() == hidden) {
-          is_hidden = true;
-          break;
-        }
-      }
-    }
-
-    string function_name;
-    python_op_gen_internal::GenerateLowerCaseOpName(op_def.name(),
-                                                    &function_name);
-    bool is_reserved = python_op_gen_internal::IsPythonReserved(function_name);
-
-    // Prefix an op with underscore if the op is listed in hidden_ops or
-    // name is reserved or it is of the exceptions in IsOpWithUnderscorePrefix.
-    // Do not add underscores to ops set to HIDDEN in ApiDef otherwise.
-    // TODO(annarev): don't prefix with underscores even if op is in hidden_ops.
-    if (is_hidden) {
-      if (!hidden_by_api_def || is_reserved ||
-          python_op_gen_internal::IsOpWithUnderscorePrefix(function_name)) {
-        function_name = strings::StrCat("_", function_name);
-      }
-    } else if (is_reserved) {
-      // When users create custom python wrappers, they may link in the
-      // default op registry by accident, and because they can't
-      // enumerate all 'hidden' symbols, this guard is to prevent
-      // instantiating a python reserved word in their wrapper.
-      continue;
-    }
-
-    strings::StrAppend(&result,
-                       GetEagerPythonOp(op_def, *api_def, function_name));
-
-    if (!require_shapes) {
-      strings::StrAppend(&result, "_ops.RegisterShape(\"", op_def.name(),
-                         "\")(None)\n\n");
-    }
-
-    auto added = out->Add();
-    *added = op_def;
-    RemoveNonDeprecationDescriptionsFromOpDef(added);
-  }
-
-  result.append(R"(def _InitOpDefLibrary(op_list_proto_bytes):
-  op_list = _op_def_pb2.OpList()
-  op_list.ParseFromString(op_list_proto_bytes)
-  _op_def_registry.register_op_list(op_list)
-  op_def_lib = _op_def_library.OpDefLibrary()
-  op_def_lib.add_op_list(op_list)
-  return op_def_lib
-)");
-
-  result.append("# ");
-  auto ops_text = ProtoDebugString(cleaned_ops);
-  str_util::StripTrailingWhitespace(&ops_text);
-  result.append(str_util::StringReplace(ops_text, "\n", "\n# ", true));
-  result.append("\n");
-  strings::Appendf(&result, "_op_def_lib = _InitOpDefLibrary(b\"%s\")\n",
-                   str_util::CEscape(cleaned_ops.SerializeAsString()).c_str());
-  return result;
-}
-
-}  // namespace
-
-void PrintEagerPythonOps(const OpList& ops, const ApiDefMap& api_defs,
-                         const std::vector<string>& hidden_ops,
-                         bool require_shapes, const string& source_file_name) {
-  printf("%s", GetEagerPythonOps(ops, api_defs, hidden_ops, require_shapes,
-                                 source_file_name)
-                   .c_str());
-}
-
-string GetEagerPythonWrappers(const char* op_list_buf, size_t op_list_len) {
-  string op_list_str(op_list_buf, op_list_len);
-  OpList ops;
-  ops.ParseFromString(op_list_str);
-
-  ApiDefMap api_def_map(ops);
-  return GetEagerPythonOps(ops, api_def_map, {}, false);
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/python/eager/python_eager_op_gen.h b/tensorflow/python/eager/python_eager_op_gen.h
deleted file mode 100644
index d27b00139d129aba1c511a21afce749eae8b32ed..0000000000000000000000000000000000000000
--- a/tensorflow/python/eager/python_eager_op_gen.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
-#define TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
-
-#include <string>
-#include <vector>
-#include "tensorflow/core/framework/op_def.pb.h"
-#include "tensorflow/core/framework/op_gen_lib.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-
-// hidden_ops should be a list of Op names that should get a leading _
-// in the output. Prints the output to stdout.
-// Optional fourth argument is the name of the original C++ source file
-// where the ops' REGISTER_OP() calls reside.
-void PrintEagerPythonOps(const OpList& ops, const ApiDefMap& api_defs,
-                         const std::vector<string>& hidden_ops,
-                         bool require_shapes,
-                         const string& source_file_name = "");
-
-// Get the python wrappers for a list of ops in a OpList.
-// `op_list_buf` should be a pointer to a buffer containing
-// the binary encoded OpList proto, and `op_list_len` should be the
-// length of that buffer.
-string GetEagerPythonWrappers(const char* op_list_buf, size_t op_list_len);
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index b5b4e394e33bd3adcf8e90fa4c35f87fbbb5a155..a62af4a06c7d61216ffd3aae7fe3070363b99d94 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -299,7 +299,7 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
         GetContext(context), handle.get(), handle_dtype,
         static_cast<TF_DataType>(desired_dtype), self->status));
     if (TF_GetCode(self->status) != TF_OK) {
-      PyErr_SetString(PyExc_ValueError,
+      PyErr_SetString(PyExc_TypeError,
                       tensorflow::strings::StrCat(
                           "Error while casting from DataType ", handle_dtype,
                           " to ", desired_dtype, ". ", TF_Message(self->status))
@@ -650,6 +650,12 @@ tensorflow::int64 EagerTensor_id(const PyObject* tensor) {
   return reinterpret_cast<const EagerTensor*>(tensor)->id;
 }
 
+tensorflow::DataType EagerTensor_dtype(const PyObject* tensor) {
+  CHECK(EagerTensor_CheckExact(tensor));
+  return static_cast<tensorflow::DataType>(TFE_TensorHandleDataType(
+      reinterpret_cast<const EagerTensor*>(tensor)->handle));
+}
+
 PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) {
   if (!PyType_Check(base_class)) {
     PyErr_SetString(
diff --git a/tensorflow/python/eager/pywrap_tensor.h b/tensorflow/python/eager/pywrap_tensor.h
index 63ab1ed84d5ba3f280904be8dd202912e42241d0..bc042eb19e6a911fa95c9107401ffb2fe2fc3556 100644
--- a/tensorflow/python/eager/pywrap_tensor.h
+++ b/tensorflow/python/eager/pywrap_tensor.h
@@ -16,11 +16,13 @@ limitations under the License.
 #define TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
 
 #include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/lib/core/numpy.h"
 
 bool EagerTensor_CheckExact(const PyObject* o);
 tensorflow::int64 EagerTensor_id(const PyObject* tensor);
+tensorflow::DataType EagerTensor_dtype(const PyObject* tensor);
 
 namespace tensorflow {
 TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype);
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 691b613e48b217c595fe0f3249c493facf756d47..9bc8b9bc726f9187f89855aefc15d7b4f6427e60 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -120,6 +120,9 @@ PyObject* TFE_Py_TapeSetNew(PyObject* persistent);
 // Removes the passed tape from the set of active tapes.
 void TFE_Py_TapeSetRemove(PyObject* tape);
 
+// Adds the passed tape to the set of active tapes.
+void TFE_Py_TapeSetAdd(PyObject* tape);
+
 // Returns true if the tape stack is empty.
 PyObject* TFE_Py_TapeSetIsEmpty();
 
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 4ecba1a46be8fffa413a3d7fb2e643dc00f1b0ef..62deb41e9b5ed3706f9a427b6e5c674824a48ce7 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -174,6 +174,8 @@ bool IsInteger(PyObject* py_value) {
 #endif
 }
 
+// This function considers a Dimension._value of None to be valid, and sets the
+// value to be -1 in that case.
 bool ParseDimensionValue(const string& key, PyObject* py_value,
                          TF_Status* status, int64_t* value) {
   if (IsInteger(py_value)) {
@@ -191,6 +193,11 @@ bool ParseDimensionValue(const string& key, PyObject* py_value,
     return false;
   }
 
+  if (dimension_value.get() == Py_None) {
+    *value = -1;
+    return true;
+  }
+
   return ParseInt64Value(key, dimension_value.get(), status, value);
 }
 
@@ -843,6 +850,24 @@ static tensorflow::int64 FastTensorId(PyObject* tensor) {
   return id;
 }
 
+static tensorflow::DataType FastTensorDtype(PyObject* tensor) {
+  if (EagerTensor_CheckExact(tensor)) {
+    return EagerTensor_dtype(tensor);
+  }
+  PyObject* dtype_field = PyObject_GetAttrString(tensor, "dtype");
+  if (dtype_field == nullptr) {
+    return tensorflow::DT_INVALID;
+  }
+  PyObject* enum_field = PyObject_GetAttrString(dtype_field, "_type_enum");
+  Py_DECREF(dtype_field);
+  if (dtype_field == nullptr) {
+    return tensorflow::DT_INVALID;
+  }
+  tensorflow::int64 id = MakeInt(enum_field);
+  Py_DECREF(enum_field);
+  return static_cast<tensorflow::DataType>(id);
+}
+
 class GradientTape
     : public tensorflow::eager::GradientTape<PyObject, PyObject> {
  public:
@@ -991,6 +1016,14 @@ PyObject* TFE_Py_TapeSetNew(PyObject* persistent) {
   return reinterpret_cast<PyObject*>(tape);
 }
 
+void TFE_Py_TapeSetAdd(PyObject* tape) {
+  Py_INCREF(tape);
+  if (!GetTapeSet()->insert(reinterpret_cast<TFE_Py_Tape*>(tape)).second) {
+    // Already exists in the tape set.
+    Py_DECREF(tape);
+  }
+}
+
 PyObject* TFE_Py_TapeSetIsEmpty() {
   if (*ThreadTapeIsStopped() || GetTapeSet()->empty()) {
     Py_RETURN_TRUE;
@@ -1053,15 +1086,18 @@ PyObject* TFE_Py_TapeSetShouldRecord(PyObject* tensors) {
   // TODO(apassos) consider not building a list and changing the API to check
   // each tensor individually.
   std::vector<tensorflow::int64> tensor_ids;
+  std::vector<tensorflow::DataType> dtypes;
   tensor_ids.reserve(len);
+  dtypes.reserve(len);
   for (int i = 0; i < len; ++i) {
     PyObject* item = PySequence_Fast_GET_ITEM(seq, i);
     tensor_ids.push_back(FastTensorId(item));
+    dtypes.push_back(FastTensorDtype(item));
   }
   Py_DECREF(seq);
   auto tape_set = *tape_set_ptr;
   for (TFE_Py_Tape* tape : tape_set) {
-    if (tape->tape->ShouldRecord(tensor_ids)) {
+    if (tape->tape->ShouldRecord(tensor_ids, dtypes)) {
       Py_RETURN_TRUE;
     }
   }
@@ -1169,9 +1205,27 @@ PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape) {
 }
 
 namespace {
-void TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
-                            const std::vector<tensorflow::int64>& input_ids,
-                            PyObject* backward_function) {
+std::vector<tensorflow::DataType> MakeTensorDtypeList(PyObject* tensors) {
+  PyObject* seq = PySequence_Fast(tensors, "expected a sequence");
+  if (seq == nullptr) {
+    return {};
+  }
+  int len = PySequence_Fast_GET_SIZE(seq);
+  std::vector<tensorflow::DataType> list;
+  list.reserve(len);
+  for (int i = 0; i < len; ++i) {
+    PyObject* tensor = PySequence_Fast_GET_ITEM(seq, i);
+    list.push_back(FastTensorDtype(tensor));
+  }
+  Py_DECREF(seq);
+  return list;
+}
+
+void TapeSetRecordOperation(
+    PyObject* op_type, PyObject* output_tensors,
+    const std::vector<tensorflow::int64>& input_ids,
+    const std::vector<tensorflow::DataType>& input_dtypes,
+    PyObject* backward_function) {
   std::vector<tensorflow::eager::TapeTensor> output_info;
   PyObject* seq = PySequence_Fast(output_tensors,
                                   "expected a sequence of integer tensor ids");
@@ -1206,7 +1260,7 @@ void TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
   for (TFE_Py_Tape* tape : SafeTapeSet()) {
     Py_INCREF(backward_function);
     tape->tape->RecordOperation(
-        op_type_str, output_info, input_ids, backward_function,
+        op_type_str, output_info, input_ids, input_dtypes, backward_function,
         [backward_function]() { Py_DECREF(backward_function); });
   }
 }
@@ -1221,7 +1275,11 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
   std::vector<tensorflow::int64> input_ids = MakeTensorIDList(input_tensors);
   if (PyErr_Occurred()) return;
 
-  TapeSetRecordOperation(op_type, output_tensors, input_ids, backward_function);
+  std::vector<tensorflow::DataType> input_dtypes =
+      MakeTensorDtypeList(input_tensors);
+  if (PyErr_Occurred()) return;
+  TapeSetRecordOperation(op_type, output_tensors, input_ids, input_dtypes,
+                         backward_function);
 }
 
 void TFE_Py_TapeSetDeleteTrace(tensorflow::int64 tensor_id) {
@@ -1290,6 +1348,8 @@ class PyVSpace : public tensorflow::eager::VSpace<PyObject, PyObject> {
     return result;
   }
 
+  void MarkAsResult(PyObject* gradient) const final { Py_INCREF(gradient); }
+
   PyObject* Zeros(tensorflow::TensorShape shape,
                   tensorflow::DataType dtype) const final {
     PyObject* py_shape = PyTuple_New(shape.dims());
@@ -1710,10 +1770,12 @@ PyObject* RecordGradient(PyObject* op_name, PyObject* inputs, PyObject* attrs,
                          PyObject* results, PyObject* name) {
   std::vector<tensorflow::int64> input_ids = MakeTensorIDList(inputs);
   if (PyErr_Occurred()) return nullptr;
+  std::vector<tensorflow::DataType> input_dtypes = MakeTensorDtypeList(inputs);
+  if (PyErr_Occurred()) return nullptr;
 
   bool should_record = false;
   for (TFE_Py_Tape* tape : SafeTapeSet()) {
-    if (tape->tape->ShouldRecord(input_ids)) {
+    if (tape->tape->ShouldRecord(input_ids, input_dtypes)) {
       should_record = true;
       break;
     }
@@ -1744,7 +1806,8 @@ PyObject* RecordGradient(PyObject* op_name, PyObject* inputs, PyObject* attrs,
   Py_DECREF(callback_args);
   if (backward_function == nullptr) return nullptr;
 
-  TapeSetRecordOperation(op_name, results, input_ids, backward_function);
+  TapeSetRecordOperation(op_name, results, input_ids, input_dtypes,
+                         backward_function);
 
   Py_DECREF(backward_function);
 
diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py
index ad82266beca05d9f508a702124390fd934161ffd..caa217b70cabfdc3fdec3528ea1e7ca553072fbe 100644
--- a/tensorflow/python/eager/tape.py
+++ b/tensorflow/python/eager/tape.py
@@ -39,6 +39,11 @@ def push_new_tape(persistent=False):
   return Tape(tape)
 
 
+def push_tape(tape):
+  """Pushes an existing tape onto the tape stack."""
+  pywrap_tensorflow.TFE_Py_TapeSetAdd(tape._tape)  # pylint: disable=protected-access
+
+
 def watch(tensor):
   """Marks this tensor to be watched by all tapes in the stack.
 
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 56dec1eaa1f6087cb2348463378ddf40d2c73927..0754041f9eb50b429d02a06f9f0357c3431d3df5 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -21,6 +21,7 @@ py_library(
         ":export",
         ":exporter",
         ":inputs",
+        ":keras",
         ":linear",
         ":model_fn",
         ":parsing_utils",
@@ -38,6 +39,10 @@ py_library(
         ":gc",
         "//tensorflow/python:errors",
         "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:util",
+        "//tensorflow/python/estimator:metric_keys",
+        "//tensorflow/python/estimator:util",
     ],
 )
 
@@ -91,6 +96,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python/saved_model:signature_constants",
+        "//tensorflow/python/saved_model:tag_constants",
         "@six_archive//:six",
     ],
 )
@@ -444,16 +450,6 @@ py_library(
     ],
 )
 
-py_test(
-    name = "util_test",
-    srcs = ["util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":util",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
 py_library(
     name = "estimator",
     srcs = [
@@ -488,6 +484,7 @@ py_library(
 py_test(
     name = "estimator_test",
     srcs = ["estimator_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["notsan"],  # b/67510291
     deps = [
@@ -643,7 +640,6 @@ py_library(
         ":metric_keys",
         ":model_fn",
         ":prediction_keys",
-        ":util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:control_flow_ops",
@@ -657,6 +653,7 @@ py_library(
         "//tensorflow/python:string_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python:weights_broadcast_ops",
         "//tensorflow/python/feature_column",
         "//tensorflow/python/ops/losses",
@@ -909,3 +906,65 @@ py_test(
         "//tensorflow/python:training",
     ],
 )
+
+py_library(
+    name = "keras",
+    srcs = ["keras.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":estimator",
+        ":export_export",
+        ":model_fn",
+        ":run_config",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:training",
+        "//tensorflow/python:training_util",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/keras:backend",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:layers",
+        "//tensorflow/python/ops/losses",
+        "//tensorflow/python/saved_model",
+        "//tensorflow/python/saved_model:signature_constants",
+    ],
+)
+
+py_test(
+    name = "keras_test",
+    size = "large",
+    srcs = ["keras_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":keras",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/keras:backend",
+        "//tensorflow/python/keras:engine",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 6d7a3299f70c8748ffdab680411e1401a0b2277a..4e6010a162be6e6b7288900b428d7841db6e453c 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -57,7 +57,7 @@ def _get_transformed_features(features, sorted_feature_columns):
 
   Args:
     features: a dicionary of name to Tensor.
-    feature_columns: a list/set of tf.feature_column.
+    sorted_feature_columns: a list/set of tf.feature_column, sorted by name.
 
   Returns:
     result_features: a list of the transformed features, sorted by the name.
@@ -96,14 +96,18 @@ def _get_transformed_features(features, sorted_feature_columns):
   return result_features
 
 
-def _local_variable(tensor, name=None):
+def _local_variable(initial_value, name=None):
   """Stores a tensor as a local Variable for faster read."""
-  return variable_scope.variable(
-      initial_value=tensor,
+  result = variable_scope.variable(
+      initial_value=initial_value,
       trainable=False,
       collections=[ops.GraphKeys.LOCAL_VARIABLES],
       validate_shape=False,
       name=name)
+  if isinstance(initial_value, ops.Tensor):
+    # Match the resulting variable's shape if the initial_value is a Tensor.
+    result.set_shape(initial_value.shape)
+  return result
 
 
 def _group_features_by_num_buckets(sorted_feature_columns):
@@ -256,7 +260,7 @@ class _CacheTrainingStatesUsingHashTable(object):
     elif dtypes.as_dtype(dtypes.string).is_compatible_with(example_ids.dtype):
       empty_key = ''
     else:
-      raise ValueError('Unsupported example_id_feature dtype %s.',
+      raise ValueError('Unsupported example_id_feature dtype %s.' %
                        example_ids.dtype)
     # Cache holds latest <tree_id, node_id, logits> for each example.
     # tree_id and node_id are both int32 but logits is a float32.
@@ -264,7 +268,10 @@ class _CacheTrainingStatesUsingHashTable(object):
     # bitcast the ids to int32.
     self._table_ref = lookup_ops.mutable_dense_hash_table_v2(
         empty_key=empty_key, value_dtype=dtypes.float32, value_shape=[3])
-    self._example_ids = example_ids
+    self._example_ids = ops.convert_to_tensor(example_ids)
+    if self._example_ids.shape.ndims not in (None, 1):
+      raise ValueError('example_id should have rank 1, but got %s' %
+                       self._example_ids)
     self._logits_dimension = logits_dimension
 
   def lookup(self):
@@ -278,6 +285,9 @@ class _CacheTrainingStatesUsingHashTable(object):
         array_ops.bitcast(cached_tree_ids, dtypes.int32))
     cached_node_ids = array_ops.squeeze(
         array_ops.bitcast(cached_node_ids, dtypes.int32))
+    if self._example_ids.shape.ndims is not None:
+      cached_logits.set_shape(
+          [self._example_ids.shape[0], self._logits_dimension])
     return (cached_tree_ids, cached_node_ids, cached_logits)
 
   def insert(self, tree_ids, node_ids, logits):
@@ -668,13 +678,18 @@ def _create_classification_head_and_closed_form(n_classes, weight_column,
                                                 label_vocabulary):
   """Creates a head for classifier and the closed form gradients/hessians."""
   head = _create_classification_head(n_classes, weight_column, label_vocabulary)
-  if n_classes == 2 and weight_column is None and label_vocabulary is None:
+  if (n_classes == 2 and head.logits_dimension == 1 and weight_column is None
+      and label_vocabulary is None):
     # Use the closed-form gradients/hessians for 2 class.
     def _grad_and_hess_for_logloss(logits, labels):
+      """A closed form gradient and hessian for logistic loss."""
       # TODO(youngheek): add weights handling.
       predictions = math_ops.reciprocal(math_ops.exp(-logits) + 1.0)
       normalizer = math_ops.reciprocal(
           math_ops.cast(array_ops.size(predictions), dtypes.float32))
+      labels = math_ops.cast(labels, dtypes.float32)
+      labels = head_lib._check_dense_labels_match_logits_and_reshape(  # pylint: disable=protected-access
+          labels, logits, head.logits_dimension)
       gradients = (predictions - labels) * normalizer
       hessians = predictions * (1.0 - predictions) * normalizer
       return gradients, hessians
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 95bb9b5a3b5c0bb6bd2b2932fe60180161421cec..9ea4f484744762a98c67207d582bcc5b7be8d850 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.ops import resources
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import session_run_hook
 
 NUM_FEATURES = 3
 
@@ -60,7 +61,7 @@ def _make_train_input_fn(is_classification):
   """Makes train input_fn for classification/regression."""
 
   def _input_fn():
-    features_dict = dict(FEATURES_DICT)
+    features_dict = dict(FEATURES_DICT)  # copies the dict to add an entry.
     features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
     labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
     return features_dict, labels
@@ -72,7 +73,7 @@ def _make_train_input_fn_dataset(is_classification, batch=None, repeat=None):
   """Makes input_fn using Dataset."""
 
   def _input_fn():
-    features_dict = dict(FEATURES_DICT)
+    features_dict = dict(FEATURES_DICT)  # copies the dict to add an entry.
     features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
     labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
     if batch:
@@ -121,6 +122,39 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
 
     return ensemble_proto
 
+  def testFirstCheckpointWorksFine(self):
+    """Tests that eval/pred doesn't crash with the very first checkpoint.
+
+    The step-0 checkpoint will have only an empty ensemble, and a separate eval
+    job might read from it and crash.
+    This test ensures that prediction/evaluation works fine with it.
+    """
+    input_fn = _make_train_input_fn(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    class BailOutWithoutTraining(session_run_hook.SessionRunHook):
+
+      def before_run(self, run_context):
+        raise StopIteration('to bail out.')
+
+    est.train(input_fn, steps=100,  # must stop at 0 anyway.
+              hooks=[BailOutWithoutTraining()])
+    self._assert_checkpoint(
+        est.model_dir, global_step=0, finalized_trees=0, attempted_layers=0)
+    # Empty ensemble returns 0 logits, so that all output labels are 0.
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 0.6)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [0], [0], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
   def testTrainAndEvaluateBinaryClassifier(self):
     input_fn = _make_train_input_fn(is_classification=True)
 
@@ -160,6 +194,69 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     self.assertAllClose([[0], [1], [1], [0], [0]],
                         [pred['class_ids'] for pred in predictions])
 
+  def testTrainClassifierWithRankOneLabel(self):
+    """Tests that label with rank-1 tensor is also accepted by classifier."""
+    def _input_fn_with_rank_one_label():
+      return FEATURES_DICT, [0., 1., 1., 0., 0.]
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(_input_fn_with_rank_one_label, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=_input_fn_with_rank_one_label, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+
+  def testTrainClassifierWithLabelVocabulary(self):
+    apple, banana = 'apple', 'banana'
+    def _input_fn_with_label_vocab():
+      return FEATURES_DICT, [[apple], [banana], [banana], [apple], [apple]]
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5,
+        label_vocabulary=[apple, banana])
+    est.train(input_fn=_input_fn_with_label_vocab, steps=5)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=_input_fn_with_label_vocab, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
+  def testTrainClassifierWithIntegerLabel(self):
+    def _input_fn_with_integer_label():
+      return (FEATURES_DICT,
+              constant_op.constant([[0], [1], [1], [0], [0]], dtypes.int32))
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(input_fn=_input_fn_with_integer_label, steps=5)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=_input_fn_with_integer_label, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
   def testTrainClassifierWithDataset(self):
     train_input_fn = _make_train_input_fn_dataset(is_classification=True)
     predict_input_fn = numpy_io.numpy_input_fn(
@@ -219,6 +316,26 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
         [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
         [pred['predictions'] for pred in predictions])
 
+  def testTrainRegressorWithRankOneLabel(self):
+    """Tests that label with rank-1 tensor is also accepted by regressor."""
+    def _input_fn_with_rank_one_label():
+      return FEATURES_DICT, [1.5, 0.3, 0.2, 2., 5.]
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(_input_fn_with_rank_one_label, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=_input_fn_with_rank_one_label, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+
   def testTrainRegressorWithDataset(self):
     train_input_fn = _make_train_input_fn_dataset(is_classification=False)
     predict_input_fn = numpy_io.numpy_input_fn(
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index 973a6ec74777de0ccd4bfa9380574fe23062a4ea..1feac36f356cc5b2615217b7ca69a79d2a781ca6 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -126,7 +126,8 @@ def _dnn_model_fn(features,
                   activation_fn=nn.relu,
                   dropout=None,
                   input_layer_partitioner=None,
-                  config=None):
+                  config=None,
+                  tpu_estimator_spec=False):
   """Deep Neural Net model_fn.
 
   Args:
@@ -147,6 +148,8 @@ def _dnn_model_fn(features,
     input_layer_partitioner: Partitioner for input layer. Defaults
       to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
     config: `RunConfig` object to configure the runtime settings.
+    tpu_estimator_spec: Whether to return a `_TPUEstimatorSpec` or
+      or `model_fn.EstimatorSpec` instance.
 
   Returns:
     An `EstimatorSpec` instance.
@@ -182,12 +185,20 @@ def _dnn_model_fn(features,
         input_layer_partitioner=input_layer_partitioner)
     logits = logit_fn(features=features, mode=mode)
 
-    return head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        optimizer=optimizer,
-        logits=logits)
+    if tpu_estimator_spec:
+      return head._create_tpu_estimator_spec(  # pylint: disable=protected-access
+          features=features,
+          mode=mode,
+          labels=labels,
+          optimizer=optimizer,
+          logits=logits)
+    else:
+      return head.create_estimator_spec(
+          features=features,
+          mode=mode,
+          labels=labels,
+          optimizer=optimizer,
+          logits=logits)
 
 
 @tf_export('estimator.DNNClassifier')
@@ -320,17 +331,8 @@ class DNNClassifier(estimator.Estimator):
       loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
         to reduce training loss over batch. Defaults to `SUM`.
     """
-    if n_classes == 2:
-      head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
-          weight_column=weight_column,
-          label_vocabulary=label_vocabulary,
-          loss_reduction=loss_reduction)
-    else:
-      head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
-          n_classes, weight_column=weight_column,
-          label_vocabulary=label_vocabulary,
-          loss_reduction=loss_reduction)
-
+    head = head_lib._binary_logistic_or_multi_class_head(  # pylint: disable=protected-access
+        n_classes, weight_column, label_vocabulary, loss_reduction)
     def _model_fn(features, labels, mode, config):
       """Call the defined shared _dnn_model_fn."""
       return _dnn_model_fn(
diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index 62b13c3200dd782c14dc427b62f9b03086c7174f..06a648777f8f730b4c739a69528090c5821f2681 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -134,7 +134,7 @@ def mock_head(testcase, hidden_units, logits_dimension, expected_logits):
       hidden_weights_names + hidden_biases_names +
       [LOGITS_WEIGHTS_NAME + '/part_0:0', LOGITS_BIASES_NAME + '/part_0:0'])
 
-  def _create_estimator_spec(
+  def _create_tpu_estimator_spec(
       features, mode, logits, labels, train_op_fn=None, optimizer=None):
     del features, labels  # Not used.
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -149,19 +149,29 @@ def mock_head(testcase, hidden_units, logits_dimension, expected_logits):
           train_op = train_op_fn(loss)
         elif optimizer is not None:
           train_op = optimizer.minimize(loss, global_step=None)
-        return model_fn.EstimatorSpec(
+        return model_fn._TPUEstimatorSpec(
             mode=mode, loss=loss, train_op=train_op)
       elif mode == model_fn.ModeKeys.EVAL:
-        return model_fn.EstimatorSpec(mode=mode, loss=array_ops.identity(loss))
+        return model_fn._TPUEstimatorSpec(
+            mode=mode, loss=array_ops.identity(loss))
       elif mode == model_fn.ModeKeys.PREDICT:
-        return model_fn.EstimatorSpec(
+        return model_fn._TPUEstimatorSpec(
             mode=mode, predictions={'logits': array_ops.identity(logits)})
       else:
         testcase.fail('Invalid mode: {}'.format(mode))
 
+  def _create_estimator_spec(
+      features, mode, logits, labels, train_op_fn=None, optimizer=None):
+    tpu_spec = _create_tpu_estimator_spec(
+        features, mode, logits, labels, train_op_fn, optimizer)
+    return tpu_spec.as_estimator_spec()
+
   head = test.mock.NonCallableMagicMock(spec=head_lib._Head)
   head.logits_dimension = logits_dimension
-  head.create_estimator_spec = test.mock.MagicMock(wraps=_create_estimator_spec)
+  head._create_tpu_estimator_spec = test.mock.MagicMock(
+      wraps=_create_tpu_estimator_spec)
+  head.create_estimator_spec = test.mock.MagicMock(
+      wraps=_create_estimator_spec)
 
   return head
 
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index 48f448d7f5f917d8d2faeee46e79cf120286c4fc..04fe4d97e40d60f7e5a5c9c2e9b40a08678f35d1 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -24,7 +24,6 @@ import collections
 import six
 
 from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator import util
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export_output
@@ -32,6 +31,7 @@ from tensorflow.python.feature_column import feature_column as feature_column_li
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -45,6 +45,7 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary import summary
 from tensorflow.python.training import training_util
+from tensorflow.python.util import function_utils
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
@@ -69,6 +70,35 @@ def _summary_key(head_name, val):
   return '%s/%s' % (val, head_name) if head_name else val
 
 
+def _create_eval_metrics_tuple(fn, kwargs):
+  """Creates TPU eval metrics tuple.
+
+  Helper function to make eval_metric tuple (eval_metric_fn, fn_kwargs) used
+  by `TPUEstimator`. TPUEstimator requires that `eval_metric_fn` take
+  exclusively Tensor arguments. This helper can help create such a function from
+  a more generic function that can take both Tensor and non-Tensor arguments.
+
+  Args:
+    fn: A eval_metric_fn that takes both Tensor and non-Tensor arguments.
+        This function must return a dict of form
+        {'metric name': (metric_tensor, eval_op)}
+    kwargs: Dict of arguments for `fn`.
+
+  Returns:
+    `eval_metric` tuple that can be passed to a `model_fn._TPUEstimatorSpec`.
+  """
+  tensor_kwargs = {}
+  nontensor_kwargs = {}
+  for k, v in six.iteritems(kwargs):
+    if tensor_util.is_tensor(v):
+      tensor_kwargs[k] = v
+    else:
+      nontensor_kwargs[k] = v
+  def _fn(**tensors):
+    return fn(**dict(nontensor_kwargs, **tensors))
+  return (_fn, tensor_kwargs)
+
+
 class _Head(object):
   """Interface for the head/top of a model.
 
@@ -174,7 +204,6 @@ class _Head(object):
 
   # TODO(b/65403806): By default, collect regularization_losses from
   # GraphKeys.REGULARIZATION_LOSSES collection.
-  @abc.abstractmethod
   def create_estimator_spec(
       self, features, mode, logits, labels=None, optimizer=None,
       train_op_fn=None, regularization_losses=None):
@@ -203,7 +232,47 @@ class _Head(object):
     Returns:
       `EstimatorSpec`.
     """
-    raise NotImplementedError('Calling an abstract method.')
+    try:
+      tpu_estimator_spec = (
+          self._create_tpu_estimator_spec(
+              features, mode, logits, labels, optimizer, train_op_fn,
+              regularization_losses))
+      return tpu_estimator_spec.as_estimator_spec()
+    except NotImplementedError:
+      # Not all subclasses of _Head will have implemented
+      # _create_tpu_estimator_spec. If it is implemented, we can use it to
+      # create our `EstimatorSpec` here.
+      raise NotImplementedError(
+          'Subclasses of _Head must implement `create_estimator_spec()` or '
+          '_create_tpu_estimator_spec().')
+
+  def _create_tpu_estimator_spec(
+      self, features, mode, logits, labels=None, optimizer=None,
+      train_op_fn=None, regularization_losses=None):
+    """Returns `model_fn._TPUEstimatorSpec` that a model_fn can return.
+
+    Args:
+      features: Input `dict` of `Tensor` or `SparseTensor` objects.
+      mode: Estimator's `ModeKeys`.
+      logits: logits `Tensor` to be used by the head.
+      labels: Labels `Tensor`, or `dict` of same.
+      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
+        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
+        updates variables and increments `global_step`.
+      train_op_fn: Function that takes a scalar loss `Tensor` and returns an op
+        to optimize the model with the loss in TRAIN mode. Used if `optimizer`
+        is `None`. Exactly one of `train_op_fn` and `optimizer` must be set in
+        TRAIN mode. None is allowed in other modes. If you want to optimize loss
+        yourself you can pass `lambda _: tf.no_op()` and then use
+        EstimatorSpec.loss to compute and apply gradients.
+      regularization_losses: A list of additional scalar losses to be added to
+        the training loss, such as regularization losses.
+
+    Returns:
+      A `model_fn._TPUEstimatorSpec' instance.
+    """
+    raise NotImplementedError(
+        'TPUEstimatorSpec not available for this model head.')
 
 
 def _check_dense_labels_match_logits_and_reshape(
@@ -392,7 +461,7 @@ def _validate_loss_fn_args(loss_fn):
   Raises:
     ValueError: If the signature is unexpected.
   """
-  loss_fn_args = util.fn_args(loss_fn)
+  loss_fn_args = function_utils.fn_args(loss_fn)
   for required_arg in ['labels', 'logits']:
     if required_arg not in loss_fn_args:
       raise ValueError(
@@ -415,7 +484,7 @@ def _call_loss_fn(loss_fn, labels, logits, features, expected_loss_dim=1):
   Returns:
     Loss Tensor with shape [D0, D1, ... DN, expected_loss_dim].
   """
-  loss_fn_args = util.fn_args(loss_fn)
+  loss_fn_args = function_utils.fn_args(loss_fn)
   kwargs = {}
   if 'features' in loss_fn_args:
     kwargs['features'] = features
@@ -702,10 +771,10 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
         weights=weights,
         processed_labels=label_ids)
 
-  def create_estimator_spec(
+  def _create_tpu_estimator_spec(
       self, features, mode, logits, labels=None, optimizer=None,
       train_op_fn=None, regularization_losses=None):
-    """Returns an `EstimatorSpec`.
+    """Returns a `model_fn._TPUEstimatorSpec`.
 
     Args:
       features: Input `dict` of `Tensor` or `SparseTensor` objects.
@@ -727,7 +796,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
         `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
         avoid scaling errors.
     Returns:
-      `EstimatorSpec`.
+      A `model_fn._TPUEstimatorSpec` instance.
     Raises:
       ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
         mode, or if both are set.
@@ -761,7 +830,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
         classifier_output = _classification_output(
             scores=probabilities, n_classes=self._n_classes,
             label_vocabulary=self._label_vocabulary)
-        return model_fn.EstimatorSpec(
+        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
             mode=model_fn.ModeKeys.PREDICT,
             predictions=predictions,
             export_outputs={
@@ -781,16 +850,17 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
         regularized_training_loss = training_loss
       # Eval.
       if mode == model_fn.ModeKeys.EVAL:
-        return model_fn.EstimatorSpec(
+        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
             loss=regularized_training_loss,
-            eval_metric_ops=self._eval_metric_ops(
-                labels=label_ids,
-                class_ids=class_ids,
-                weights=weights,
-                unreduced_loss=unreduced_loss,
-                regularization_loss=regularization_loss))
+            eval_metrics=_create_eval_metrics_tuple(self._eval_metric_ops, {
+                'labels': label_ids,
+                'class_ids': class_ids,
+                'weights': weights,
+                'unreduced_loss': unreduced_loss,
+                'regularization_loss': regularization_loss
+            }))
 
       # Train.
       if optimizer is not None:
@@ -824,7 +894,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
         summary.scalar(
             _summary_key(self._name, keys.LOSS_REGULARIZATION),
             regularization_loss)
-    return model_fn.EstimatorSpec(
+    return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
@@ -1060,7 +1130,7 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
         weights=weights,
         processed_labels=labels)
 
-  def create_estimator_spec(
+  def _create_tpu_estimator_spec(
       self, features, mode, logits, labels=None, optimizer=None,
       train_op_fn=None, regularization_losses=None):
     """Returns an `EstimatorSpec`.
@@ -1122,7 +1192,7 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
         classifier_output = _classification_output(
             scores=probabilities, n_classes=2,
             label_vocabulary=self._label_vocabulary)
-        return model_fn.EstimatorSpec(
+        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
             mode=model_fn.ModeKeys.PREDICT,
             predictions=predictions,
             export_outputs={
@@ -1146,18 +1216,22 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
 
       # Eval.
       if mode == model_fn.ModeKeys.EVAL:
-        return model_fn.EstimatorSpec(
+        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
             loss=regularized_training_loss,
-            eval_metric_ops=self._eval_metric_ops(
-                labels=processed_labels,
-                logits=logits,
-                logistic=logistic,
-                class_ids=class_ids,
-                weights=weights,
-                unreduced_loss=unreduced_loss,
-                regularization_loss=regularization_loss))
+            eval_metrics=_create_eval_metrics_tuple(
+                self._eval_metric_ops,
+                {
+                    'labels': processed_labels,
+                    'logits': logits,
+                    'logistic': logistic,
+                    'class_ids': class_ids,
+                    'weights': weights,
+                    'unreduced_loss': unreduced_loss,
+                    'regularization_loss': regularization_loss
+                }
+            ))
 
       # Train.
       if optimizer is not None:
@@ -1190,7 +1264,7 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
         summary.scalar(
             _summary_key(self._name, keys.LOSS_REGULARIZATION),
             regularization_loss)
-    return model_fn.EstimatorSpec(
+    return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
@@ -1322,7 +1396,25 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         weights=weights,
         processed_labels=labels)
 
-  def create_estimator_spec(
+  def _eval_metric_ops(self, weights, unreduced_loss, regularization_loss):
+    """Returns the Eval metric ops."""
+    keys = metric_keys.MetricKeys
+    # Estimator already adds a metric for loss.
+    eval_metric_ops = {
+        _summary_key(self._name, keys.LOSS_MEAN):
+            metrics_lib.mean(
+                values=unreduced_loss,
+                weights=weights)
+    }
+    if regularization_loss is not None:
+      regularization_loss_key = _summary_key(
+          self._name, keys.LOSS_REGULARIZATION)
+      eval_metric_ops[regularization_loss_key] = metrics_lib.mean(
+          values=regularization_loss,
+          name=keys.LOSS_REGULARIZATION)
+    return eval_metric_ops
+
+  def _create_tpu_estimator_spec(
       self, features, mode, logits, labels=None, optimizer=None,
       train_op_fn=None, regularization_losses=None):
     """Returns an `EstimatorSpec`.
@@ -1348,7 +1440,7 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
         avoid scaling errors.
     Returns:
-      `EstimatorSpec`.
+      A `model_fn._TPUEstimatorSpec` instance.
     Raises:
       ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
         mode, or if both are set.
@@ -1369,7 +1461,7 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
       if mode == model_fn.ModeKeys.PREDICT:
         regression_output = export_output.RegressionOutput(
             value=predicted_value)
-        return model_fn.EstimatorSpec(
+        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
             mode=model_fn.ModeKeys.PREDICT,
             predictions=predictions,
             export_outputs={
@@ -1390,25 +1482,18 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
 
       # Eval.
       if mode == model_fn.ModeKeys.EVAL:
-        keys = metric_keys.MetricKeys
-        # Estimator already adds a metric for loss.
-        eval_metric_ops = {
-            _summary_key(self._name, keys.LOSS_MEAN):
-                metrics_lib.mean(
-                    values=unreduced_loss,
-                    weights=weights)
-        }
-        if regularization_loss is not None:
-          regularization_loss_key = _summary_key(
-              self._name, keys.LOSS_REGULARIZATION)
-          eval_metric_ops[regularization_loss_key] = metrics_lib.mean(
-              values=regularization_loss,
-              name=keys.LOSS_REGULARIZATION)
-        return model_fn.EstimatorSpec(
+        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
             loss=regularized_training_loss,
-            eval_metric_ops=eval_metric_ops)
+            eval_metrics=_create_eval_metrics_tuple(
+                self._eval_metric_ops,
+                {
+                    'weights': weights,
+                    'unreduced_loss': unreduced_loss,
+                    'regularization_loss': regularization_loss,
+                }
+            ))
 
       # Train.
       if optimizer is not None:
@@ -1441,7 +1526,7 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         summary.scalar(
             _summary_key(self._name, keys.LOSS_REGULARIZATION),
             regularization_loss)
-    return model_fn.EstimatorSpec(
+    return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
         loss=regularized_training_loss,
@@ -1460,21 +1545,40 @@ def _assert_range(labels, n_classes, message=None):
       return array_ops.identity(labels)
 
 
-# TODO(b/69000400): Delete this method.
-def _weights(features, weight_column):
-  """Fetches weights from features."""
-  with ops.name_scope(None, 'weights', values=features.values()):
-    if weight_column is None:
-      return 1.
-    if isinstance(weight_column, six.string_types):
-      weight_column = feature_column_lib.numeric_column(
-          key=weight_column, shape=(1,))
-    if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
-      raise TypeError('Weight column must be either a string or _NumericColumn.'
-                      ' Given type: {}.'.format(type(weight_column)))
-    weights = weight_column._get_dense_tensor(  # pylint: disable=protected-access
-        feature_column_lib._LazyBuilder(features))  # pylint: disable=protected-access
-    if not (weights.dtype.is_floating or weights.dtype.is_integer):
-      raise ValueError('Weight column should be castable to float. '
-                       'Given dtype: {}'.format(weights.dtype))
-    return math_ops.to_float(weights, name='weights')
+def _binary_logistic_or_multi_class_head(
+    n_classes, weight_column, label_vocabulary, loss_reduction):
+  """Creates either binary or multi-class head.
+
+  Args:
+    n_classes: Number of label classes.
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+      weights. It is used to down weight or boost examples during training. It
+      will be multiplied by the loss of the example. If it is a string, it is
+      used as a key to fetch weight tensor from the `features`. If it is a
+      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+      then weight_column.normalizer_fn is applied on it to get weight tensor.
+    label_vocabulary: A list of strings represents possible label values. If
+      given, labels must be string type and have any value in
+      `label_vocabulary`. If it is not given, that means labels are
+      already encoded as integer or float within [0, 1] for `n_classes=2` and
+      encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+      Also there will be errors if vocabulary is not provided and labels are
+      string.
+    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+      to reduce training loss over batch. Defaults to `SUM`.
+
+  Returns:
+    `head._Head` instance.
+  """
+  if n_classes == 2:
+    head = _binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_column=weight_column,
+        label_vocabulary=label_vocabulary,
+        loss_reduction=loss_reduction)
+  else:
+    head = _multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes, weight_column=weight_column,
+        label_vocabulary=label_vocabulary,
+        loss_reduction=loss_reduction)
+  return head
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index 32a6339936281152cad0ce1317ef7ae4810acd33..ecca3e8b0d82864c5fda6b94cc75db0521d5e8d3 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -86,6 +86,98 @@ def _sigmoid(logits):
   return 1 / (1 + np.exp(-logits))
 
 
+class CreateEstimatorSpecTest(test.TestCase):
+
+  class _HeadWithTPUSupport(head_lib._Head):
+    """Head that overrides _create_tpu_estimator_spec."""
+
+    def name(self):
+      return 'HeadWithTPUSupport'
+
+    def logits_dimension(self):
+      return None
+
+    def create_loss(self, features, mode, logits, labels):
+      return None
+
+    def _create_tpu_estimator_spec(self, features, mode, logits, labels=None,
+                                   optimizer=None, train_op_fn=None,
+                                   regularization_losses=None):
+      return model_fn._TPUEstimatorSpec(
+          mode=model_fn.ModeKeys.EVAL,
+          loss=constant_op.constant(0.0, dtype=dtypes.float32))
+
+  class _HeadWithOutTPUSupport(head_lib._Head):
+    """Head that overrides create_estimator_spec."""
+
+    def name(self):
+      return 'HeadWithOutTPUSupport'
+
+    def logits_dimension(self):
+      return None
+
+    def create_loss(self, features, mode, logits, labels):
+      return None
+
+    def create_estimator_spec(self, features, mode, logits, labels=None,
+                              optimizer=None, train_op_fn=None,
+                              regularization_losses=None):
+      return model_fn.EstimatorSpec(
+          mode=model_fn.ModeKeys.EVAL,
+          loss=constant_op.constant(0.0, dtype=dtypes.float32))
+
+  class _InvalidHead(head_lib._Head):
+    """Head that overrides neither estimator_spec functions."""
+
+    def name(self):
+      return 'InvalidHead'
+
+    def logits_dimension(self):
+      return None
+
+    def create_loss(self, features, mode, logits, labels):
+      return None
+
+  def test_head_override_tpu_estimator_spec(self):
+    """Test for `_Head` that overrides _create_tpu_estimator_spec."""
+    head = self._HeadWithTPUSupport()
+
+    tpu_spec = head._create_tpu_estimator_spec(
+        features=None, mode=None, logits=None)
+    self.assertTrue(isinstance(tpu_spec, model_fn._TPUEstimatorSpec))
+    est_spec = head.create_estimator_spec(
+        features=None, mode=None, logits=None)
+    self.assertTrue(isinstance(est_spec, model_fn.EstimatorSpec))
+
+  def test_head_override_estimator_spec(self):
+    """Test for `_Head` that overrides create_estimator_spec."""
+    head = self._HeadWithOutTPUSupport()
+
+    with self.assertRaisesRegexp(
+        NotImplementedError,
+        'TPUEstimatorSpec not available for this model head.'):
+      _ = head._create_tpu_estimator_spec(
+          features=None, mode=None, logits=None)
+    est_spec = head.create_estimator_spec(
+        features=None, mode=None, logits=None)
+    self.assertTrue(isinstance(est_spec, model_fn.EstimatorSpec))
+
+  def test_invalid_head_class(self):
+    head = self._InvalidHead()
+
+    with self.assertRaisesRegexp(
+        NotImplementedError,
+        'TPUEstimatorSpec not available for this model head.'):
+      _ = head._create_tpu_estimator_spec(
+          features=None, mode=None, logits=None)
+    with self.assertRaisesRegexp(
+        NotImplementedError,
+        r'Subclasses of _Head must implement `create_estimator_spec\(\)` or '
+        r'_create_tpu_estimator_spec\(\).'):
+      _ = head.create_estimator_spec(
+          features=None, mode=None, logits=None)
+
+
 class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/estimator/canned/metric_keys.py b/tensorflow/python/estimator/canned/metric_keys.py
index f374d3154982e3b7cdc637e9e3606b3a2947cbf3..4f7c849ba4b058492c55dd27e0bf79f8d540ece9 100644
--- a/tensorflow/python/estimator/canned/metric_keys.py
+++ b/tensorflow/python/estimator/canned/metric_keys.py
@@ -42,3 +42,8 @@ class MetricKeys(object):
   ACCURACY_AT_THRESHOLD = 'accuracy/positive_threshold_%g'
   PRECISION_AT_THRESHOLD = 'precision/positive_threshold_%g'
   RECALL_AT_THRESHOLD = 'recall/positive_threshold_%g'
+
+  # The following require a class id applied.
+  PROBABILITY_MEAN_AT_CLASS = 'probability_mean/class%d'
+  AUC_AT_CLASS = 'auc/class%d'
+  AUC_PR_AT_CLASS = 'auc_precision_recall/class%d'
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index cc8023a5e7d30d68eeb2d0132e001f1d3e4ce9e3..f09c90bec87f173df18c0327b1b615fe6dbbda1f 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -36,10 +36,9 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config
-from tensorflow.python.estimator import util
-from tensorflow.python.estimator.export.export import build_all_signature_defs
-from tensorflow.python.estimator.export.export import get_temp_export_dir
-from tensorflow.python.estimator.export.export import get_timestamped_export_dir
+from tensorflow.python.estimator.export import export as export_helpers
+from tensorflow.python.estimator.export import export_output
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
@@ -51,7 +50,6 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import builder as saved_model_builder
 from tensorflow.python.saved_model import constants
-from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.summary import summary
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import device_setter
@@ -64,6 +62,7 @@ from tensorflow.python.training import training_util
 from tensorflow.python.training import warm_starting_util
 from tensorflow.python.util import compat
 from tensorflow.python.util import compat_internal
+from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -372,6 +371,21 @@ class Estimator(object):
     else:
       return []
 
+  def eval_dir(self, name=None):
+    """Shows directory name where evaluation metrics are dumped.
+
+    Args:
+      name: Name of the evaluation if user needs to run multiple evaluations on
+        different data sets, such as on training data vs test data. Metrics for
+        different evaluations are saved in separate folders, and appear
+        separately in tensorboard.
+
+    Returns:
+      A string which is the path of directory contains evaluation metrics.
+    """
+    return os.path.join(self._model_dir, 'eval' if not name else
+                        'eval_' + name)
+
   def evaluate(self, input_fn, steps=None, hooks=None, checkpoint_path=None,
                name=None):
     """Evaluates the model given evaluation data input_fn.
@@ -423,11 +437,25 @@ class Estimator(object):
       hooks = _check_hooks_type(hooks)
       hooks.extend(self._convert_eval_steps_to_hooks(steps))
 
-      return self._evaluate_model(
-          input_fn=input_fn,
-          hooks=hooks,
-          checkpoint_path=checkpoint_path,
-          name=name)
+      # Check that model has been trained (if nothing has been set explicitly).
+      if not checkpoint_path:
+        latest_path = saver.latest_checkpoint(self._model_dir)
+        if not latest_path:
+          logging.info('Could not find trained model in model_dir: {}, running '
+                       'initialization to evaluate.'.format(self._model_dir))
+        checkpoint_path = latest_path
+
+      with ops.Graph().as_default():
+        (scaffold, update_op,
+         eval_dict, all_hooks) = self._evaluate_build_graph(
+             input_fn, hooks, checkpoint_path)
+        return self._evaluate_run(
+            checkpoint_path=checkpoint_path,
+            scaffold=scaffold,
+            update_op=update_op,
+            eval_dict=eval_dict,
+            all_hooks=all_hooks,
+            output_dir=self.eval_dir(name))
 
   def _convert_eval_steps_to_hooks(self, steps):
     if steps is None:
@@ -609,73 +637,321 @@ class Estimator(object):
           are provided, or no checkpoint can be found.
     """
     # pylint: enable=line-too-long
+    return self._export_saved_model_for_mode(
+        export_dir_base,
+        serving_input_receiver_fn,
+        assets_extra=assets_extra,
+        as_text=as_text,
+        checkpoint_path=checkpoint_path,
+        strip_default_attrs=strip_default_attrs,
+        mode=model_fn_lib.ModeKeys.PREDICT)
+
+  def _export_saved_model_for_mode(
+      self, export_dir_base, input_receiver_fn,
+      assets_extra=None,
+      as_text=False,
+      checkpoint_path=None,
+      strip_default_attrs=False,
+      mode=model_fn_lib.ModeKeys.PREDICT):
+    # pylint: disable=line-too-long
+    """Exports a single train/eval/predict graph as a SavedModel.
+
+    This method is a wrapper for _export_all_saved_models, and wraps a raw
+    input_receiver_fn in a dictionary to pass in to that function.
+    See _export_all_saved_models for full docs.
+
+    See tf.contrib.estimator.export_saved_model_for_mode for the currently
+    exposed version of this function.
+
+    Args:
+      export_dir_base: A string containing a directory in which to create
+        timestamped subdirectories containing exported SavedModels.
+      input_receiver_fn: a function that takes no argument and
+        returns the appropriate subclass of `InputReceiver`.
+      assets_extra: A dict specifying how to populate the assets.extra directory
+        within the exported SavedModel, or `None` if no extra assets are needed.
+      as_text: whether to write the SavedModel proto in text format.
+      checkpoint_path: The checkpoint path to export.  If `None` (the default),
+        the most recent checkpoint found within the model directory is chosen.
+      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+        removed from the NodeDefs. For a detailed guide, see
+        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      mode: tf.estimator.ModeKeys value indicating with mode will be exported.
+
+    Returns:
+      The string path to the exported directory.
+
+    Raises:
+      ValueError: if input_receiver_fn is None, no export_outputs
+        are provided, or no checkpoint can be found.
+    """
+    # pylint: enable=line-too-long
+    if not input_receiver_fn:
+      raise ValueError('An input_receiver_fn must be defined.')
+
+    input_receiver_fn_map = {mode: input_receiver_fn}
+
+    return self._export_all_saved_models(
+        export_dir_base,
+        input_receiver_fn_map,
+        assets_extra=assets_extra,
+        as_text=as_text,
+        checkpoint_path=checkpoint_path,
+        strip_default_attrs=strip_default_attrs)
+
+  def _export_all_saved_models(
+      self, export_dir_base, input_receiver_fn_map,
+      assets_extra=None,
+      as_text=False,
+      checkpoint_path=None,
+      strip_default_attrs=False):
+    # pylint: disable=line-too-long
+    """Exports a SavedModel containing MetaGraphDefs for each requested mode.
+
+    See tf.contrib.estimator.export_all_saved_models for the currently
+    exposed version of this function.
+
+    For each mode passed in via the input_receiver_fn_map,
+    this method builds a new graph by calling the input_receiver_fn to obtain
+    feature and label `Tensor`s. Next, this method calls the `Estimator`'s
+    model_fn in the passed mode to generate the model graph based on
+    those features and labels, and restores the given checkpoint
+    (or, lacking that, the most recent checkpoint) into the graph.
+    Only one of the modes is used for saving variables to the SavedModel
+    (order of preference: TRAIN, EVAL, then PREDICT), such that up to three
+    MetaGraphDefs are saved with a single set of variables in a single
+    SavedModel directory.
+
+    For the variables and MetaGraphDefs, a timestamped export directory below
+    export_dir_base, and writes a `SavedModel` into it containing
+    the `MetaGraphDef` for the given mode and its associated signatures.
+
+    For prediction, the exported `MetaGraphDef` will provide one `SignatureDef`
+    for each element of the export_outputs dict returned from the model_fn,
+    named using the same keys.  One of these keys is always
+    signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, indicating which
+    signature will be served when a serving request does not specify one.
+    For each signature, the outputs are provided by the corresponding
+    `ExportOutput`s, and the inputs are always the input receivers provided by
+    the serving_input_receiver_fn.
+
+    For training and evaluation, the train_op is stored in an extra collection,
+    and loss, metrics, and predictions are included in a SignatureDef for the
+    mode in question.
+
+    Extra assets may be written into the SavedModel via the assets_extra
+    argument.  This should be a dict, where each key gives a destination path
+    (including the filename) relative to the assets.extra directory.  The
+    corresponding value gives the full path of the source file to be copied.
+    For example, the simple case of copying a single file without renaming it
+    is specified as `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
+
+    Args:
+      export_dir_base: A string containing a directory in which to create
+        timestamped subdirectories containing exported SavedModels.
+      input_receiver_fn_map: dict of tf.estimator.ModeKeys to input_receiver_fn
+        mappings, where the input_receiver_fn is a function that takes no
+        argument and returns the appropriate subclass of `InputReceiver`.
+      assets_extra: A dict specifying how to populate the assets.extra directory
+        within the exported SavedModel, or `None` if no extra assets are needed.
+      as_text: whether to write the SavedModel proto in text format.
+      checkpoint_path: The checkpoint path to export.  If `None` (the default),
+        the most recent checkpoint found within the model directory is chosen.
+      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+        removed from the NodeDefs. For a detailed guide, see
+        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+
+    Returns:
+      A dict of tf.estimator.ModeKeys value to string path for each exported
+      directory.
+
+    Raises:
+      ValueError: if any input_receiver_fn is None, no export_outputs
+        are provided, or no checkpoint can be found.
+    """
+    # pylint: enable=line-too-long
+    # TODO(b/65561022): Consider allowing multiple input_receiver_fns per mode.
     with context.graph_mode():
-      if serving_input_receiver_fn is None:
-        raise ValueError('serving_input_receiver_fn must be defined.')
+      if not checkpoint_path:
+        # Locate the latest checkpoint
+        checkpoint_path = saver.latest_checkpoint(self._model_dir)
+      if not checkpoint_path:
+        raise ValueError("Couldn't find trained model at %s." % self._model_dir)
+
+      export_dir = export_helpers.get_timestamped_export_dir(export_dir_base)
+      temp_export_dir = export_helpers.get_temp_export_dir(export_dir)
+
+      builder = saved_model_builder.SavedModelBuilder(temp_export_dir)
+
+      save_variables = True
+      # Note that the order in which we run here matters, as the first
+      # mode we pass through will be used to save the variables. We run TRAIN
+      # first, as that is also the mode used for checkpoints, and therefore
+      # we are not likely to have vars in PREDICT that are not in the checkpoint
+      # created by TRAIN.
+      if input_receiver_fn_map.get(model_fn_lib.ModeKeys.TRAIN):
+        self._add_meta_graph_for_mode(
+            builder, input_receiver_fn_map, checkpoint_path,
+            strip_default_attrs, save_variables,
+            mode=model_fn_lib.ModeKeys.TRAIN)
+        save_variables = False
+      if input_receiver_fn_map.get(model_fn_lib.ModeKeys.EVAL):
+        self._add_meta_graph_for_mode(
+            builder, input_receiver_fn_map, checkpoint_path,
+            strip_default_attrs, save_variables,
+            mode=model_fn_lib.ModeKeys.EVAL)
+        save_variables = False
+      if input_receiver_fn_map.get(model_fn_lib.ModeKeys.PREDICT):
+        self._add_meta_graph_for_mode(
+            builder, input_receiver_fn_map, checkpoint_path,
+            strip_default_attrs, save_variables,
+            mode=model_fn_lib.ModeKeys.PREDICT)
+        save_variables = False
+
+      if save_variables:
+        raise ValueError('No valid modes for exporting found. Got {}.'.format(
+            input_receiver_fn_map.keys()))
+
+      builder.save(as_text)
+
+      # Add the extra assets
+      if assets_extra:
+        assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir),
+                                         compat.as_bytes('assets.extra'))
+        for dest_relative, source in assets_extra.items():
+          dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
+                                       compat.as_bytes(dest_relative))
+          dest_path = os.path.dirname(dest_absolute)
+          gfile.MakeDirs(dest_path)
+          gfile.Copy(source, dest_absolute)
+
+      gfile.Rename(temp_export_dir, export_dir)
+      return export_dir
+
+  def _add_meta_graph_for_mode(
+      self, builder, input_receiver_fn_map, checkpoint_path,
+      strip_default_attrs, save_variables=True,
+      mode=model_fn_lib.ModeKeys.PREDICT):
+    # pylint: disable=line-too-long
+    """Loads variables and adds them along with a MetaGraphDef for saving.
 
-      with ops.Graph().as_default() as g:
-        self._create_and_assert_global_step(g)
-        random_seed.set_random_seed(self._config.tf_random_seed)
-        serving_input_receiver = serving_input_receiver_fn()
+    Args:
+      builder: instance of SavedModelBuilder that will be used for saving.
+      input_receiver_fn_map: dict of tf.estimator.ModeKeys to input_receiver_fn
+        mappings, where the input_receiver_fn is a function that takes no
+        argument and returns the appropriate subclass of `InputReceiver`.
+      checkpoint_path: The checkpoint path to export.  If `None` (the default),
+        the most recent checkpoint found within the model directory is chosen.
+      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+        removed from the NodeDefs. For a detailed guide, see
+        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      save_variables: bool, whether variables should be saved. If False, just
+        the MetaGraphDef will be saved. Note that save_variables should only be
+        True for the first call to this function, and the SavedModelBuilder will
+        raise an error if that is not the case.
+      mode: tf.estimator.ModeKeys value indicating which mode will be exported.
+    """
+    # pylint: enable=line-too-long
+    input_receiver_fn = input_receiver_fn_map[mode]
+    with ops.Graph().as_default() as g:
+      self._create_and_assert_global_step(g)
+      random_seed.set_random_seed(self._config.tf_random_seed)
 
-        # Call the model_fn and collect the export_outputs.
-        estimator_spec = self._call_model_fn(
-            features=serving_input_receiver.features,
-            labels=None,
-            mode=model_fn_lib.ModeKeys.PREDICT,
-            config=self.config)
-
-        # Build the SignatureDefs from receivers and all outputs
-        signature_def_map = build_all_signature_defs(
-            serving_input_receiver.receiver_tensors,
-            estimator_spec.export_outputs,
-            serving_input_receiver.receiver_tensors_alternatives)
-
-        if not checkpoint_path:
-          # Locate the latest checkpoint
-          checkpoint_path = saver.latest_checkpoint(self._model_dir)
-        if not checkpoint_path:
-          raise ValueError(
-              "Couldn't find trained model at %s." % self._model_dir)
-
-        export_dir = get_timestamped_export_dir(export_dir_base)
-        temp_export_dir = get_temp_export_dir(export_dir)
-
-        # TODO(soergel): Consider whether MonitoredSession makes sense here
-        with tf_session.Session(config=self._session_config) as session:
-
-          saver_for_restore = estimator_spec.scaffold.saver or saver.Saver(
-              sharded=True)
-          saver_for_restore.restore(session, checkpoint_path)
+      input_receiver = input_receiver_fn()
+
+      # Call the model_fn and collect the export_outputs.
+      estimator_spec = self._call_model_fn(
+          features=input_receiver.features,
+          labels=getattr(input_receiver, 'labels', None),
+          mode=mode,
+          config=self.config)
+
+      export_outputs = self._get_export_outputs_for_spec(estimator_spec)
+
+      # Build the SignatureDefs from receivers and all outputs
+      signature_def_map = export_helpers.build_all_signature_defs(
+          input_receiver.receiver_tensors,
+          export_outputs,
+          getattr(input_receiver, 'receiver_tensors_alternatives', None),
+          serving_only=(mode == model_fn_lib.ModeKeys.PREDICT))
+
+      with tf_session.Session(config=self._session_config) as session:
 
-          local_init_op = (
-              estimator_spec.scaffold.local_init_op or
-              monitored_session.Scaffold.default_local_init_op())
+        export_tags = model_fn_lib.EXPORT_TAG_MAP[mode]
 
-          # Perform the export
-          builder = saved_model_builder.SavedModelBuilder(temp_export_dir)
+        local_init_op = (
+            estimator_spec.scaffold.local_init_op or
+            monitored_session.Scaffold.default_local_init_op())
+
+        saver_for_restore = estimator_spec.scaffold.saver or saver.Saver(
+            sharded=True)
+
+        try:
+          saver_for_restore.restore(session, checkpoint_path)
+        except errors.NotFoundError as e:
+          msg = ('Could not load all requested variables from the checkpoint. '
+                 'Please make sure your model_fn does not expect variables '
+                 'that were not saved in the checkpoint.\n\n'
+                 'Encountered error with mode `{}` while restoring checkpoint '
+                 'from: `{}`. Full Traceback:\n\n{}').format(
+                     mode, checkpoint_path, e)
+          raise ValueError(msg)
+
+        # We add the train op explicitly for now, so that we don't have to
+        # change the Builder public interface. Note that this is a no-op
+        # for prediction, where train_op is None.
+        builder._add_train_op(estimator_spec.train_op)  # pylint: disable=protected-access
+
+        meta_graph_kwargs = dict(
+            tags=export_tags,
+            signature_def_map=signature_def_map,
+            assets_collection=ops.get_collection(
+                ops.GraphKeys.ASSET_FILEPATHS),
+            strip_default_attrs=strip_default_attrs,
+            legacy_init_op=local_init_op)
+
+        if save_variables:
           builder.add_meta_graph_and_variables(
-              session, [tag_constants.SERVING],
-              signature_def_map=signature_def_map,
-              assets_collection=ops.get_collection(
-                  ops.GraphKeys.ASSET_FILEPATHS),
-              legacy_init_op=local_init_op,
-              strip_default_attrs=strip_default_attrs)
-          builder.save(as_text)
-
-        # Add the extra assets
-        if assets_extra:
-          assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir),
-                                           compat.as_bytes('assets.extra'))
-          for dest_relative, source in assets_extra.items():
-            dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
-                                         compat.as_bytes(dest_relative))
-            dest_path = os.path.dirname(dest_absolute)
-            gfile.MakeDirs(dest_path)
-            gfile.Copy(source, dest_absolute)
-
-        gfile.Rename(temp_export_dir, export_dir)
-        return export_dir
+              session, **meta_graph_kwargs)
+        else:
+          builder.add_meta_graph(**meta_graph_kwargs)
+
+  def _get_export_outputs_for_spec(self, estimator_spec):
+    """Given an EstimatorSpec, determine what our export outputs should be.
+
+    EstimatorSpecs contain export_outputs that are used for serving, but for
+    training and eval graphs, we must wrap the tensors of interest in
+    appropriate ExportOutput objects.
+
+    Args:
+      estimator_spec: EstimatorSpec object that will be exported.
+
+    Returns:
+      a dict mapping export_output_name to ExportOutput object.
+
+    Raises:
+      ValueError: if an appropriate ExportOutput cannot be found for the
+        passed EstimatorSpec.mode
+    """
+    mode = estimator_spec.mode
+    if mode == model_fn_lib.ModeKeys.PREDICT:
+      outputs = estimator_spec.export_outputs
+    else:
+      if mode == model_fn_lib.ModeKeys.TRAIN:
+        output_class = export_output.TrainOutput
+      elif mode == model_fn_lib.ModeKeys.EVAL:
+        output_class = export_output.EvalOutput
+      else:
+        raise ValueError(
+            'Export output type not found for mode: {}'.format(mode))
+
+      export_out = output_class(
+          loss=estimator_spec.loss,
+          predictions=estimator_spec.predictions,
+          metrics=estimator_spec.eval_metric_ops)
+      outputs = {mode: export_out}
+
+    return outputs
 
   def _get_features_from_input_fn(self, input_fn, mode):
     """Extracts the `features` from return values of `input_fn`."""
@@ -790,7 +1066,7 @@ class Estimator(object):
     Raises:
       ValueError: if input_fn takes invalid arguments.
     """
-    input_fn_args = util.fn_args(input_fn)
+    input_fn_args = function_utils.fn_args(input_fn)
     kwargs = {}
     if 'mode' in input_fn_args:
       kwargs['mode'] = mode
@@ -816,7 +1092,7 @@ class Estimator(object):
     Raises:
       ValueError: if model_fn returns invalid objects.
     """
-    model_fn_args = util.fn_args(self._model_fn)
+    model_fn_args = function_utils.fn_args(self._model_fn)
     kwargs = {}
     if 'labels' in model_fn_args:
       kwargs['labels'] = labels
@@ -1064,70 +1340,59 @@ class Estimator(object):
         _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
     return loss
 
-  def _evaluate_model(self,
-                      input_fn,
-                      hooks=None,
-                      checkpoint_path=None,
-                      name=''):
-    """Evaluates the model using the training.evaluation library."""
-    # Check that model has been trained (if nothing has been set explicitly).
-    if not checkpoint_path:
-      latest_path = saver.latest_checkpoint(self._model_dir)
-      if not latest_path:
-        logging.info('Could not find trained model in model_dir: {}, running '
-                     'initialization to evaluate.'.format(self._model_dir))
-      checkpoint_path = latest_path
-
-    # Setup output directory.
-    eval_dir = os.path.join(self._model_dir, 'eval' if not name else
-                            'eval_' + name)
-
-    with ops.Graph().as_default() as g:
-      random_seed.set_random_seed(self._config.tf_random_seed)
-      global_step_tensor = self._create_and_assert_global_step(g)
-      features, labels, input_hooks = (
-          self._get_features_and_labels_from_input_fn(
-              input_fn, model_fn_lib.ModeKeys.EVAL))
-      estimator_spec = self._call_model_fn(
-          features, labels, model_fn_lib.ModeKeys.EVAL, self.config)
-
-      # Call to warm_start has to be after model_fn is called.
-      self._maybe_warm_start(checkpoint_path)
-
-      if model_fn_lib.LOSS_METRIC_KEY in estimator_spec.eval_metric_ops:
-        raise ValueError(
-            'Metric with name "%s" is not allowed, because Estimator ' % (
-                model_fn_lib.LOSS_METRIC_KEY) +
-            'already defines a default metric with the same name.')
-      estimator_spec.eval_metric_ops[
-          model_fn_lib.LOSS_METRIC_KEY] = metrics_lib.mean(estimator_spec.loss)
+  def _evaluate_build_graph(self, input_fn, hooks=None, checkpoint_path=None):
+    """Builds the graph and related hooks to run evaluation."""
+    random_seed.set_random_seed(self._config.tf_random_seed)
+    global_step_tensor = self._create_and_assert_global_step(
+        ops.get_default_graph())
+    features, labels, input_hooks = (
+        self._get_features_and_labels_from_input_fn(input_fn,
+                                                    model_fn_lib.ModeKeys.EVAL))
+    estimator_spec = self._call_model_fn(
+        features, labels, model_fn_lib.ModeKeys.EVAL, self.config)
+
+    # Call to warm_start has to be after model_fn is called.
+    self._maybe_warm_start(checkpoint_path)
+
+    if model_fn_lib.LOSS_METRIC_KEY in estimator_spec.eval_metric_ops:
+      raise ValueError(
+          'Metric with name "%s" is not allowed, because Estimator ' %
+          (model_fn_lib.LOSS_METRIC_KEY) +
+          'already defines a default metric with the same name.')
+    estimator_spec.eval_metric_ops[
+        model_fn_lib.LOSS_METRIC_KEY] = metrics_lib.mean(estimator_spec.loss)
 
-      update_op, eval_dict = _extract_metric_update_ops(
-          estimator_spec.eval_metric_ops)
+    update_op, eval_dict = _extract_metric_update_ops(
+        estimator_spec.eval_metric_ops)
 
-      if ops.GraphKeys.GLOBAL_STEP in eval_dict:
-        raise ValueError(
-            'Metric with name `global_step` is not allowed, because Estimator '
-            'already defines a default metric with the same name.')
-      eval_dict[ops.GraphKeys.GLOBAL_STEP] = global_step_tensor
-
-      all_hooks = list(input_hooks)
-      all_hooks.extend(hooks)
-      all_hooks.extend(list(estimator_spec.evaluation_hooks or []))
-
-      eval_results = evaluation._evaluate_once(  # pylint: disable=protected-access
-          checkpoint_path=checkpoint_path,
-          master=self._config.evaluation_master,
-          scaffold=estimator_spec.scaffold,
-          eval_ops=update_op,
-          final_ops=eval_dict,
-          hooks=all_hooks,
-          config=self._session_config)
-
-      _write_dict_to_summary(
-          output_dir=eval_dir,
-          dictionary=eval_results,
-          current_global_step=eval_results[ops.GraphKeys.GLOBAL_STEP])
+    if ops.GraphKeys.GLOBAL_STEP in eval_dict:
+      raise ValueError(
+          'Metric with name `global_step` is not allowed, because Estimator '
+          'already defines a default metric with the same name.')
+    eval_dict[ops.GraphKeys.GLOBAL_STEP] = global_step_tensor
+
+    all_hooks = list(input_hooks)
+    all_hooks.extend(hooks)
+    all_hooks.extend(list(estimator_spec.evaluation_hooks or []))
+
+    return estimator_spec.scaffold, update_op, eval_dict, all_hooks
+
+  def _evaluate_run(self, checkpoint_path, scaffold, update_op, eval_dict,
+                    all_hooks, output_dir):
+    """Run evaluation."""
+    eval_results = evaluation._evaluate_once(  # pylint: disable=protected-access
+        checkpoint_path=checkpoint_path,
+        master=self._config.evaluation_master,
+        scaffold=scaffold,
+        eval_ops=update_op,
+        final_ops=eval_dict,
+        hooks=all_hooks,
+        config=self._session_config)
+
+    _write_dict_to_summary(
+        output_dir=output_dir,
+        dictionary=eval_results,
+        current_global_step=eval_results[ops.GraphKeys.GLOBAL_STEP])
 
     return eval_results
 
@@ -1225,7 +1490,7 @@ def _get_replica_device_setter(config):
 
 def _verify_model_fn_args(model_fn, params):
   """Verifies model fn arguments."""
-  args = set(util.fn_args(model_fn))
+  args = set(function_utils.fn_args(model_fn))
   if 'features' not in args:
     raise ValueError('model_fn (%s) must include features argument.' % model_fn)
   if params is not None and 'params' not in args:
@@ -1351,6 +1616,7 @@ class _DatasetInitializerHook(training.SessionRunHook):
     session.run(self._initializer)
 
 VocabInfo = warm_starting_util.VocabInfo  # pylint: disable=invalid-name
+tf_export('estimator.VocabInfo', allow_multiple_exports=True)(VocabInfo)
 
 
 @tf_export('estimator.WarmStartSettings')
diff --git a/tensorflow/python/estimator/estimator_lib.py b/tensorflow/python/estimator/estimator_lib.py
index 3815f4247056444f85996c62a2a4e3fff03e3f2a..f188f2d4e6096a968691e94201ba69efe506833f 100644
--- a/tensorflow/python/estimator/estimator_lib.py
+++ b/tensorflow/python/estimator/estimator_lib.py
@@ -39,6 +39,7 @@ from tensorflow.python.estimator.exporter import Exporter
 from tensorflow.python.estimator.exporter import FinalExporter
 from tensorflow.python.estimator.exporter import LatestExporter
 from tensorflow.python.estimator.inputs import inputs
+from tensorflow.python.estimator.keras import model_to_estimator
 from tensorflow.python.estimator.model_fn import EstimatorSpec
 from tensorflow.python.estimator.model_fn import ModeKeys
 from tensorflow.python.estimator.run_config import RunConfig
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index 76b45b7f57633b2e8cb5948befa22142a69db5aa..1b701899487ede96a90a6be299323fe278e5027f 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -33,7 +33,6 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config
-from tensorflow.python.estimator import util
 from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.export import export_output
 from tensorflow.python.estimator.inputs import numpy_io
@@ -72,6 +71,7 @@ from tensorflow.python.training import saver_test_utils
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training
 from tensorflow.python.util import compat
+from tensorflow.python.util import function_utils
 
 _TMP_DIR = '/tmp'
 _ANOTHER_TMP_DIR = '/another_tmp'
@@ -332,7 +332,7 @@ class EstimatorConstructorTest(test.TestCase):
       _, _, _, _, _ = features, labels, mode, config, params
 
     est = estimator.Estimator(model_fn=model_fn)
-    model_fn_args = util.fn_args(est.model_fn)
+    model_fn_args = function_utils.fn_args(est.model_fn)
     self.assertEqual(
         set(['features', 'labels', 'mode', 'config']), set(model_fn_args))
 
@@ -342,7 +342,7 @@ class EstimatorConstructorTest(test.TestCase):
       _, _ = features, labels
 
     est = estimator.Estimator(model_fn=model_fn)
-    model_fn_args = util.fn_args(est.model_fn)
+    model_fn_args = function_utils.fn_args(est.model_fn)
     self.assertEqual(
         set(['features', 'labels', 'mode', 'config']), set(model_fn_args))
 
@@ -1061,6 +1061,15 @@ class EstimatorDatasetIntegrationTest(test.TestCase):
 
 class EstimatorEvaluateTest(test.TestCase):
 
+  def test_eval_dir(self):
+    est = estimator.Estimator(
+        model_fn=model_fn_global_step_incrementer,
+        model_dir='some_path')
+    expected_eval_dir = os.path.join('some_path', 'eval')
+    self.assertEqual(expected_eval_dir, est.eval_dir())
+    expected_eval_dir_name = os.path.join('some_path', 'eval_a_name')
+    self.assertEqual(expected_eval_dir_name, est.eval_dir('a_name'))
+
   def test_input_fn_args(self):
     expected_mode = model_fn_lib.ModeKeys.EVAL
     expected_params = {'batch_size': 10}
@@ -1385,7 +1394,7 @@ class EstimatorEvaluateTest(test.TestCase):
     # Get last evaluation Event written.
     for key in ['foo/0', 'foo/1', 'foo/2']:
       self.assertTrue(
-          check_eventfile_for_keyword(key, os.path.join(est.model_dir, 'eval')),
+          check_eventfile_for_keyword(key, est.eval_dir()),
           '{} should be part of reported summaries.'.format(key))
 
 
@@ -1865,6 +1874,41 @@ def _model_fn_for_export_tests(features, labels, mode):
           'test': export_output.ClassificationOutput(scores, classes)})
 
 
+def _x_y_input_fn():
+  return ({'x': constant_op.constant([[1], [1]]),
+           'y': constant_op.constant([[2], [2]])},
+          constant_op.constant([[1], [1]]))
+
+
+def _model_fn_with_x_y(features, labels, mode):
+  _ = labels
+  variables.Variable(1., name='weight')
+  scores = constant_op.constant([3.])
+  classes = constant_op.constant(['wumpus'])
+  if mode == model_fn_lib.ModeKeys.PREDICT:
+    variables.Variable(36., name='name_collision')
+    return model_fn_lib.EstimatorSpec(
+        mode,
+        predictions=constant_op.constant(10.),
+        export_outputs={
+            'test': export_output.ClassificationOutput(scores, classes)})
+  else:
+    prefix = 'eval_' if mode == model_fn_lib.ModeKeys.EVAL else ''
+
+    multiplied = math_ops.multiply(
+        features['x'], features['y'], name='{}multiplied'.format(prefix))
+    metrics = {'mean': metrics_lib.mean(features['x'] - features['y'],
+                                        name='{}mean'.format(prefix))}
+    variables.Variable(1., name='later_var')
+    variables.Variable(3., name='name_collision')
+    return model_fn_lib.EstimatorSpec(
+        mode,
+        predictions=multiplied,
+        loss=constant_op.constant(1.),
+        train_op=state_ops.assign_add(training.get_global_step(), 1),
+        eval_metric_ops=metrics)
+
+
 def _model_fn_with_saveables_for_export_tests(features, labels, mode):
   _, _ = features, labels
   table = saver_test_utils.CheckpointedOp(name='v2')
@@ -1881,21 +1925,41 @@ def _model_fn_with_saveables_for_export_tests(features, labels, mode):
           'test': export_output.PredictOutput({'prediction': prediction})})
 
 
+def _get_serving_input_receiver_fn():
+  feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
+                  'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
+  return export.build_parsing_serving_input_receiver_fn(feature_spec)
+
+
+def _get_supervised_input_receiver_fn():
+  feature_spec = {
+      'x': array_ops.placeholder(
+          dtype=dtypes.int64, shape=(2, 1), name='feature_x'),
+      'y': array_ops.placeholder(
+          dtype=dtypes.int64, shape=(2, 1), name='feature_y')
+      }
+  label_spec = array_ops.placeholder(
+      dtype=dtypes.float32, shape=[1], name='truth')
+
+  return export.build_raw_supervised_input_receiver_fn(feature_spec, label_spec)
+
+
 _VOCAB_FILE_CONTENT = 'emerson\nlake\npalmer\n'
 _EXTRA_FILE_CONTENT = 'kermit\npiggy\nralph\n'
 
 
 class EstimatorExportTest(test.TestCase):
 
-  def test_export_savedmodel_proto_roundtrip(self):
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn_for_export_tests)
-    est.train(input_fn=dummy_input_fn, steps=1)
+  def test_export_savedmodel_proto_roundtrip_raw_receiver(self):
     feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
                     'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
     serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
         feature_spec)
 
+    tmpdir = tempfile.mkdtemp()
+    est = estimator.Estimator(model_fn=_model_fn_for_export_tests)
+    est.train(input_fn=dummy_input_fn, steps=1)
+
     # Perform the export.
     export_dir_base = os.path.join(
         compat.as_bytes(tmpdir), compat.as_bytes('export'))
@@ -1904,6 +1968,248 @@ class EstimatorExportTest(test.TestCase):
 
     # Check that all the files are in the right places.
     self.assertTrue(gfile.Exists(export_dir_base))
+    self._validate_exported_files(export_dir)
+
+    # Restore, to validate that the export was well-formed.
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.SERVING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('input_example_tensor' in graph_ops)
+        self.assertTrue('ParseExample/ParseExample' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+  def test_export_saved_model_train(self):
+    self._test_export_saved_model_for_mode(
+        _get_supervised_input_receiver_fn(), model_fn_lib.ModeKeys.TRAIN)
+
+  def test_export_saved_model_eval(self):
+    self._test_export_saved_model_for_mode(
+        _get_supervised_input_receiver_fn(), model_fn_lib.ModeKeys.EVAL)
+
+  def test_export_saved_model_predict(self):
+    self._test_export_saved_model_for_mode(
+        _get_serving_input_receiver_fn(), model_fn_lib.ModeKeys.PREDICT)
+
+  def _test_export_saved_model_for_mode(self, input_receiver_fn, mode):
+    tmpdir = tempfile.mkdtemp()
+    est = estimator.Estimator(model_fn=_model_fn_for_export_tests)
+    est.train(input_fn=_x_y_input_fn, steps=1)
+
+    # Perform the export.
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+    export_dir = est._export_saved_model_for_mode(
+        export_dir_base, input_receiver_fn, mode=mode)
+
+    # Check that all the files are in the right places.
+    self.assertTrue(gfile.Exists(export_dir_base))
+    self._validate_exported_files(export_dir)
+
+    # Restore, to validate that the export was well-formed.
+    tag_set = model_fn_lib.EXPORT_TAG_MAP[mode]
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, tag_set, export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertFalse('name_collision_1' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_receiver_map(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.SERVING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('input_example_tensor' in graph_ops)
+        self.assertTrue('ParseExample/ParseExample' in graph_ops)
+        self.assertFalse('feature_x' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_train_only(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.TRAINING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('multiplied' in graph_ops)
+        self.assertTrue('mean/update_op' in graph_ops)
+        self.assertFalse('eval_multiplied' in graph_ops)
+        self.assertTrue('feature_x' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_eval_only(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.EVAL], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('eval_multiplied' in graph_ops)
+        self.assertTrue('eval_mean/value' in graph_ops)
+        self.assertFalse('multiplied' in graph_ops)
+        self.assertTrue('feature_x' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_no_serving(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.TRAINING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('multiplied' in graph_ops)
+        self.assertFalse('eval_multiplied' in graph_ops)
+        self.assertTrue('feature_x' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.EVAL], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('eval_multiplied' in graph_ops)
+        self.assertFalse('multiplied' in graph_ops)
+        # TODO(karmel): is this the desired behavior when names are shared?
+        self.assertTrue('feature_x_1' in graph_ops)
+        self.assertTrue('feature_y_1' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_three_defs(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    # Restore, to validate that the export was well-formed.
+    for tag_set in model_fn_lib.EXPORT_TAG_MAP.values():
+      with ops.Graph().as_default() as graph:
+        with session.Session(graph=graph) as sess:
+          loader.load(sess, tag_set, export_dir)
+          graph_ops = [x.name for x in graph.get_operations()]
+          self.assertTrue('global_step/Assign' in graph_ops)
+          self.assertTrue('global_step/Initializer/zeros' in graph_ops)
+          self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_proto_roundtrip_all_vars(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.TRAINING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('later_var' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.SERVING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertFalse('later_var' in graph_ops)
+        self.assertTrue('weight' in graph_ops)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def test_export_all_saved_models_name_collision(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
+    }
+    export_dir, tmpdir = self._test_export_all_saved_models(
+        input_receiver_fn_map)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.TRAINING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('name_collision' in graph_ops)
+        self.assertFalse('name_collision_1' in graph_ops)
+        collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+        self.assertEqual(3, collection_vars[-1].eval())
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.SERVING], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue('name_collision' in graph_ops)
+        self.assertFalse('name_collision_1' in graph_ops)
+        collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+        # This is a non-obvious detail: when we load the estimator spec
+        # for predict, name_collision gets set to 36. However, we then restore
+        # from checkpoint, which should overwrite that var and make it the 3
+        # from training. In practice, this would not be a good way to write
+        # a model_fn, but leaving this check in for now to ensure consistency
+        # with what would happen given our current order of spec, then
+        # checkpoint.
+        self.assertEqual(3, collection_vars[-1].eval())
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
+  def _test_export_all_saved_models(self, input_receiver_fn_map):
+    tmpdir = tempfile.mkdtemp()
+    est = estimator.Estimator(model_fn=_model_fn_with_x_y)
+    est.train(input_fn=_x_y_input_fn, steps=1)
+
+    # Perform the export.
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+    export_dir = est._export_all_saved_models(
+        export_dir_base, input_receiver_fn_map)
+
+    # Check that all the files are in the right places.
+    self.assertTrue(gfile.Exists(export_dir_base))
+
+    self._validate_exported_files(export_dir)
+
+    return export_dir, tmpdir
+
+  def _validate_exported_files(self, export_dir):
     self.assertTrue(gfile.Exists(export_dir))
     self.assertTrue(gfile.Exists(os.path.join(
         compat.as_bytes(export_dir),
@@ -1918,17 +2224,41 @@ class EstimatorExportTest(test.TestCase):
         compat.as_bytes(export_dir),
         compat.as_bytes('variables/variables.data-00000-of-00001'))))
 
-    # Restore, to validate that the export was well-formed.
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('input_example_tensor' in graph_ops)
-        self.assertTrue('ParseExample/ParseExample' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
+  def test_export_all_saved_models_var_not_found(self):
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
+    }
 
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
+    def _model_fn_with_predict_only_vars(features, labels, mode):
+      _, _ = features, labels
+      if mode == model_fn_lib.ModeKeys.PREDICT:
+        variables.Variable(1., name='only_in_predict')
+      else:
+        variables.Variable(1., name='otherwise')
+
+      prediction = constant_op.constant(1.)
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          predictions=prediction,
+          loss=constant_op.constant(1.),
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          export_outputs={
+              'test': export_output.PredictOutput({'prediction': prediction})
+          })
+
+    tmpdir = tempfile.mkdtemp()
+    est = estimator.Estimator(model_fn=_model_fn_with_predict_only_vars)
+    est.train(input_fn=_x_y_input_fn, steps=1)
+
+    # Perform the export.
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+
+    err_regex = r'Could not load all requested variables[\w\W]*infer'
+    with self.assertRaisesRegexp(ValueError, err_regex):
+      est._export_all_saved_models(export_dir_base, input_receiver_fn_map)
 
   def test_export_savedmodel_with_saveables_proto_roundtrip(self):
     tmpdir = tempfile.mkdtemp()
@@ -2161,6 +2491,43 @@ class EstimatorExportTest(test.TestCase):
 
     self.assertTrue(self.mock_saver.restore.called)
 
+  def test_scaffold_is_used_for_saver_multiple_modes(self):
+    tmpdir = tempfile.mkdtemp()
+
+    def _model_fn_scaffold(features, labels, mode):
+      _, _ = features, labels
+      variables.Variable(1., name='weight')
+      real_saver = saver.Saver()
+      self.mock_saver = test.mock.Mock(
+          wraps=real_saver, saver_def=real_saver.saver_def)
+      scores = constant_op.constant([3.])
+      if mode == model_fn_lib.ModeKeys.PREDICT:
+        scaffold = training.Scaffold(saver=self.mock_saver)
+      else:
+        scaffold = training.Scaffold()
+      return model_fn_lib.EstimatorSpec(
+          mode=mode,
+          predictions=constant_op.constant([[1.]]),
+          loss=constant_op.constant(0.),
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          scaffold=scaffold,
+          export_outputs={'test': export_output.ClassificationOutput(scores)})
+
+    est = estimator.Estimator(model_fn=_model_fn_scaffold)
+    est.train(dummy_input_fn, steps=1)
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
+    }
+
+    # Perform the export.
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+    est._export_all_saved_models(export_dir_base, input_receiver_fn_map)
+
+    self.assertTrue(self.mock_saver.restore.called)
+
   def test_scaffold_is_used_for_local_init(self):
     tmpdir = tempfile.mkdtemp()
 
@@ -2206,6 +2573,61 @@ class EstimatorExportTest(test.TestCase):
         my_int_value = sess.run(my_int)
         self.assertEqual(12345, my_int_value)
 
+  def test_scaffold_is_used_for_local_init_multiple_modes(self):
+    tmpdir = tempfile.mkdtemp()
+
+    def _model_fn_scaffold(features, labels, mode):
+      _, _ = features, labels
+      my_int = variables.Variable(1, name='my_int',
+                                  collections=[ops.GraphKeys.LOCAL_VARIABLES])
+      scores = constant_op.constant([3.])
+      with ops.control_dependencies([
+          variables.local_variables_initializer(),
+          lookup_ops.tables_initializer()
+      ]):
+        assign_op = state_ops.assign(my_int, 12345)
+
+      custom_local_init_op = None
+      if mode == model_fn_lib.ModeKeys.PREDICT:
+        # local_initSop must be an Operation, not a Tensor.
+        custom_local_init_op = control_flow_ops.group(assign_op)
+
+      return model_fn_lib.EstimatorSpec(
+          mode=mode,
+          predictions=constant_op.constant([[1.]]),
+          loss=constant_op.constant(0.),
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          scaffold=training.Scaffold(local_init_op=custom_local_init_op),
+          export_outputs={'test': export_output.ClassificationOutput(scores)})
+
+    est = estimator.Estimator(model_fn=_model_fn_scaffold)
+    est.train(dummy_input_fn, steps=1)
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn(),
+        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
+    }
+
+    # Perform the export.
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+    export_dir = est._export_all_saved_models(
+        export_dir_base, input_receiver_fn_map)
+
+    # Restore, to validate that the custom local_init_op runs.
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.SERVING], export_dir)
+        my_int = graph.get_tensor_by_name('my_int:0')
+        my_int_value = sess.run(my_int)
+        self.assertEqual(12345, my_int_value)
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        loader.load(sess, [tag_constants.TRAINING], export_dir)
+        my_int = graph.get_tensor_by_name('my_int:0')
+        my_int_value = sess.run(my_int)
+        self.assertEqual(1, my_int_value)
+
   def test_features_labels_mode(self):
     given_features = {'test-features': constant_op.constant([[1], [1]])}
 
@@ -2485,5 +2907,6 @@ class EstimatorIntegrationTest(test.TestCase):
                                        serving_input_receiver_fn)
     self.assertTrue(gfile.Exists(export_dir))
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 41c1f5a2e25cd6ba1e93744b5b82ecc34c4375d0..48ae8cd49791c27a1e9674ed1be19d543d690b35 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -14,7 +14,6 @@
 # ==============================================================================
 """Configuration and utilities for receiving inputs at serving time."""
 
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -37,69 +36,104 @@ from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
-
 _SINGLE_FEATURE_DEFAULT_NAME = 'feature'
 _SINGLE_RECEIVER_DEFAULT_NAME = 'input'
+_SINGLE_LABEL_DEFAULT_NAME = 'label'
+
+
+def _wrap_and_check_receiver_tensors(receiver_tensors):
+  """Ensure that receiver_tensors is a dict of str to Tensor mappings.
+
+  Args:
+    receiver_tensors: dict of str to Tensors, or a single Tensor.
+
+  Returns:
+    dict of str to Tensors; this is the original dict if one was passed, or
+    the original tensor wrapped in a dictionary.
+
+  Raises:
+    ValueError: if receiver_tensors is None, or has non-string keys,
+      or non-Tensor values
+  """
+  if receiver_tensors is None:
+    raise ValueError('receiver_tensors must be defined.')
+  if not isinstance(receiver_tensors, dict):
+    receiver_tensors = {_SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
+  for name, tensor in receiver_tensors.items():
+    _check_tensor_key(name, error_label='receiver_tensors')
+    _check_tensor(tensor, name, error_label='receiver_tensor')
+  return receiver_tensors
+
+
+def _check_tensor(tensor, name, error_label='feature'):
+  """Check that passed `tensor` is a Tensor or SparseTensor."""
+  if not (isinstance(tensor, ops.Tensor) or
+          isinstance(tensor, sparse_tensor.SparseTensor)):
+    fmt_name = ' {}'.format(name) if name else ''
+    value_error = ValueError('{}{} must be a Tensor or SparseTensor.'.format(
+        error_label, fmt_name))
+    # NOTE(ericmc): This if-else block is a specific carve-out for
+    # LabeledTensor, which has a `.tensor` attribute and which is
+    # convertible to tf.Tensor via ops.convert_to_tensor.
+    # Allowing all types convertible to tf.Tensor is considered by soergel@
+    # to be too permissive.
+    # TODO(soergel): accept any type convertible to Tensor,
+    # as in cl/193238295 snapshot #6.
+    if hasattr(tensor, 'tensor'):
+      try:
+        ops.convert_to_tensor(tensor)
+      except TypeError:
+        raise value_error
+    else:
+      raise value_error
+
+
+def _check_tensor_key(name, error_label='feature'):
+  if not isinstance(name, six.string_types):
+    raise ValueError('{} keys must be strings: {}.'.format(error_label, name))
 
 
 @tf_export('estimator.export.ServingInputReceiver')
-class ServingInputReceiver(collections.namedtuple(
-    'ServingInputReceiver',
-    ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])):
+class ServingInputReceiver(
+    collections.namedtuple(
+        'ServingInputReceiver',
+        ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])):
   """A return type for a serving_input_receiver_fn.
 
   The expected return values are:
     features: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
-      `SparseTensor`, specifying the features to be passed to the model.
-    receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying
-      input nodes where this receiver expects to be fed by default.  Typically,
-      this is a single placeholder expecting serialized `tf.Example` protos.
+      `SparseTensor`, specifying the features to be passed to the model. Note:
+      if `features` passed is not a dict, it will be wrapped in a dict with a
+      single entry, using 'feature' as the key.  Consequently, the model must
+      accept a feature dict of the form {'feature': tensor}.  You may use
+      `TensorServingInputReceiver` if you want the tensor to be passed as is.
+    receiver_tensors: A `Tensor`, `SparseTensor`, or dict of string to `Tensor`
+      or `SparseTensor`, specifying input nodes where this receiver expects to
+      be fed by default.  Typically, this is a single placeholder expecting
+      serialized `tf.Example` protos.
     receiver_tensors_alternatives: a dict of string to additional
-      groups of receiver tensors, each of which may be a `Tensor` or a dict of
-      string to `Tensor`.  These named receiver tensor alternatives generate
-      additional serving signatures, which may be used to feed inputs at
-      different points within the input receiver subgraph.  A typical usage is
-      to allow feeding raw feature `Tensor`s *downstream* of the
-      tf.parse_example() op.  Defaults to None.
+      groups of receiver tensors, each of which may be a `Tensor`,
+      `SparseTensor`, or dict of string to `Tensor` or`SparseTensor`.
+      These named receiver tensor alternatives generate additional serving
+      signatures, which may be used to feed inputs at different points within
+      the input receiver subgraph.  A typical usage is to allow feeding raw
+      feature `Tensor`s *downstream* of the tf.parse_example() op.
+      Defaults to None.
   """
 
-  def __new__(cls, features, receiver_tensors,
+  def __new__(cls,
+              features,
+              receiver_tensors,
               receiver_tensors_alternatives=None):
     if features is None:
       raise ValueError('features must be defined.')
     if not isinstance(features, dict):
       features = {_SINGLE_FEATURE_DEFAULT_NAME: features}
     for name, tensor in features.items():
-      if not isinstance(name, six.string_types):
-        raise ValueError('feature keys must be strings: {}.'.format(name))
-      if not (isinstance(tensor, ops.Tensor)
-              or isinstance(tensor, sparse_tensor.SparseTensor)):
-        value_error = ValueError(
-            'feature {} must be a Tensor or SparseTensor.'.format(name))
-        # NOTE(ericmc): This if-else block is a specific carve-out for
-        # LabeledTensor, which has a `.tensor` attribute and which is
-        # convertible to tf.Tensor via ops.convert_to_tensor.
-        # Allowing all types convertible to tf.Tensor is considered by soergel@
-        # to be too permissive.
-        if hasattr(tensor, 'tensor'):
-          try:
-            ops.convert_to_tensor(tensor)
-          except TypeError:
-            raise value_error
-        else:
-          raise value_error
-
-    if receiver_tensors is None:
-      raise ValueError('receiver_tensors must be defined.')
-    if not isinstance(receiver_tensors, dict):
-      receiver_tensors = {_SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
-    for name, tensor in receiver_tensors.items():
-      if not isinstance(name, six.string_types):
-        raise ValueError(
-            'receiver_tensors keys must be strings: {}.'.format(name))
-      if not isinstance(tensor, ops.Tensor):
-        raise ValueError(
-            'receiver_tensor {} must be a Tensor.'.format(name))
+      _check_tensor_key(name)
+      _check_tensor(tensor, name)
+
+    receiver_tensors = _wrap_and_check_receiver_tensors(receiver_tensors)
 
     if receiver_tensors_alternatives is not None:
       if not isinstance(receiver_tensors_alternatives, dict):
@@ -109,20 +143,16 @@ class ServingInputReceiver(collections.namedtuple(
       for alternative_name, receiver_tensors_alt in (
           six.iteritems(receiver_tensors_alternatives)):
         if not isinstance(receiver_tensors_alt, dict):
-          receiver_tensors_alt = {_SINGLE_RECEIVER_DEFAULT_NAME:
-                                  receiver_tensors_alt}
+          receiver_tensors_alt = {
+              _SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors_alt
+          }
           # Updating dict during iteration is OK in this case.
           receiver_tensors_alternatives[alternative_name] = (
               receiver_tensors_alt)
         for name, tensor in receiver_tensors_alt.items():
-          if not isinstance(name, six.string_types):
-            raise ValueError(
-                'receiver_tensors keys must be strings: {}.'.format(name))
-          if not (isinstance(tensor, ops.Tensor)
-                  or isinstance(tensor, sparse_tensor.SparseTensor)):
-            raise ValueError(
-                'receiver_tensor {} must be a Tensor or SparseTensor.'.format(
-                    name))
+          _check_tensor_key(name, error_label='receiver_tensors_alternative')
+          _check_tensor(
+              tensor, name, error_label='receiver_tensors_alternative')
 
     return super(ServingInputReceiver, cls).__new__(
         cls,
@@ -132,9 +162,10 @@ class ServingInputReceiver(collections.namedtuple(
 
 
 @tf_export('estimator.export.TensorServingInputReceiver')
-class TensorServingInputReceiver(collections.namedtuple(
-    'TensorServingInputReceiver',
-    ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])):
+class TensorServingInputReceiver(
+    collections.namedtuple(
+        'TensorServingInputReceiver',
+        ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])):
   """A return type for a serving_input_receiver_fn.
 
   This is for use with models that expect a single `Tensor` or `SparseTensor`
@@ -155,25 +186,27 @@ class TensorServingInputReceiver(collections.namedtuple(
   The expected return values are:
     features: A single `Tensor` or `SparseTensor`, representing the feature
       to be passed to the model.
-    receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying
-      input nodes where this receiver expects to be fed by default.  Typically,
-      this is a single placeholder expecting serialized `tf.Example` protos.
+    receiver_tensors: A `Tensor`, `SparseTensor`, or dict of string to `Tensor`
+      or `SparseTensor`, specifying input nodes where this receiver expects to
+      be fed by default.  Typically, this is a single placeholder expecting
+      serialized `tf.Example` protos.
     receiver_tensors_alternatives: a dict of string to additional
-      groups of receiver tensors, each of which may be a `Tensor` or a dict of
-      string to `Tensor`.  These named receiver tensor alternatives generate
-      additional serving signatures, which may be used to feed inputs at
-      different points within the input receiver subgraph.  A typical usage is
-      to allow feeding raw feature `Tensor`s *downstream* of the
-      tf.parse_example() op.  Defaults to None.
+      groups of receiver tensors, each of which may be a `Tensor`,
+      `SparseTensor`, or dict of string to `Tensor` or`SparseTensor`.
+      These named receiver tensor alternatives generate additional serving
+      signatures, which may be used to feed inputs at different points within
+      the input receiver subgraph.  A typical usage is to allow feeding raw
+      feature `Tensor`s *downstream* of the tf.parse_example() op.
+      Defaults to None.
   """
 
-  def __new__(cls, features, receiver_tensors,
+  def __new__(cls,
+              features,
+              receiver_tensors,
               receiver_tensors_alternatives=None):
     if features is None:
       raise ValueError('features must be defined.')
-    if not (isinstance(features, ops.Tensor)
-            or isinstance(features, sparse_tensor.SparseTensor)):
-      raise ValueError('feature must be a Tensor or SparseTensor.')
+    _check_tensor(features, None)
 
     receiver = ServingInputReceiver(
         features=features,
@@ -187,6 +220,49 @@ class TensorServingInputReceiver(collections.namedtuple(
         receiver_tensors_alternatives=receiver.receiver_tensors_alternatives)
 
 
+class SupervisedInputReceiver(
+    collections.namedtuple('SupervisedInputReceiver',
+                           ['features', 'labels', 'receiver_tensors'])):
+  """A return type for a training_input_receiver_fn or eval_input_receiver_fn.
+
+  This differs from a ServingInputReceiver in that (1) this receiver expects
+  a set of labels to be passed in with features, and (2) this receiver does
+  not support receiver_tensors_alternatives, which are primarily used for
+  serving.
+
+  The expected return values are:
+    features: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
+      `SparseTensor`, specifying the features to be passed to the model.
+    labels: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
+      `SparseTensor`, specifying the labels to be passed to the model.
+    receiver_tensors: A `Tensor`, `SparseTensor`, or dict of string to `Tensor`
+      or `SparseTensor`, specifying input nodes where this receiver expects to
+      be fed by default.  Typically, this is a single placeholder expecting
+      serialized `tf.Example` protos.
+
+  """
+
+  def __new__(cls, features, labels, receiver_tensors):
+    # Both features and labels can be dicts or raw tensors.
+    for input_vals, error_label in ((features, 'feature'), (labels, 'label')):
+      if input_vals is None:
+        raise ValueError('{}s must be defined.'.format(error_label))
+      if isinstance(input_vals, dict):
+        for name, tensor in input_vals.items():
+          _check_tensor_key(name, error_label=error_label)
+          _check_tensor(tensor, name, error_label=error_label)
+      else:
+        _check_tensor(input_vals, None, error_label=error_label)
+
+    receiver_tensors = _wrap_and_check_receiver_tensors(receiver_tensors)
+
+    return super(SupervisedInputReceiver, cls).__new__(
+        cls,
+        features=features,
+        labels=labels,
+        receiver_tensors=receiver_tensors)
+
+
 @tf_export('estimator.export.build_parsing_serving_input_receiver_fn')
 def build_parsing_serving_input_receiver_fn(feature_spec,
                                             default_batch_size=None):
@@ -204,11 +280,13 @@ def build_parsing_serving_input_receiver_fn(feature_spec,
   Returns:
     A serving_input_receiver_fn suitable for use in serving.
   """
+
   def serving_input_receiver_fn():
     """An input_fn that expects a serialized tf.Example."""
-    serialized_tf_example = array_ops.placeholder(dtype=dtypes.string,
-                                                  shape=[default_batch_size],
-                                                  name='input_example_tensor')
+    serialized_tf_example = array_ops.placeholder(
+        dtype=dtypes.string,
+        shape=[default_batch_size],
+        name='input_example_tensor')
     receiver_tensors = {'examples': serialized_tf_example}
     features = parsing_ops.parse_example(serialized_tf_example, feature_spec)
     return ServingInputReceiver(features, receiver_tensors)
@@ -216,6 +294,25 @@ def build_parsing_serving_input_receiver_fn(feature_spec,
   return serving_input_receiver_fn
 
 
+def _placeholder_from_tensor(t, default_batch_size=None):
+  shape_list = t.get_shape().as_list()
+  shape_list[0] = default_batch_size
+  shape = tensor_shape.TensorShape(shape_list)
+
+  # Reuse the feature tensor's op name (t.op.name) for the placeholder,
+  # excluding the index from the tensor's name (t.name):
+  # t.name = "%s:%d" % (t.op.name, t._value_index)
+  return array_ops.placeholder(dtype=t.dtype, shape=shape, name=t.op.name)
+
+
+def _placeholders_from_receiver_tensors_dict(input_vals,
+                                             default_batch_size=None):
+  return {
+      name: _placeholder_from_tensor(t, default_batch_size)
+      for name, t in input_vals.items()
+  }
+
+
 @tf_export('estimator.export.build_raw_serving_input_receiver_fn')
 def build_raw_serving_input_receiver_fn(features, default_batch_size=None):
   """Build a serving_input_receiver_fn expecting feature Tensors.
@@ -231,19 +328,12 @@ def build_raw_serving_input_receiver_fn(features, default_batch_size=None):
   Returns:
     A serving_input_receiver_fn.
   """
+
   def serving_input_receiver_fn():
     """A serving_input_receiver_fn that expects features to be fed directly."""
-    receiver_tensors = {}
-    for name, t in features.items():
-      shape_list = t.get_shape().as_list()
-      shape_list[0] = default_batch_size
-      shape = tensor_shape.TensorShape(shape_list)
-
-      # Reuse the feature tensor's op name (t.op.name) for the placeholder,
-      # excluding the index from the tensor's name (t.name):
-      # t.name = "%s:%d" % (t.op.name, t._value_index)
-      receiver_tensors[name] = array_ops.placeholder(
-          dtype=t.dtype, shape=shape, name=t.op.name)
+    receiver_tensors = _placeholders_from_receiver_tensors_dict(
+        features, default_batch_size)
+
     # TODO(b/34885899): remove the unnecessary copy
     # The features provided are simply the placeholders, but we defensively copy
     # the dict because it may be mutated.
@@ -252,13 +342,101 @@ def build_raw_serving_input_receiver_fn(features, default_batch_size=None):
   return serving_input_receiver_fn
 
 
+def build_raw_supervised_input_receiver_fn(features,
+                                           labels,
+                                           default_batch_size=None):
+  """Build a supervised_input_receiver_fn for raw features and labels.
+
+  This function wraps tensor placeholders in a supervised_receiver_fn
+  with the expectation that the features and labels appear precisely as
+  the model_fn expects them. Features and labels can therefore be dicts of
+  tensors, or raw tensors.
+
+  Args:
+    features: a dict of string to `Tensor` or `Tensor`.
+    labels: a dict of string to `Tensor` or `Tensor`.
+    default_batch_size: the number of query examples expected per batch.
+        Leave unset for variable batch size (recommended).
+
+  Returns:
+    A supervised_input_receiver_fn.
+
+  Raises:
+    ValueError: if features and labels have overlapping keys.
+  """
+  # Check for overlapping keys before beginning.
+  try:
+    feat_keys = features.keys()
+  except AttributeError:
+    feat_keys = [_SINGLE_RECEIVER_DEFAULT_NAME]
+  try:
+    label_keys = labels.keys()
+  except AttributeError:
+    label_keys = [_SINGLE_LABEL_DEFAULT_NAME]
+
+  overlap_keys = set(feat_keys) & set(label_keys)
+  if overlap_keys:
+    raise ValueError('Features and labels must have distinct keys. '
+                     'Found overlapping keys: {}'.format(overlap_keys))
+
+  def supervised_input_receiver_fn():
+    """A receiver_fn that expects pass-through features and labels."""
+    if not isinstance(features, dict):
+      features_cp = _placeholder_from_tensor(features, default_batch_size)
+      receiver_features = {_SINGLE_RECEIVER_DEFAULT_NAME: features_cp}
+    else:
+      receiver_features = _placeholders_from_receiver_tensors_dict(
+          features, default_batch_size)
+      features_cp = receiver_features
+
+    if not isinstance(labels, dict):
+      labels_cp = _placeholder_from_tensor(labels, default_batch_size)
+      receiver_labels = {_SINGLE_LABEL_DEFAULT_NAME: labels_cp}
+    else:
+      receiver_labels = _placeholders_from_receiver_tensors_dict(
+          labels, default_batch_size)
+      labels_cp = receiver_labels
+
+    receiver_tensors = dict(receiver_features)
+    receiver_tensors.update(receiver_labels)
+    return SupervisedInputReceiver(features_cp, labels_cp, receiver_tensors)
+
+  return supervised_input_receiver_fn
+
+
 ### Below utilities are specific to SavedModel exports.
 
 
 def build_all_signature_defs(receiver_tensors,
                              export_outputs,
-                             receiver_tensors_alternatives=None):
-  """Build `SignatureDef`s for all export outputs."""
+                             receiver_tensors_alternatives=None,
+                             serving_only=True):
+  """Build `SignatureDef`s for all export outputs.
+
+  Args:
+    receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying
+      input nodes where this receiver expects to be fed by default.  Typically,
+      this is a single placeholder expecting serialized `tf.Example` protos.
+    export_outputs: a dict of ExportOutput instances, each of which has
+      an as_signature_def instance method that will be called to retrieve
+      the signature_def for all export output tensors.
+    receiver_tensors_alternatives: a dict of string to additional
+      groups of receiver tensors, each of which may be a `Tensor` or a dict of
+      string to `Tensor`.  These named receiver tensor alternatives generate
+      additional serving signatures, which may be used to feed inputs at
+      different points within the input receiver subgraph.  A typical usage is
+      to allow feeding raw feature `Tensor`s *downstream* of the
+      tf.parse_example() op.  Defaults to None.
+    serving_only: boolean; if true, resulting signature defs will only include
+      valid serving signatures. If false, all requested signatures will be
+      returned.
+
+  Returns:
+    signature_def representing all passed args.
+
+  Raises:
+    ValueError: if export_outputs is not a dict
+  """
   if not isinstance(receiver_tensors, dict):
     receiver_tensors = {_SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
   if export_outputs is None or not isinstance(export_outputs, dict):
@@ -279,11 +457,12 @@ def build_all_signature_defs(receiver_tensors,
     for receiver_name, receiver_tensors_alt in (
         six.iteritems(receiver_tensors_alternatives)):
       if not isinstance(receiver_tensors_alt, dict):
-        receiver_tensors_alt = {_SINGLE_RECEIVER_DEFAULT_NAME:
-                                receiver_tensors_alt}
+        receiver_tensors_alt = {
+            _SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors_alt
+        }
       for output_key, export_output in export_outputs.items():
-        signature_name = '{}:{}'.format(receiver_name or 'None',
-                                        output_key or 'None')
+        signature_name = '{}:{}'.format(receiver_name or 'None', output_key or
+                                        'None')
         try:
           signature = export_output.as_signature_def(receiver_tensors_alt)
           signature_def_map[signature_name] = signature
@@ -293,17 +472,27 @@ def build_all_signature_defs(receiver_tensors,
   _log_signature_report(signature_def_map, excluded_signatures)
 
   # The above calls to export_output.as_signature_def should return only
-  # valid signatures; if there is a validity problem, they raise ValueError,
-  # which we ignore above. Consequently the call to is_valid_signature here
-  # should not remove anything else; it's just an extra sanity check.
-  return {k: v for k, v in signature_def_map.items()
-          if signature_def_utils.is_valid_signature(v)}
+  # valid signatures; if there is a validity problem, they raise a ValueError,
+  # in which case we exclude that signature from signature_def_map above.
+  # The is_valid_signature check ensures that the signatures produced are
+  # valid for serving, and acts as an additional sanity check for export
+  # signatures produced for serving. We skip this check for training and eval
+  # signatures, which are not intended for serving.
+  if serving_only:
+    signature_def_map = {
+        k: v
+        for k, v in signature_def_map.items()
+        if signature_def_utils.is_valid_signature(v)
+    }
+  return signature_def_map
 
 
 _FRIENDLY_METHOD_NAMES = {
     signature_constants.CLASSIFY_METHOD_NAME: 'Classify',
     signature_constants.REGRESS_METHOD_NAME: 'Regress',
     signature_constants.PREDICT_METHOD_NAME: 'Predict',
+    signature_constants.SUPERVISED_TRAIN_METHOD_NAME: 'Train',
+    signature_constants.SUPERVISED_EVAL_METHOD_NAME: 'Eval',
 }
 
 
@@ -335,8 +524,8 @@ def _log_signature_report(signature_def_map, excluded_signatures):
 
   if not signature_def_map:
     logging.warn('Export includes no signatures!')
-  elif (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-        not in signature_def_map):
+  elif (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY not in
+        signature_def_map):
     logging.warn('Export includes no default signature!')
 
 
@@ -376,6 +565,5 @@ def get_temp_export_dir(timestamped_export_dir):
   """
   (dirname, basename) = os.path.split(timestamped_export_dir)
   temp_export_dir = os.path.join(
-      compat.as_bytes(dirname),
-      compat.as_bytes('temp-{}'.format(basename)))
+      compat.as_bytes(dirname), compat.as_bytes('temp-{}'.format(basename)))
   return temp_export_dir
diff --git a/tensorflow/python/estimator/export/export_output.py b/tensorflow/python/estimator/export/export_output.py
index 87b964be37197dac99b8ce4398cbdaf3b4989c7f..d387ea2940e7a450afe28b884c52113355c70fe6 100644
--- a/tensorflow/python/estimator/export/export_output.py
+++ b/tensorflow/python/estimator/export/export_output.py
@@ -38,6 +38,8 @@ class ExportOutput(object):
 
   __metaclass__ = abc.ABCMeta
 
+  _SEPARATOR_CHAR = '/'
+
   @abc.abstractmethod
   def as_signature_def(self, receiver_tensors):
     """Generate a SignatureDef proto for inclusion in a MetaGraphDef.
@@ -51,6 +53,52 @@ class ExportOutput(object):
     """
     pass
 
+  def _check_output_key(self, key, error_label):
+    # For multi-head models, the key can be a tuple.
+    if isinstance(key, tuple):
+      key = self._SEPARATOR_CHAR.join(key)
+
+    if not isinstance(key, six.string_types):
+      raise ValueError(
+          '{} output key must be a string; got {}.'.format(error_label, key))
+    return key
+
+  def _wrap_and_check_outputs(
+      self, outputs, single_output_default_name, error_label=None):
+    """Wraps raw tensors as dicts and checks type.
+
+    Note that we create a new dict here so that we can overwrite the keys
+    if necessary.
+
+    Args:
+      outputs: A `Tensor` or a dict of string to `Tensor`.
+      single_output_default_name: A string key for use in the output dict
+        if the provided `outputs` is a raw tensor.
+      error_label: descriptive string for use in error messages. If none,
+        single_output_default_name will be used.
+
+    Returns:
+      A dict of tensors
+
+    Raises:
+      ValueError: if the outputs dict keys are not strings or tuples of strings
+        or the values are not Tensors.
+    """
+    if not isinstance(outputs, dict):
+      outputs = {single_output_default_name: outputs}
+
+    output_dict = {}
+    for key, value in outputs.items():
+      error_name = error_label or single_output_default_name
+      key = self._check_output_key(key, error_name)
+      if not isinstance(value, ops.Tensor):
+        raise ValueError(
+            '{} output value must be a Tensor; got {}.'.format(
+                error_name, value))
+
+      output_dict[key] = value
+    return output_dict
+
 
 @tf_export('estimator.export.ClassificationOutput')
 class ClassificationOutput(ExportOutput):
@@ -154,9 +202,6 @@ class RegressionOutput(ExportOutput):
     return signature_def_utils.regression_signature_def(examples, self.value)
 
 
-_SINGLE_OUTPUT_DEFAULT_NAME = 'output'
-
-
 @tf_export('estimator.export.PredictOutput')
 class PredictOutput(ExportOutput):
   """Represents the output of a generic prediction head.
@@ -165,6 +210,7 @@ class PredictOutput(ExportOutput):
 
   Named outputs must be provided as a dict from string to `Tensor`,
   """
+  _SINGLE_OUTPUT_DEFAULT_NAME = 'output'
 
   def __init__(self, outputs):
     """Constructor for PredictOutput.
@@ -177,16 +223,9 @@ class PredictOutput(ExportOutput):
       ValueError: if the outputs is not dict, or any of its keys are not
           strings, or any of its values are not `Tensor`s.
     """
-    if not isinstance(outputs, dict):
-      outputs = {_SINGLE_OUTPUT_DEFAULT_NAME: outputs}
-    for key, value in outputs.items():
-      if not isinstance(key, six.string_types):
-        raise ValueError(
-            'Prediction output key must be a string; got {}.'.format(key))
-      if not isinstance(value, ops.Tensor):
-        raise ValueError(
-            'Prediction output value must be a Tensor; got {}.'.format(value))
-    self._outputs = outputs
+
+    self._outputs = self._wrap_and_check_outputs(
+        outputs, self._SINGLE_OUTPUT_DEFAULT_NAME, error_label='Prediction')
 
   @property
   def outputs(self):
@@ -195,3 +234,161 @@ class PredictOutput(ExportOutput):
   def as_signature_def(self, receiver_tensors):
     return signature_def_utils.predict_signature_def(receiver_tensors,
                                                      self.outputs)
+
+
+class _SupervisedOutput(ExportOutput):
+  """Represents the output of a supervised training or eval process."""
+  __metaclass__ = abc.ABCMeta
+
+  LOSS_NAME = 'loss'
+  PREDICTIONS_NAME = 'predictions'
+  METRICS_NAME = 'metrics'
+
+  METRIC_VALUE_SUFFIX = 'value'
+  METRIC_UPDATE_SUFFIX = 'update_op'
+
+  _loss = None
+  _predictions = None
+  _metrics = None
+
+  def __init__(self, loss=None, predictions=None, metrics=None):
+    """Constructor for SupervisedOutput (ie, Train or Eval output).
+
+    Args:
+      loss: dict of Tensors or single Tensor representing calculated loss.
+      predictions: dict of Tensors or single Tensor representing model
+        predictions.
+      metrics: dict of (metric_value, update_op) tuples, or a single tuple.
+        metric_value must be a Tensor, and update_op must be a Tensor or Op.
+
+    Raises:
+      ValueError: if any of the outputs' dict keys are not strings or tuples of
+        strings or the values are not Tensors (or Operations in the case of
+        update_op).
+    """
+
+    if loss is not None:
+      loss_dict = self._wrap_and_check_outputs(loss, self.LOSS_NAME)
+      self._loss = self._prefix_output_keys(loss_dict, self.LOSS_NAME)
+    if predictions is not None:
+      pred_dict = self._wrap_and_check_outputs(
+          predictions, self.PREDICTIONS_NAME)
+      self._predictions = self._prefix_output_keys(
+          pred_dict, self.PREDICTIONS_NAME)
+    if metrics is not None:
+      self._metrics = self._wrap_and_check_metrics(metrics)
+
+  def _prefix_output_keys(self, output_dict, output_name):
+    """Prepend output_name to the output_dict keys if it doesn't exist.
+
+    This produces predictable prefixes for the pre-determined outputs
+    of SupervisedOutput.
+
+    Args:
+      output_dict: dict of string to Tensor, assumed valid.
+      output_name: prefix string to prepend to existing keys.
+
+    Returns:
+      dict with updated keys and existing values.
+    """
+
+    new_outputs = {}
+    for key, val in output_dict.items():
+      key = self._prefix_key(key, output_name)
+      new_outputs[key] = val
+    return new_outputs
+
+  def _prefix_key(self, key, output_name):
+    if key.find(output_name) != 0:
+      key = output_name + self._SEPARATOR_CHAR + key
+    return key
+
+  def _wrap_and_check_metrics(self, metrics):
+    """Handle the saving of metrics.
+
+    Metrics is either a tuple of (value, update_op), or a dict of such tuples.
+    Here, we separate out the tuples and create a dict with names to tensors.
+
+    Args:
+      metrics: dict of (metric_value, update_op) tuples, or a single tuple.
+
+    Returns:
+      dict of output_names to tensors
+
+    Raises:
+      ValueError: if the dict key is not a string, or the metric values or ops
+        are not tensors.
+    """
+    if not isinstance(metrics, dict):
+      metrics = {self.METRICS_NAME: metrics}
+
+    outputs = {}
+    for key, (metric_val, metric_op) in metrics.items():
+      key = self._check_output_key(key, self.METRICS_NAME)
+      key = self._prefix_key(key, self.METRICS_NAME)
+
+      val_name = key + self._SEPARATOR_CHAR + self.METRIC_VALUE_SUFFIX
+      op_name = key + self._SEPARATOR_CHAR + self.METRIC_UPDATE_SUFFIX
+      if not isinstance(metric_val, ops.Tensor):
+        raise ValueError(
+            '{} output value must be a Tensor; got {}.'.format(
+                key, metric_val))
+      if (not isinstance(metric_op, ops.Tensor) and
+          not isinstance(metric_op, ops.Operation)):
+        raise ValueError(
+            '{} update_op must be a Tensor or Operation; got {}.'.format(
+                key, metric_op))
+      outputs[val_name] = metric_val
+      outputs[op_name] = metric_op
+
+    return outputs
+
+  @property
+  def loss(self):
+    return self._loss
+
+  @property
+  def predictions(self):
+    return self._predictions
+
+  @property
+  def metrics(self):
+    return self._metrics
+
+  @abc.abstractmethod
+  def _get_signature_def_fn(self):
+    """Returns a function that produces a SignatureDef given desired outputs."""
+    pass
+
+  def as_signature_def(self, receiver_tensors):
+    signature_def_fn = self._get_signature_def_fn()
+    return signature_def_fn(
+        receiver_tensors, self.loss, self.predictions, self.metrics)
+
+
+class TrainOutput(_SupervisedOutput):
+  """Represents the output of a supervised training process.
+
+  This class generates the appropriate signature def for exporting
+  training output by type-checking and wrapping loss, predictions, and metrics
+  values.
+  """
+
+  def _get_signature_def_fn(self):
+    return signature_def_utils.supervised_train_signature_def
+
+
+class EvalOutput(_SupervisedOutput):
+  """Represents the output of a supervised eval process.
+
+  This class generates the appropriate signature def for exporting
+  eval output by type-checking and wrapping loss, predictions, and metrics
+  values.
+  """
+
+  def _get_signature_def_fn(self):
+    return signature_def_utils.supervised_eval_signature_def
+
+
+
+
diff --git a/tensorflow/python/estimator/export/export_output_test.py b/tensorflow/python/estimator/export/export_output_test.py
index 7090e53d807817db7d66ed0ee1307d7e38e9e87e..b21ba91b0fbb7e14df5eb74dbabace57d3596cc9 100644
--- a/tensorflow/python/estimator/export/export_output_test.py
+++ b/tensorflow/python/estimator/export/export_output_test.py
@@ -225,5 +225,115 @@ class ExportOutputTest(test.TestCase):
       })
 
 
+class MockSupervisedOutput(export_output_lib._SupervisedOutput):
+  """So that we can test the abstract class methods directly."""
+
+  def _get_signature_def_fn(self):
+    pass
+
+
+class SupervisedOutputTest(test.TestCase):
+
+  def test_supervised_outputs_valid(self):
+    """Tests that no errors are raised when provided outputs are valid."""
+    loss = {"my_loss": constant_op.constant([0])}
+    predictions = {u"output1": constant_op.constant(["foo"])}
+    metrics = {"metrics": (constant_op.constant([0]),
+                           constant_op.constant([10])),
+               "metrics2": (constant_op.constant([0]),
+                            constant_op.constant([10]))}
+
+    outputter = MockSupervisedOutput(loss, predictions, metrics)
+    self.assertEqual(outputter.loss["loss/my_loss"], loss["my_loss"])
+    self.assertEqual(
+        outputter.predictions["predictions/output1"], predictions["output1"])
+    self.assertEqual(outputter.metrics["metrics/value"], metrics["metrics"][0])
+    self.assertEqual(
+        outputter.metrics["metrics2/update_op"], metrics["metrics2"][1])
+
+    # Single Tensor is OK too
+    outputter = MockSupervisedOutput(
+        loss["my_loss"], predictions["output1"], metrics["metrics"])
+    self.assertEqual(outputter.loss, {"loss": loss["my_loss"]})
+    self.assertEqual(
+        outputter.predictions, {"predictions": predictions["output1"]})
+    self.assertEqual(outputter.metrics["metrics/value"], metrics["metrics"][0])
+
+  def test_supervised_outputs_none(self):
+    outputter = MockSupervisedOutput(
+        constant_op.constant([0]), None, None)
+    self.assertEqual(len(outputter.loss), 1)
+    self.assertEqual(outputter.predictions, None)
+    self.assertEqual(outputter.metrics, None)
+
+  def test_supervised_outputs_invalid(self):
+    with self.assertRaisesRegexp(ValueError, "predictions output value must"):
+      MockSupervisedOutput(constant_op.constant([0]), [3], None)
+    with self.assertRaisesRegexp(ValueError, "loss output value must"):
+      MockSupervisedOutput("str", None, None)
+    with self.assertRaisesRegexp(ValueError, "metrics output value must"):
+      MockSupervisedOutput(None, None, (15.3, 4))
+    with self.assertRaisesRegexp(ValueError, "loss output key must"):
+      MockSupervisedOutput({25: "Tensor"}, None, None)
+
+  def test_supervised_outputs_tuples(self):
+    """Tests that no errors are raised when provided outputs are valid."""
+    loss = {("my", "loss"): constant_op.constant([0])}
+    predictions = {(u"output1", "2"): constant_op.constant(["foo"])}
+    metrics = {("metrics", "twice"): (constant_op.constant([0]),
+                                      constant_op.constant([10]))}
+
+    outputter = MockSupervisedOutput(loss, predictions, metrics)
+    self.assertEqual(set(outputter.loss.keys()), set(["loss/my/loss"]))
+    self.assertEqual(set(outputter.predictions.keys()),
+                     set(["predictions/output1/2"]))
+    self.assertEqual(set(outputter.metrics.keys()),
+                     set(["metrics/twice/value", "metrics/twice/update_op"]))
+
+  def test_supervised_outputs_no_prepend(self):
+    """Tests that no errors are raised when provided outputs are valid."""
+    loss = {"loss": constant_op.constant([0])}
+    predictions = {u"predictions": constant_op.constant(["foo"])}
+    metrics = {u"metrics": (constant_op.constant([0]),
+                            constant_op.constant([10]))}
+
+    outputter = MockSupervisedOutput(loss, predictions, metrics)
+    self.assertEqual(set(outputter.loss.keys()), set(["loss"]))
+    self.assertEqual(set(outputter.predictions.keys()), set(["predictions"]))
+    self.assertEqual(set(outputter.metrics.keys()),
+                     set(["metrics/value", "metrics/update_op"]))
+
+  def test_train_signature_def(self):
+    loss = {"my_loss": constant_op.constant([0])}
+    predictions = {u"output1": constant_op.constant(["foo"])}
+    metrics = {"metrics": (constant_op.constant([0]),
+                           constant_op.constant([10]))}
+
+    outputter = export_output_lib.TrainOutput(loss, predictions, metrics)
+
+    receiver = {u"features": constant_op.constant(100, shape=(100, 2)),
+                "labels": constant_op.constant(100, shape=(100, 1))}
+    sig_def = outputter.as_signature_def(receiver)
+
+    self.assertTrue("loss/my_loss" in sig_def.outputs)
+    self.assertTrue("metrics/value" in sig_def.outputs)
+    self.assertTrue("predictions/output1" in sig_def.outputs)
+    self.assertTrue("features" in sig_def.inputs)
+
+  def test_eval_signature_def(self):
+    loss = {"my_loss": constant_op.constant([0])}
+    predictions = {u"output1": constant_op.constant(["foo"])}
+
+    outputter = export_output_lib.EvalOutput(loss, predictions, None)
+
+    receiver = {u"features": constant_op.constant(100, shape=(100, 2)),
+                "labels": constant_op.constant(100, shape=(100, 1))}
+    sig_def = outputter.as_signature_def(receiver)
+
+    self.assertTrue("loss/my_loss" in sig_def.outputs)
+    self.assertFalse("metrics/value" in sig_def.outputs)
+    self.assertTrue("predictions/output1" in sig_def.outputs)
+    self.assertTrue("features" in sig_def.inputs)
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
index c203be7dacf80043079b35cd8e89fc5b69dcaaa0..0af587f2a850dff3ca2dc744e157ed5fbb329735 100644
--- a/tensorflow/python/estimator/export/export_test.py
+++ b/tensorflow/python/estimator/export/export_test.py
@@ -54,7 +54,7 @@ ops.register_tensor_conversion_function(LabeledTensorMock,
                                         _convert_labeled_tensor_mock_to_tensor)
 
 
-class ExportTest(test_util.TensorFlowTestCase):
+class ServingInputReceiverTest(test_util.TensorFlowTestCase):
 
   def test_serving_input_receiver_constructor(self):
     """Tests that no errors are raised when input is expected."""
@@ -161,6 +161,165 @@ class ExportTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       _ = export.ServingInputReceiver(feature, receiver_tensor)
 
+
+class SupervisedInputReceiverTest(test_util.TensorFlowTestCase):
+
+  def test_input_receiver_constructor(self):
+    """Tests that no errors are raised when input is expected."""
+    features = {
+        "feature0": constant_op.constant([0]),
+        u"feature1": constant_op.constant([1]),
+        "feature2": sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+    }
+    labels = {
+        "classes": constant_op.constant([0] * 100),
+    }
+
+    receiver_tensors = {
+        "example0": array_ops.placeholder(dtypes.string, name="example0"),
+        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
+    }
+    export.SupervisedInputReceiver(features, labels, receiver_tensors)
+
+  def test_input_receiver_raw_values(self):
+    """Tests that no errors are raised when input is expected."""
+    features = {
+        "feature0": constant_op.constant([0]),
+        u"feature1": constant_op.constant([1]),
+        "feature2": sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+    }
+
+    labels = {
+        "classes": constant_op.constant([0] * 100),
+    }
+
+    receiver_tensors = {
+        "example0": array_ops.placeholder(dtypes.string, name="example0"),
+        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
+    }
+    rec = export.SupervisedInputReceiver(
+        features["feature2"], labels, receiver_tensors)
+    self.assertIsInstance(rec.features, sparse_tensor.SparseTensor)
+
+    rec = export.SupervisedInputReceiver(
+        features, labels["classes"], receiver_tensors)
+    self.assertIsInstance(rec.labels, ops.Tensor)
+
+  def test_input_receiver_features_invalid(self):
+    features = constant_op.constant([0] * 100)
+    labels = constant_op.constant([0])
+    receiver_tensors = {
+        "example0": array_ops.placeholder(dtypes.string, name="example0"),
+        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
+    }
+
+    with self.assertRaisesRegexp(ValueError, "features must be defined"):
+      export.SupervisedInputReceiver(
+          features=None,
+          labels=labels,
+          receiver_tensors=receiver_tensors)
+
+    with self.assertRaisesRegexp(ValueError, "feature keys must be strings"):
+      export.SupervisedInputReceiver(
+          features={1: constant_op.constant([1])},
+          labels=labels,
+          receiver_tensors=receiver_tensors)
+
+    with self.assertRaisesRegexp(ValueError, "label keys must be strings"):
+      export.SupervisedInputReceiver(
+          features=features,
+          labels={1: constant_op.constant([1])},
+          receiver_tensors=receiver_tensors)
+
+    with self.assertRaisesRegexp(
+        ValueError, "feature feature1 must be a Tensor or SparseTensor"):
+      export.SupervisedInputReceiver(
+          features={"feature1": [1]},
+          labels=labels,
+          receiver_tensors=receiver_tensors)
+
+    with self.assertRaisesRegexp(
+        ValueError, "feature must be a Tensor or SparseTensor"):
+      export.SupervisedInputReceiver(
+          features=[1],
+          labels=labels,
+          receiver_tensors=receiver_tensors)
+
+    with self.assertRaisesRegexp(
+        ValueError, "label must be a Tensor or SparseTensor"):
+      export.SupervisedInputReceiver(
+          features=features,
+          labels=100,
+          receiver_tensors=receiver_tensors)
+
+  def test_input_receiver_receiver_tensors_invalid(self):
+    features = {
+        "feature0": constant_op.constant([0]),
+        u"feature1": constant_op.constant([1]),
+        "feature2": sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+    }
+    labels = constant_op.constant([0])
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensors must be defined"):
+      export.SupervisedInputReceiver(
+          features=features,
+          labels=labels,
+          receiver_tensors=None)
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensors keys must be strings"):
+      export.SupervisedInputReceiver(
+          features=features,
+          labels=labels,
+          receiver_tensors={
+              1: array_ops.placeholder(dtypes.string, name="example0")})
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensor example1 must be a Tensor"):
+      export.SupervisedInputReceiver(
+          features=features,
+          labels=labels,
+          receiver_tensors={"example1": [1]})
+
+  def test_single_feature_single_receiver(self):
+    feature = constant_op.constant(5)
+    label = constant_op.constant(5)
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    input_receiver = export.SupervisedInputReceiver(
+        feature, label, receiver_tensor)
+
+    # single receiver is automatically named
+    receiver_key, = input_receiver.receiver_tensors.keys()
+    self.assertEqual("input", receiver_key)
+
+  def test_multi_feature_single_receiver(self):
+    features = {"foo": constant_op.constant(5),
+                "bar": constant_op.constant(6)}
+    labels = {"value": constant_op.constant(5)}
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    _ = export.SupervisedInputReceiver(features, labels, receiver_tensor)
+
+  def test_multi_feature_multi_receiver(self):
+    features = {"foo": constant_op.constant(5),
+                "bar": constant_op.constant(6)}
+    labels = {"value": constant_op.constant(5)}
+    receiver_tensors = {"baz": array_ops.placeholder(dtypes.int64),
+                        "qux": array_ops.placeholder(dtypes.float32)}
+    _ = export.SupervisedInputReceiver(features, labels, receiver_tensors)
+
+  def test_feature_labeled_tensor(self):
+    feature = LabeledTensorMock()
+    label = constant_op.constant(5)
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    _ = export.SupervisedInputReceiver(feature, label, receiver_tensor)
+
+
+class ExportTest(test_util.TensorFlowTestCase):
+
   def test_build_parsing_serving_input_receiver_fn(self):
     feature_spec = {"int_feature": parsing_ops.VarLenFeature(dtypes.int64),
                     "float_feature": parsing_ops.VarLenFeature(dtypes.float32)}
@@ -237,6 +396,69 @@ class ExportTest(test_util.TensorFlowTestCase):
           dtypes.int32,
           serving_input_receiver.receiver_tensors["feature_2"].dtype)
 
+  def test_build_raw_supervised_input_receiver_fn(self):
+    features = {"feature_1": constant_op.constant(["hello"]),
+                "feature_2": constant_op.constant([42])}
+    labels = {"foo": constant_op.constant([5]),
+              "bar": constant_op.constant([6])}
+    input_receiver_fn = export.build_raw_supervised_input_receiver_fn(
+        features, labels)
+    with ops.Graph().as_default():
+      input_receiver = input_receiver_fn()
+      self.assertEqual(set(["feature_1", "feature_2"]),
+                       set(input_receiver.features.keys()))
+      self.assertEqual(set(["foo", "bar"]),
+                       set(input_receiver.labels.keys()))
+      self.assertEqual(set(["feature_1", "feature_2", "foo", "bar"]),
+                       set(input_receiver.receiver_tensors.keys()))
+      self.assertEqual(
+          dtypes.string, input_receiver.receiver_tensors["feature_1"].dtype)
+      self.assertEqual(
+          dtypes.int32, input_receiver.receiver_tensors["feature_2"].dtype)
+
+  def test_build_raw_supervised_input_receiver_fn_raw_tensors(self):
+    features = {"feature_1": constant_op.constant(["hello"]),
+                "feature_2": constant_op.constant([42])}
+    labels = {"foo": constant_op.constant([5]),
+              "bar": constant_op.constant([6])}
+    input_receiver_fn1 = export.build_raw_supervised_input_receiver_fn(
+        features["feature_1"], labels)
+    input_receiver_fn2 = export.build_raw_supervised_input_receiver_fn(
+        features["feature_1"], labels["foo"])
+    with ops.Graph().as_default():
+      input_receiver = input_receiver_fn1()
+      self.assertIsInstance(input_receiver.features, ops.Tensor)
+      self.assertEqual(set(["foo", "bar"]),
+                       set(input_receiver.labels.keys()))
+      self.assertEqual(set(["input", "foo", "bar"]),
+                       set(input_receiver.receiver_tensors.keys()))
+
+      input_receiver = input_receiver_fn2()
+      self.assertIsInstance(input_receiver.features, ops.Tensor)
+      self.assertIsInstance(input_receiver.labels, ops.Tensor)
+      self.assertEqual(set(["input", "label"]),
+                       set(input_receiver.receiver_tensors.keys()))
+
+  def test_build_raw_supervised_input_receiver_fn_batch_size(self):
+    features = {"feature_1": constant_op.constant(["hello"]),
+                "feature_2": constant_op.constant([42])}
+    labels = {"foo": constant_op.constant([5]),
+              "bar": constant_op.constant([6])}
+    input_receiver_fn = export.build_raw_supervised_input_receiver_fn(
+        features, labels, default_batch_size=10)
+    with ops.Graph().as_default():
+      input_receiver = input_receiver_fn()
+      self.assertEqual([10], input_receiver.receiver_tensors["feature_1"].shape)
+      self.assertEqual([10], input_receiver.features["feature_1"].shape)
+
+  def test_build_raw_supervised_input_receiver_fn_overlapping_keys(self):
+    features = {"feature_1": constant_op.constant(["hello"]),
+                "feature_2": constant_op.constant([42])}
+    labels = {"feature_1": constant_op.constant([5]),
+              "bar": constant_op.constant([6])}
+    with self.assertRaises(ValueError):
+      export.build_raw_supervised_input_receiver_fn(features, labels)
+
   def test_build_all_signature_defs_without_receiver_alternatives(self):
     receiver_tensor = array_ops.placeholder(dtypes.string)
     output_1 = constant_op.constant([1.])
@@ -404,6 +626,35 @@ class ExportTest(test_util.TensorFlowTestCase):
     self.assertTrue(int(time_1) < int(time_2))
     self.assertTrue(int(time_2) < int(time_3))
 
+  def test_build_all_signature_defs_serving_only(self):
+    receiver_tensor = {"input": array_ops.placeholder(dtypes.string)}
+    output_1 = constant_op.constant([1.])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.PredictOutput(outputs=output_1),
+        "train": export_output.TrainOutput(loss=output_1),
+    }
+
+    signature_defs = export.build_all_signature_defs(
+        receiver_tensor, export_outputs)
+
+    expected_signature_defs = {
+        "serving_default": signature_def_utils.predict_signature_def(
+            receiver_tensor, {"output": output_1})
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+    signature_defs = export.build_all_signature_defs(
+        receiver_tensor, export_outputs, serving_only=False)
+
+    expected_signature_defs.update({
+        "train": signature_def_utils.supervised_train_signature_def(
+            receiver_tensor, loss={"loss": output_1})
+    })
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
 
 class TensorServingReceiverTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py
index a3f04626d1e5ed7ca7fb09a5dcc2457a0cf5ab82..b775b19127c7fd8ee02142263791f25cd4b1f84f 100644
--- a/tensorflow/python/estimator/exporter.py
+++ b/tensorflow/python/estimator/exporter.py
@@ -22,9 +22,12 @@ import abc
 import os
 
 from tensorflow.python.estimator import gc
+from tensorflow.python.estimator import util
+from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.summary import summary_iterator
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -125,6 +128,244 @@ class _SavedModelExporter(Exporter):
     return export_result
 
 
+def _loss_smaller(best_eval_result, current_eval_result):
+  """Compares two evaluation results and returns true if the 2nd one is smaller.
+
+  Both evaluation results should have the values for MetricKeys.LOSS, which are
+  used for comparison.
+
+  Args:
+    best_eval_result: best eval metrics.
+    current_eval_result: current eval metrics.
+
+  Returns:
+    True if the loss of current_eval_result is smaller; otherwise, False.
+
+  Raises:
+    ValueError: If input eval result is None or no loss is available.
+  """
+  default_key = metric_keys.MetricKeys.LOSS
+  if not best_eval_result or default_key not in best_eval_result:
+    raise ValueError(
+        'best_eval_result cannot be empty or no loss is found in it.')
+
+  if not current_eval_result or default_key not in current_eval_result:
+    raise ValueError(
+        'current_eval_result cannot be empty or no loss is found in it.')
+
+  return best_eval_result[default_key] > current_eval_result[default_key]
+
+
+def _verify_compare_fn_args(compare_fn):
+  """Verifies compare_fn arguments."""
+  args = set(util.fn_args(compare_fn))
+  if 'best_eval_result' not in args:
+    raise ValueError(
+        'compare_fn (%s) must include best_eval_result argument.' % compare_fn)
+  if 'current_eval_result' not in args:
+    raise ValueError(
+        'compare_fn (%s) must include current_eval_result argument.' %
+        compare_fn)
+  non_valid_args = list(args - set(['best_eval_result', 'current_eval_result']))
+  if non_valid_args:
+    raise ValueError('compare_fn (%s) has following not expected args: %s' %
+                     (compare_fn, non_valid_args))
+
+
+@tf_export('estimator.BestExporter')
+class BestExporter(Exporter):
+  """This class exports the serving graph and checkpoints of the best models.
+
+  This class performs a model export everytime when the new model is better
+  than any exsiting model.
+  """
+
+  def __init__(self,
+               name='best_exporter',
+               serving_input_receiver_fn=None,
+               event_file_pattern='eval/*.tfevents.*',
+               compare_fn=_loss_smaller,
+               assets_extra=None,
+               as_text=False,
+               exports_to_keep=5):
+    """Create an `Exporter` to use with `tf.estimator.EvalSpec`.
+
+    Example of creating a BestExporter for training and evluation:
+    ```python
+    def make_train_and_eval_fn():
+      # Set up feature columns.
+      categorial_feature_a = (
+          tf.feature_column.categorical_column_with_hash_bucket(...))
+      categorial_feature_a_emb = embedding_column(
+          categorical_column=categorial_feature_a, ...)
+      ...  # other feature columns
+
+      estimator = tf.estimator.DNNClassifier(
+          config=tf.estimator.RunConfig(
+              model_dir='/my_model', save_summary_steps=100),
+          feature_columns=[categorial_feature_a_emb, ...],
+          hidden_units=[1024, 512, 256])
+
+      serving_feature_spec = tf.feature_column.make_parse_example_spec(
+          categorial_feature_a_emb)
+      serving_input_receiver_fn = (
+          tf.estimator.export.build_parsing_serving_input_receiver_fn(
+          serving_feature_spec))
+
+      exporter = tf.estimator.BestExporter(
+          name="best_exporter",
+          serving_input_receiver_fn=serving_input_receiver_fn,
+          exports_to_keep=5)
+
+      train_spec = tf.estimator.TrainSpec(...)
+
+      eval_spec = [tf.estimator.EvalSpec(
+        input_fn=eval_input_fn,
+        steps=100,
+        exporters=exporter,
+        start_delay_secs=0,
+        throttle_secs=5)]
+
+      return tf.estimator.DistributedTrainingSpec(estimator, train_spec,
+                                                  eval_spec)
+    ```
+
+    Args:
+      name: unique name of this `Exporter` that is going to be used in the
+        export path.
+      serving_input_receiver_fn: a function that takes no arguments and returns
+        a `ServingInputReceiver`.
+      event_file_pattern: event file name pattern relative to model_dir. If
+        None, however, the exporter would not be preemption-safe. To be
+        preemption-safe, event_file_pattern should be specified.
+      compare_fn: a function that compares two evaluation results and returns
+        true if current evaluation result is better. Follows the signature:
+        * Args:
+          * `best_eval_result`: This is the evaluation result of the best model.
+          * `current_eval_result`: This is the evaluation result of current
+                 candidate model.
+        * Returns:
+          True if current evaluation result is better; otherwise, False.
+      assets_extra: An optional dict specifying how to populate the assets.extra
+        directory within the exported SavedModel.  Each key should give the
+        destination path (including the filename) relative to the assets.extra
+        directory.  The corresponding value gives the full path of the source
+        file to be copied.  For example, the simple case of copying a single
+        file without renaming it is specified as `{'my_asset_file.txt':
+        '/path/to/my_asset_file.txt'}`.
+      as_text: whether to write the SavedModel proto in text format. Defaults to
+        `False`.
+      exports_to_keep: Number of exports to keep.  Older exports will be
+        garbage-collected.  Defaults to 5.  Set to `None` to disable garbage
+        collection.
+
+    Raises:
+      ValueError: if any arguments is invalid.
+    """
+    self._compare_fn = compare_fn
+    if self._compare_fn is None:
+      raise ValueError('`compare_fn` must not be None.')
+    _verify_compare_fn_args(self._compare_fn)
+
+    self._saved_model_exporter = _SavedModelExporter(
+        name, serving_input_receiver_fn, assets_extra, as_text)
+
+    self._event_file_pattern = event_file_pattern
+    self._model_dir = None
+    self._best_eval_result = None
+
+    self._exports_to_keep = exports_to_keep
+    if exports_to_keep is not None and exports_to_keep <= 0:
+      raise ValueError(
+          '`exports_to_keep`, if provided, must be positive number')
+
+  @property
+  def name(self):
+    return self._saved_model_exporter.name
+
+  def export(self, estimator, export_path, checkpoint_path, eval_result,
+             is_the_final_export):
+    export_result = None
+
+    if self._model_dir != estimator.model_dir() and self._event_file_pattern:
+      # Loads best metric from event files.
+      tf_logging.info('Loading best metric from event files.')
+
+      self._model_dir = estimator.model_dir()
+      full_event_file_pattern = os.path.join(self._model_dir,
+                                             self._event_file_pattern)
+      self._best_eval_result = self._get_best_eval_result(
+          full_event_file_pattern)
+
+    if self._best_eval_result is None or self._compare_fn(
+        best_eval_result=self._best_eval_result,
+        current_eval_result=eval_result):
+      tf_logging.info('Performing best model export.')
+      self._best_eval_result = eval_result
+      export_result = self._saved_model_exporter.export(
+          estimator, export_path, checkpoint_path, eval_result,
+          is_the_final_export)
+      self._garbage_collect_exports(export_path)
+
+    return export_result
+
+  def _garbage_collect_exports(self, export_dir_base):
+    """Deletes older exports, retaining only a given number of the most recent.
+
+    Export subdirectories are assumed to be named with monotonically increasing
+    integers; the most recent are taken to be those with the largest values.
+
+    Args:
+      export_dir_base: the base directory under which each export is in a
+        versioned subdirectory.
+    """
+    if self._exports_to_keep is None:
+      return
+
+    def _export_version_parser(path):
+      # create a simple parser that pulls the export_version from the directory.
+      filename = os.path.basename(path.path)
+      if not (len(filename) == 10 and filename.isdigit()):
+        return None
+      return path._replace(export_version=int(filename))
+
+    # pylint: disable=protected-access
+    keep_filter = gc._largest_export_versions(self._exports_to_keep)
+    delete_filter = gc._negation(keep_filter)
+    for p in delete_filter(
+        gc._get_paths(export_dir_base, parser=_export_version_parser)):
+      try:
+        gfile.DeleteRecursively(p.path)
+      except errors_impl.NotFoundError as e:
+        tf_logging.warn('Can not delete %s recursively: %s', p.path, e)
+    # pylint: enable=protected-access
+
+  def _get_best_eval_result(self, event_files):
+    """Get the best eval result from event files.
+
+    Args:
+      event_files: Absolute pattern of event files.
+
+    Returns:
+      The best eval result.
+    """
+    if not event_files:
+      return None
+
+    best_eval_result = None
+    for event_file in gfile.Glob(os.path.join(event_files)):
+      for event in summary_iterator.summary_iterator(event_file):
+        if event.HasField('summary'):
+          event_eval_result = {}
+          for value in event.summary.value:
+            if value.HasField('simple_value'):
+              event_eval_result[value.tag] = value.simple_value
+          if best_eval_result is None or self._compare_fn(
+              best_eval_result, event_eval_result):
+            best_eval_result = event_eval_result
+    return best_eval_result
+
+
 @tf_export('estimator.FinalExporter')
 class FinalExporter(Exporter):
   """This class exports the serving graph and checkpoints in the end.
@@ -157,9 +398,8 @@ class FinalExporter(Exporter):
     Raises:
       ValueError: if any arguments is invalid.
     """
-    self._saved_model_exporter = _SavedModelExporter(name,
-                                                     serving_input_receiver_fn,
-                                                     assets_extra, as_text)
+    self._saved_model_exporter = _SavedModelExporter(
+        name, serving_input_receiver_fn, assets_extra, as_text)
 
   @property
   def name(self):
@@ -213,9 +453,8 @@ class LatestExporter(Exporter):
     Raises:
       ValueError: if any arguments is invalid.
     """
-    self._saved_model_exporter = _SavedModelExporter(name,
-                                                     serving_input_receiver_fn,
-                                                     assets_extra, as_text)
+    self._saved_model_exporter = _SavedModelExporter(
+        name, serving_input_receiver_fn, assets_extra, as_text)
     self._exports_to_keep = exports_to_keep
     if exports_to_keep is not None and exports_to_keep <= 0:
       raise ValueError(
diff --git a/tensorflow/python/estimator/exporter_test.py b/tensorflow/python/estimator/exporter_test.py
index 70b5612804b2d91d66482d98ae080f42dfa17455..053c5490711cf538b4b83599b7938a43d8eaec34 100644
--- a/tensorflow/python/estimator/exporter_test.py
+++ b/tensorflow/python/estimator/exporter_test.py
@@ -30,6 +30,159 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 
 
+class BestExporterTest(test.TestCase):
+
+  def test_error_out_if_exports_to_keep_is_zero(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    with self.assertRaisesRegexp(ValueError, "positive number"):
+      exporter = exporter_lib.BestExporter(
+          name="best_exporter",
+          serving_input_receiver_fn=_serving_input_receiver_fn,
+          exports_to_keep=0)
+      self.assertEqual("best_exporter", exporter.name)
+
+  def test_best_exporter(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    exporter = exporter_lib.BestExporter(
+        name="best_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        exports_to_keep=5)
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.export_savedmodel.return_value = "export_result_path"
+    estimator.model_dir.return_value = export_dir_base
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {}, False)
+
+    self.assertEqual("export_result_path", export_result)
+    estimator.export_savedmodel.assert_called_with(
+        export_dir_base,
+        _serving_input_receiver_fn,
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        checkpoint_path="checkpoint_path",
+        strip_default_attrs=True)
+
+  def test_best_export_is_saved(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    exporter = exporter_lib.BestExporter(
+        name="best_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        exports_to_keep=1)
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.export_savedmodel.return_value = "export_result_path"
+    estimator.model_dir.return_value = export_dir_base
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"loss": 0.5}, False)
+
+    self.assertTrue(estimator.export_savedmodel.called)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"loss": 0.6}, False)
+    self.assertEqual(None, export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"loss": 0.4}, False)
+    self.assertEqual("export_result_path", export_result)
+
+  def test_best_exporter_with_preemption(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    eval_dir_base = os.path.join(export_dir_base, "eval_continuous")
+    estimator_lib._write_dict_to_summary(eval_dir_base, {"loss": 50}, 1)
+    estimator_lib._write_dict_to_summary(eval_dir_base, {"loss": 60}, 2)
+
+    exporter = exporter_lib.BestExporter(
+        name="best_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        event_file_pattern="eval_continuous/*.tfevents.*",
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        exports_to_keep=1)
+
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.model_dir.return_value = export_dir_base
+    estimator.export_savedmodel.return_value = "export_result_path"
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"loss": 100}, False)
+    self.assertEqual(None, export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"loss": 10}, False)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"loss": 20}, False)
+    self.assertEqual(None, export_result)
+
+  def test_garbage_collect_exports(self):
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    export_dir_1 = _create_test_export_dir(export_dir_base)
+    export_dir_2 = _create_test_export_dir(export_dir_base)
+    export_dir_3 = _create_test_export_dir(export_dir_base)
+    export_dir_4 = _create_test_export_dir(export_dir_base)
+
+    self.assertTrue(gfile.Exists(export_dir_1))
+    self.assertTrue(gfile.Exists(export_dir_2))
+    self.assertTrue(gfile.Exists(export_dir_3))
+    self.assertTrue(gfile.Exists(export_dir_4))
+
+    def _serving_input_receiver_fn():
+      return array_ops.constant([1]), None
+
+    exporter = exporter_lib.BestExporter(
+        name="best_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        exports_to_keep=2)
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.model_dir.return_value = export_dir_base
+    # Garbage collect all but the most recent 2 exports,
+    # where recency is determined based on the timestamp directory names.
+    exporter.export(estimator, export_dir_base, None, None, False)
+
+    self.assertFalse(gfile.Exists(export_dir_1))
+    self.assertFalse(gfile.Exists(export_dir_2))
+    self.assertTrue(gfile.Exists(export_dir_3))
+    self.assertTrue(gfile.Exists(export_dir_4))
+
+
 class LatestExporterTest(test.TestCase):
 
   def test_error_out_if_exports_to_keep_is_zero(self):
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index a6f471291008e3c27dea1aeea5865e334f76e5c8..eefc7c712d79d8d02632ccb928f7ab4af02b2596 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -136,11 +136,13 @@ def numpy_input_fn(x,
       values in `x` have same shape).
     ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict.
     ValueError: if x or y is an empty dict.
-    TypeError: `x` is not a dict or array, or if `shuffle` is not bool.
+    TypeError: `x` is not a dict or array.
+    ValueError: if 'shuffle' is not provided or a bool.
   """
   if not isinstance(shuffle, bool):
-    raise TypeError('shuffle must be explicitly set as boolean; '
-                    'got {}'.format(shuffle))
+    raise ValueError('shuffle must be provided and explicitly set as boolean '
+                     '(it is recommended to set it as True for training); '
+                     'got {}'.format(shuffle))
 
   def input_fn():
     """Numpy input function."""
diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py
index 92d057e25da785cf5ee310ca1c80f67a5fbdb43a..81b201cc5c5f3d6b8211030d17006f89a545793e 100644
--- a/tensorflow/python/estimator/inputs/numpy_io_test.py
+++ b/tensorflow/python/estimator/inputs/numpy_io_test.py
@@ -286,8 +286,9 @@ class NumpyIoTest(test.TestCase):
     x = np.arange(32, 36)
     y = np.arange(4)
     with self.test_session():
-      with self.assertRaisesRegexp(TypeError,
-                                   'shuffle must be explicitly set as boolean'):
+      with self.assertRaisesRegexp(ValueError,
+                                   'shuffle must be provided and explicitly '
+                                   'set as boolean'):
         # Default shuffle is None.
         numpy_io.numpy_input_fn(x, y)
 
diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py
index bd06843021f47f81fc0c22d0fcee43530dc10098..1ed6ed4d846a47d70a72c1363567ce918bb007a6 100644
--- a/tensorflow/python/estimator/inputs/pandas_io.py
+++ b/tensorflow/python/estimator/inputs/pandas_io.py
@@ -68,15 +68,16 @@ def pandas_input_fn(x,
   Raises:
     ValueError: if `x` already contains a column with the same name as `y`, or
       if the indexes of `x` and `y` don't match.
-    TypeError: `shuffle` is not bool.
+    ValueError: if 'shuffle' is not provided or a bool.
   """
   if not HAS_PANDAS:
     raise TypeError(
         'pandas_input_fn should not be called without pandas installed')
 
   if not isinstance(shuffle, bool):
-    raise TypeError('shuffle must be explicitly set as boolean; '
-                    'got {}'.format(shuffle))
+    raise ValueError('shuffle must be provided and explicitly set as boolean '
+                     '(it is recommended to set it as True for training); '
+                     'got {}'.format(shuffle))
 
   x = x.copy()
   if y is not None:
diff --git a/tensorflow/python/estimator/inputs/pandas_io_test.py b/tensorflow/python/estimator/inputs/pandas_io_test.py
index e5912a3b28e78c6fc9d8b259a81b2575e6868c6f..dcecf6dd61c4d24a36b2be8f054c066050d088fc 100644
--- a/tensorflow/python/estimator/inputs/pandas_io_test.py
+++ b/tensorflow/python/estimator/inputs/pandas_io_test.py
@@ -70,8 +70,9 @@ class PandasIoTest(test.TestCase):
       return
     x, _ = self.makeTestDataFrame()
     y_noindex = pd.Series(np.arange(-32, -28))
-    with self.assertRaisesRegexp(TypeError,
-                                 'shuffle must be explicitly set as boolean'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'shuffle must be provided and explicitly '
+                                 'set as boolean'):
       # Default shuffle is None
       pandas_io.pandas_input_fn(x, y_noindex)
 
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index 8e2ec83020abc5193309303d0cdd56bd07ef3b5e..51a61adb216c9b019aa01bb7e55c71a8464c01b3 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -250,7 +250,7 @@ class _PandasFeedFn(object):
                num_epochs=None):
     if len(placeholders) != len(dataframe.columns) + 1:
       raise ValueError("Expected {} placeholders; got {}.".format(
-          len(dataframe.columns), len(placeholders)))
+          len(dataframe.columns) + 1, len(placeholders)))
     self._index_placeholder = placeholders[0]
     self._col_placeholders = placeholders[1:]
     self._dataframe = dataframe
diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/estimator/keras.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/estimator.py
rename to tensorflow/python/estimator/keras.py
index 5c79c964c8171ceecd00e9d03245773837394ae0..dd92a0403b9786dae0980992c0cdb7ecb901ea13 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/estimator/keras.py
@@ -30,12 +30,12 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import models
-from tensorflow.python.keras._impl.keras import optimizers
-from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
-from tensorflow.python.keras._impl.keras.engine.network import Network
-from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import models
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.network import Network
+from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
@@ -68,7 +68,7 @@ def _convert_tensor(x):
   return x
 
 
-def _any_variable_initalized():
+def _any_variable_initialized():
   """Check if any variable has been initialized in the Keras model.
 
   Returns:
@@ -493,7 +493,7 @@ def model_to_estimator(keras_model=None,
       keras_model_fn, model_dir=model_dir, config=config)
 
   # Check if we need to call get_weights:
-  if _any_variable_initalized():
+  if _any_variable_initialized():
     keras_weights = keras_model.get_weights()
     # Warn if config passed to estimator tries to update GPUOptions. If a
     # session has already been created, the GPUOptions passed to the first
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/estimator/keras_test.py
similarity index 94%
rename from tensorflow/python/keras/_impl/keras/estimator_test.py
rename to tensorflow/python/estimator/keras_test.py
index 80fa87d0410871c30b8a7c46e7ff02bc81c96f3b..6688a841300f04416e4099f3abc9a03858ac245e 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/estimator/keras_test.py
@@ -25,15 +25,16 @@ import tempfile
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python import keras
+from tensorflow.python.estimator import keras as keras_lib
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import testing_utils
-from tensorflow.python.keras._impl.keras.applications import mobilenet
-from tensorflow.python.keras._impl.keras.optimizers import SGD
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.applications import mobilenet
+from tensorflow.python.keras.optimizers import SGD
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -192,7 +193,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
           metrics=['mse', keras.metrics.categorical_accuracy])
 
       with self.test_session():
-        est_keras = keras.estimator.model_to_estimator(
+        est_keras = keras_lib.model_to_estimator(
             keras_model=keras_model, config=self._config)
         before_eval_results = est_keras.evaluate(
             input_fn=eval_input_fn, steps=1)
@@ -214,7 +215,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
           metrics=['mse', keras.metrics.categorical_accuracy])
 
       with self.test_session():
-        est_keras = keras.estimator.model_to_estimator(
+        est_keras = keras_lib.model_to_estimator(
             keras_model=keras_model,
             # Also use dict config argument to get test coverage for that line.
             config={
@@ -240,7 +241,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         metrics=['mse', keras.metrics.categorical_accuracy])
 
     with self.test_session():
-      est_keras = keras.estimator.model_to_estimator(
+      est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=self._config)
       est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
       before_eval_results = est_keras.evaluate(
@@ -264,7 +265,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
                                  np.random.random((10, _NUM_CLASS)))
       original_preds = keras_model.predict(np.ones((10,) + _INPUT_SIZE))
 
-      est_keras = keras.estimator.model_to_estimator(
+      est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=self._config)
       est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
       before_eval_results = est_keras.evaluate(
@@ -300,7 +301,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
       keras_eval = keras_model.evaluate(x_test, y_test, batch_size=32)
 
     with self.test_session():
-      keras_est = keras.estimator.model_to_estimator(
+      keras_est = keras_lib.model_to_estimator(
           keras_model=keras_model, config=self._config)
       est_eval = keras_est.evaluate(input_fn=eval_input_fn)
 
@@ -336,7 +337,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
       keras_pred = [np.argmax(y) for y in keras_model.predict(x_test)]
 
     with self.test_session():
-      keras_est = keras.estimator.model_to_estimator(
+      keras_est = keras_lib.model_to_estimator(
           keras_model=keras_model, config=self._config)
       est_pred = [
           np.argmax(y[keras_model.output_names[0]])
@@ -383,7 +384,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
 
     with self.test_session():
       model = multi_inputs_multi_outputs_model()
-      est_keras = keras.estimator.model_to_estimator(
+      est_keras = keras_lib.model_to_estimator(
           keras_model=model, config=self._config)
       before_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
       est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
@@ -409,7 +410,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
       keras.models.save_model(keras_model, fname)
 
     with self.test_session():
-      keras_est = keras.estimator.model_to_estimator(
+      keras_est = keras_lib.model_to_estimator(
           keras_model_path=fname, config=self._config)
       est_pred = [
           np.argmax(y[keras_model.output_names[0]])
@@ -419,24 +420,24 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
 
   def test_keras_model_init_error(self):
     with self.assertRaisesRegexp(ValueError, 'Either'):
-      keras.estimator.model_to_estimator()
+      keras_lib.model_to_estimator()
 
     with self.test_session():
       keras_model = simple_sequential_model()
       with self.assertRaisesRegexp(ValueError, 'not both'):
-        keras.estimator.model_to_estimator(
+        keras_lib.model_to_estimator(
             keras_model=keras_model,
             keras_model_path=tempfile.mkdtemp(dir=self._base_dir))
 
     with self.test_session():
       keras_model = simple_sequential_model()
       with self.assertRaisesRegexp(ValueError, 'compiled'):
-        keras.estimator.model_to_estimator(keras_model=keras_model)
+        keras_lib.model_to_estimator(keras_model=keras_model)
 
     with self.test_session():
       keras_model = simple_sequential_model()
       with self.assertRaisesRegexp(ValueError, 'not a local path'):
-        keras.estimator.model_to_estimator(
+        keras_lib.model_to_estimator(
             keras_model_path='gs://bucket/object')
 
   def test_invalid_ionames_error(self):
@@ -460,7 +461,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     model.compile(
         loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
     with self.test_session():
-      est_keras = keras.estimator.model_to_estimator(
+      est_keras = keras_lib.model_to_estimator(
           keras_model=model, config=self._config)
 
     with self.test_session():
@@ -479,12 +480,12 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     }
     with self.assertRaisesRegexp(ValueError, 'relu6'):
       with self.test_session():
-        keras.estimator.model_to_estimator(
+        keras_lib.model_to_estimator(
             keras_model=keras_mobile,
             model_dir=tempfile.mkdtemp(dir=self._base_dir))
 
     with self.test_session():
-      keras.estimator.model_to_estimator(
+      keras_lib.model_to_estimator(
           keras_model=keras_mobile,
           model_dir=tempfile.mkdtemp(dir=self._base_dir),
           custom_objects=custom_objects)
@@ -509,7 +510,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     })
     with test.mock.patch.dict('os.environ', {'TF_CONFIG': tf_config}):
       with self.test_session():
-        keras.estimator.model_to_estimator(
+        keras_lib.model_to_estimator(
             keras_model=keras_model,
             model_dir=tempfile.mkdtemp(dir=self._base_dir))
 
@@ -524,7 +525,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
       gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
       sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
       self._config._session_config = sess_config
-      keras.estimator.model_to_estimator(
+      keras_lib.model_to_estimator(
           keras_model=keras_model, config=self._config)
       self.assertEqual(
           keras.backend.get_session()
@@ -548,7 +549,7 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
           loss='categorical_crossentropy',
           optimizer=SGD(lr=0.0001, momentum=0.9),
           metrics=['mse', keras.metrics.categorical_accuracy])
-      keras.estimator.model_to_estimator(
+      keras_lib.model_to_estimator(
           keras_model=keras_model, config=self._config)
 
 
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index 8111ab564c017175b3f7bc1020d850db74587958..3edf9fe940b19c7a0b1a7c21a9674189faba5acb 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import nest
@@ -53,6 +54,13 @@ class ModeKeys(object):
 LOSS_METRIC_KEY = 'loss'
 AVERAGE_LOSS_METRIC_KEY = 'average_loss'
 
+# Mapping of the modes to appropriate tag_constants that are used for saving.
+EXPORT_TAG_MAP = {
+    ModeKeys.PREDICT: [tag_constants.SERVING],
+    ModeKeys.TRAIN: [tag_constants.TRAINING],
+    ModeKeys.EVAL: [tag_constants.EVAL],
+}
+
 
 @tf_export('estimator.EstimatorSpec')
 class EstimatorSpec(
@@ -326,6 +334,57 @@ class EstimatorSpec(
     return EstimatorSpec(*new_fields)
 
 
+class _TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
+    'mode',
+    'predictions',
+    'loss',
+    'train_op',
+    'eval_metrics',
+    'export_outputs',
+    'scaffold_fn',
+    'host_call'])):
+  """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
+
+  This is a simplified implementation of `tf.contrib.tpu.EstimatorSpec`. See
+  tensorflow/contrib/tpu/python/tpu/tpu_estimator.py for more detailed
+  documentation.
+  """
+
+  def __new__(cls,
+              mode,
+              predictions=None,
+              loss=None,
+              train_op=None,
+              eval_metrics=None,
+              export_outputs=None,
+              scaffold_fn=None,
+              host_call=None):
+    """Creates a `_TPUEstimatorSpec` instance."""
+    return super(_TPUEstimatorSpec, cls).__new__(cls,
+                                                 mode=mode,
+                                                 predictions=predictions,
+                                                 loss=loss,
+                                                 train_op=train_op,
+                                                 eval_metrics=eval_metrics,
+                                                 export_outputs=export_outputs,
+                                                 scaffold_fn=scaffold_fn,
+                                                 host_call=host_call)
+
+  def as_estimator_spec(self):
+    """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
+    if not self.eval_metrics:
+      eval_metric_ops = None
+    else:
+      metric_fn, tensors = self.eval_metrics
+      eval_metric_ops = metric_fn(**tensors)
+    return EstimatorSpec(mode=self.mode,
+                         predictions=self.predictions,
+                         loss=self.loss,
+                         train_op=self.train_op,
+                         eval_metric_ops=eval_metric_ops,
+                         export_outputs=self.export_outputs)
+
+
 def _check_is_tensor_or_operation(x, name):
   if not (isinstance(x, ops.Operation) or isinstance(x, ops.Tensor)):
     raise TypeError('{} must be Operation or Tensor, given: {}'.format(name, x))
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 8162b249f1f0be6c901f09ff21f9a67f7c5e492f..c7707be8397d950f4e5993b678c215128d3d8b9f 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -27,8 +27,8 @@ import six
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
-from tensorflow.python.estimator import util
 from tensorflow.python.util import compat_internal
+from tensorflow.python.util import function_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -283,7 +283,7 @@ def _validate_properties(run_config):
             message='tf_random_seed must be integer.')
 
   _validate('device_fn', lambda device_fn: six.callable(device_fn) and
-            set(util.fn_args(device_fn)) == _VALID_DEVICE_FN_ARGS,
+            set(function_utils.fn_args(device_fn)) == _VALID_DEVICE_FN_ARGS,
             message='device_fn must be callable with exactly'
                     ' one argument "op".')
 
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index 994115c9eaa4b69a1f4336a6a52922dacee64361..a187b12b24c4e588e3aef8624a09a092a4b62b95 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -424,6 +424,11 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
     eval_spec: A `EvalSpec` instance to specify the evaluation and export
       specification.
 
+  Returns:
+    A tuple of the result of the `evaluate` call to the `Estimator` and the
+    export results using the specified `ExportStrategy`.
+    Currently, the return value is undefined for distributed training mode.
+
   Raises:
     ValueError: if environment variable `TF_CONFIG` is incorrectly set.
   """
@@ -510,6 +515,11 @@ class _TrainingExecutor(object):
     procedure is `run_foo'. This `run` method invoke the procedure base on the
     `RunConfig.task_type`.
 
+    Returns:
+      A tuple of the result of the `evaluate` call to the `Estimator` and the
+      export results using the specified `ExportStrategy`.
+      Currently undefined for distributed training mode.
+
     Raises:
       ValueError: if the estimator.config is mis-configured.
     """
@@ -518,8 +528,7 @@ class _TrainingExecutor(object):
     if (not config.cluster_spec and
         config.task_type != run_config_lib.TaskType.EVALUATOR):
       logging.info('Running training and evaluation locally (non-distributed).')
-      self.run_local()
-      return
+      return self.run_local()
 
     # Distributed case.
     if not config.task_type:
@@ -650,6 +659,9 @@ class _TrainingExecutor(object):
     evaluator = _TrainingExecutor._Evaluator(self._estimator, self._eval_spec,
                                              self._train_spec.max_steps)
 
+    eval_result = _EvalResult(status=_EvalStatus.MISSING_CHECKPOINT)
+    export_results = []
+
     while True:
       self._estimator.train(
           input_fn=self._train_spec.input_fn,
@@ -666,7 +678,7 @@ class _TrainingExecutor(object):
       # _should_stop_local_train will then end the while True as the stopping
       # condition is satisfied (both checks use the same global_step value,
       # i.e., no race condition)
-      eval_result = evaluator.evaluate_and_export()
+      eval_result, export_results = evaluator.evaluate_and_export()
 
       if eval_result.status != _EvalStatus.EVALUATED:
         #  This is unexpected; should never happen.
@@ -682,6 +694,7 @@ class _TrainingExecutor(object):
       if _should_stop_local_train(
           eval_result.metrics[ops.GraphKeys.GLOBAL_STEP]):
         break
+    return eval_result.metrics, export_results
 
   def _start_std_server(self, config):
     """Creates, starts, and returns a server_lib.Server."""
@@ -807,7 +820,7 @@ class _TrainingExecutor(object):
     # iteration of while loop will end the continuous eval as the stopping
     # condition is satisfied (both checks use the same global_step value,
     # i.e., no race condition)
-    eval_result = evaluator.evaluate_and_export()
+    eval_result, _ = evaluator.evaluate_and_export()
 
     if not self._continuous_eval_listener.after_eval(eval_result):
       logging.info('Exiting evaluation, as requested by '
@@ -846,7 +859,7 @@ class _TrainingExecutor(object):
       """Evaluate and (maybe) export the current model.
 
       Returns:
-        An `EvalResult` instance.
+        A tuple of `EvalResult` instance and the export results.
 
       Raises:
         RuntimeError: for any unexpected internal error.
@@ -856,14 +869,14 @@ class _TrainingExecutor(object):
       if not latest_ckpt_path:
         self._log_err_msg('Estimator is not trained yet. Will start an '
                           'evaluation when a checkpoint is ready.')
-        return _EvalResult(status=_EvalStatus.MISSING_CHECKPOINT)
+        return _EvalResult(status=_EvalStatus.MISSING_CHECKPOINT), []
 
       if latest_ckpt_path == self._previous_ckpt_path:
         self._log_err_msg(
             'No new checkpoint ready for evaluation. Skip the current '
             'evaluation pass as evaluation results are expected to be same '
             'for the same checkpoint.')
-        return _EvalResult(status=_EvalStatus.NO_NEW_CHECKPOINT)
+        return _EvalResult(status=_EvalStatus.NO_NEW_CHECKPOINT), []
 
       metrics = self._estimator.evaluate(
           input_fn=self._eval_spec.input_fn,
@@ -881,7 +894,8 @@ class _TrainingExecutor(object):
       is_the_final_export = (
           eval_result.metrics[ops.GraphKeys.GLOBAL_STEP] >=
           self._max_training_steps if self._max_training_steps else False)
-      self._export_eval_result(eval_result, is_the_final_export)
+      export_results = self._export_eval_result(eval_result,
+                                                is_the_final_export)
 
       if is_the_final_export:
         logging.debug('Calling exporter with the `is_the_final_export=True`.')
@@ -889,7 +903,7 @@ class _TrainingExecutor(object):
 
       self._last_warning_time = 0
       self._previous_ckpt_path = latest_ckpt_path
-      return eval_result
+      return eval_result, export_results
 
     def _log_err_msg(self, message):
       """Prints warning `message` every 10 mins."""
@@ -904,15 +918,18 @@ class _TrainingExecutor(object):
           compat.as_str_any(self._estimator.model_dir),
           compat.as_str_any('export'))
 
+      export_results = []
       for exporter in self._eval_spec.exporters:
-        exporter.export(
-            estimator=self._estimator,
-            export_path=os.path.join(
-                compat.as_str_any(export_dir_base),
-                compat.as_str_any(exporter.name)),
-            checkpoint_path=eval_result.checkpoint_path,
-            eval_result=eval_result.metrics,
-            is_the_final_export=is_the_final_export)
+        export_results.append(
+            exporter.export(
+                estimator=self._estimator,
+                export_path=os.path.join(
+                    compat.as_str_any(export_dir_base),
+                    compat.as_str_any(exporter.name)),
+                checkpoint_path=eval_result.checkpoint_path,
+                eval_result=eval_result.metrics,
+                is_the_final_export=is_the_final_export))
+      return export_results
 
 
 class _EvalStatus(object):
diff --git a/tensorflow/python/estimator/training_test.py b/tensorflow/python/estimator/training_test.py
index 3b6f5e18cb50d84002dae842d15284cc57c0f972..2c838db7a4de98d941752ce9d5ddf8f2b47a46f1 100644
--- a/tensorflow/python/estimator/training_test.py
+++ b/tensorflow/python/estimator/training_test.py
@@ -1835,6 +1835,7 @@ class TrainingExecutorRunLocalTest(test.TestCase):
     def export(estimator, *args, **kwargs):
       del args, kwargs
       estimator.export_was_called = True
+      return 'path_to_export'
 
     exporter = test.mock.PropertyMock(spec=exporter_lib.Exporter)
     exporter.name = 'see_whether_export_is_called'
@@ -1848,9 +1849,12 @@ class TrainingExecutorRunLocalTest(test.TestCase):
         exporters=exporter)
 
     executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
-    executor.run_local()
+    # pylint: disable=assignment-from-no-return
+    _, export_results = executor.run_local()
+    # pylint: enable=assignment-from-no-return
 
     self.assertTrue(mock_est.export_was_called)
+    self.assertEqual(export_results, ['path_to_export'])
 
   def test_errors_out_if_evaluate_returns_empty_dict(self):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
@@ -1867,7 +1871,6 @@ class TrainingExecutorRunLocalTest(test.TestCase):
     train_spec = training.TrainSpec(input_fn=lambda: 1)
     eval_spec = training.EvalSpec(input_fn=(lambda: 1), throttle_secs=123)
     mock_est.evaluate.return_value = 123
-
     executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
     with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_RESULT_TYPE_ERR):
       executor.run_local()
@@ -1883,6 +1886,21 @@ class TrainingExecutorRunLocalTest(test.TestCase):
                                  _MISSING_GLOBAL_STEP_IN_EVAL_RESULT_ERR):
       executor.run_local()
 
+  def test_train_and_evaluate_return_metrics(self):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, model_dir='path/')
+    mock_est.latest_checkpoint.return_value = 'checkpoint_path/'
+    train_spec = training.TrainSpec(
+        input_fn=lambda: 1, max_steps=300, hooks=[_FakeHook()])
+    eval_spec = training.EvalSpec(
+        input_fn=lambda: 1, steps=2, hooks=[_FakeHook()], name='local_eval')
+    mock_est.evaluate.return_value = {_GLOBAL_STEP_KEY: train_spec.max_steps}
+
+    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
+    # pylint: disable=assignment-from-no-return
+    metrics, _ = executor.run_local()
+    # pylint: enable=assignment-from-no-return
+    self.assertEqual(metrics['global_step'], 300)
+
 
 class TrainAndEvaluateRunTest(test.TestCase):
 
diff --git a/tensorflow/python/estimator/util.py b/tensorflow/python/estimator/util.py
index bb4bdd3fdfb2e19dbc1c581d7771f2e1ac4442ba..e4e1d37f74330c9bfd48adff95e6409793714729 100644
--- a/tensorflow/python/estimator/util.py
+++ b/tensorflow/python/estimator/util.py
@@ -13,55 +13,21 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Utility to retrieve function args."""
+"""Utilities for Estimators."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
 import os
 import time
 
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
-from tensorflow.python.util import tf_decorator
-from tensorflow.python.util import tf_inspect
-
-
-def _is_bounded_method(fn):
-  _, fn = tf_decorator.unwrap(fn)
-  return tf_inspect.ismethod(fn) and (fn.__self__ is not None)
-
-
-def _is_callable_object(obj):
-  return hasattr(obj, '__call__') and tf_inspect.ismethod(obj.__call__)
-
-
-def fn_args(fn):
-  """Get argument names for function-like object.
-
-  Args:
-    fn: Function, or function-like object (e.g., result of `functools.partial`).
-
-  Returns:
-    `tuple` of string argument names.
-
-  Raises:
-    ValueError: if partial function has positionally bound arguments
-  """
-  if isinstance(fn, functools.partial):
-    args = fn_args(fn.func)
-    args = [a for a in args[len(fn.args):] if a not in (fn.keywords or [])]
-  else:
-    if _is_callable_object(fn):
-      fn = fn.__call__
-    args = tf_inspect.getfullargspec(fn).args
-    if _is_bounded_method(fn):
-      args.remove('self')
-  return tuple(args)
+from tensorflow.python.util import function_utils
 
+fn_args = function_utils.fn_args
 
 # When we create a timestamped directory, there is a small chance that the
 # directory already exists because another process is also creating these
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 40386ae7aa6ceabb058b3e0bb12c2295648d1989..ffcb9990d52c2e3ff3bb356e8f8eebd850ea9e20 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -140,7 +140,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras.engine import training
 from tensorflow.python.layers import base
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -1068,6 +1068,7 @@ def numeric_column(key,
     raise TypeError(
         'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
 
+  _assert_key_is_string(key)
   return _NumericColumn(
       key,
       shape=shape,
@@ -1166,6 +1167,13 @@ def _assert_string_or_int(dtype, prefix):
         '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype))
 
 
+def _assert_key_is_string(key):
+  if not isinstance(key, six.string_types):
+    raise ValueError(
+        'key must be a string. Got: type {}. Given key: {}.'.format(
+            type(key), key))
+
+
 @tf_export('feature_column.categorical_column_with_hash_bucket')
 def categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
@@ -1218,6 +1226,7 @@ def categorical_column_with_hash_bucket(key,
                      'hash_bucket_size: {}, key: {}'.format(
                          hash_bucket_size, key))
 
+  _assert_key_is_string(key)
   _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
 
   return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
@@ -1334,6 +1343,7 @@ def categorical_column_with_vocabulary_file(key,
       raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
           num_oov_buckets, key))
   _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+  _assert_key_is_string(key)
   return _VocabularyFileCategoricalColumn(
       key=key,
       vocabulary_file=vocabulary_file,
@@ -1448,6 +1458,7 @@ def categorical_column_with_vocabulary_list(
         'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
             dtype, vocabulary_dtype, key))
   _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+  _assert_key_is_string(key)
 
   return _VocabularyListCategoricalColumn(
       key=key, vocabulary_list=tuple(vocabulary_list), dtype=dtype,
@@ -1518,6 +1529,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
     raise ValueError(
         'default_value {} not in range [0, {}), column_name {}'.format(
             default_value, num_buckets, key))
+  _assert_key_is_string(key)
   return _IdentityCategoricalColumn(
       key=key, num_buckets=num_buckets, default_value=default_value)
 
@@ -3032,7 +3044,7 @@ class _CrossedColumn(
         feature_tensors.append(ids_and_weights.id_tensor)
       else:
         raise ValueError('Unsupported column type. Given: {}'.format(key))
-    return sparse_ops._sparse_cross_hashed(  # pylint: disable=protected-access
+    return sparse_ops.sparse_cross_hashed(
         inputs=feature_tensors,
         num_buckets=self.hash_bucket_size,
         hash_key=self.hash_key)
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index b06540489ff8429d76b1f9e47df6ebbcb2d42110..f9206f4f38d7631ccdb57d41d8c52f9f0edede6d 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -182,6 +182,10 @@ class NumericColumnTest(test.TestCase):
     self.assertEqual(dtypes.float32, a.dtype)
     self.assertIsNone(a.normalizer_fn)
 
+  def test_key_should_be_string(self):
+    with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
+      fc.numeric_column(key=('aaa',))
+
   def test_shape_saved_as_tuple(self):
     a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
     self.assertEqual((1, 2), a.shape)
@@ -645,6 +649,10 @@ class HashedCategoricalColumnTest(test.TestCase):
     self.assertEqual(10, a.hash_bucket_size)
     self.assertEqual(dtypes.string, a.dtype)
 
+  def test_key_should_be_string(self):
+    with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
+      fc.categorical_column_with_hash_bucket(('key',), 10)
+
   def test_bucket_size_should_be_given(self):
     with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be set.'):
       fc.categorical_column_with_hash_bucket('aaa', None)
@@ -1276,7 +1284,6 @@ def get_keras_linear_model_predictions(features,
   return retval
 
 
-@test_util.with_c_api
 class LinearModelTest(test.TestCase):
 
   def test_raises_if_empty_feature_columns(self):
@@ -1552,16 +1559,10 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      if ops._USE_C_API:
-        with self.assertRaisesRegexp(
-            Exception,
-            r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
-          predictions = fc.linear_model(features, [price])
-      else:
-        predictions = fc.linear_model(features, [price])
-        with _initialized_session():
-          with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
-            predictions.eval()
+      with self.assertRaisesRegexp(
+          Exception,
+          r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+        fc.linear_model(features, [price])
 
   def test_dense_reshaping(self):
     price = fc.numeric_column('price', shape=[1, 2])
@@ -1925,7 +1926,6 @@ class LinearModelTest(test.TestCase):
         sess.run(net, feed_dict={features['price']: np.array(1)})
 
 
-@test_util.with_c_api
 class _LinearModelTest(test.TestCase):
 
   def test_raises_if_empty_feature_columns(self):
@@ -2190,16 +2190,10 @@ class _LinearModelTest(test.TestCase):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      if ops._USE_C_API:
-        with self.assertRaisesRegexp(
-            Exception,
-            r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
-          predictions = get_keras_linear_model_predictions(features, [price])
-      else:
-        predictions = get_keras_linear_model_predictions(features, [price])
-        with _initialized_session():
-          with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
-            predictions.eval()
+      with self.assertRaisesRegexp(
+          Exception,
+          r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+        get_keras_linear_model_predictions(features, [price])
 
   def test_dense_reshaping(self):
     price = fc.numeric_column('price', shape=[1, 2])
@@ -2686,7 +2680,6 @@ class InputLayerTest(test.TestCase):
       self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
 
 
-@test_util.with_c_api
 class FunctionalInputLayerTest(test.TestCase):
 
   def test_raises_if_empty_feature_columns(self):
@@ -2751,16 +2744,10 @@ class FunctionalInputLayerTest(test.TestCase):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      if ops._USE_C_API:
-        with self.assertRaisesRegexp(
-            Exception,
-            r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
-          net = fc.input_layer(features, [price])
-      else:
-        net = fc.input_layer(features, [price])
-        with _initialized_session():
-          with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
-            net.eval()
+      with self.assertRaisesRegexp(
+          Exception,
+          r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+        fc.input_layer(features, [price])
 
   def test_reshaping(self):
     price = fc.numeric_column('price', shape=[1, 2])
@@ -3327,6 +3314,11 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.string)
     }, column._parse_example_spec)
 
+  def test_key_should_be_string(self):
+    with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
+      fc.categorical_column_with_vocabulary_file(
+          key=('aaa',), vocabulary_file='path_to_file', vocabulary_size=3)
+
   def test_all_constructor_args(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
@@ -3752,6 +3744,11 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.string)
     }, column._parse_example_spec)
 
+  def test_key_should_be_string(self):
+    with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
+      fc.categorical_column_with_vocabulary_list(
+          key=('aaa',), vocabulary_list=('omar', 'stringer', 'marlo'))
+
   def test_defaults_int(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(12, 24, 36))
@@ -4143,6 +4140,10 @@ class IdentityCategoricalColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, column._parse_example_spec)
 
+  def test_key_should_be_string(self):
+    with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
+      fc.categorical_column_with_identity(key=('aaa',), num_buckets=3)
+
   def test_deep_copy(self):
     original = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     for column in (original, copy.deepcopy(original)):
diff --git a/tensorflow/python/framework/c_api_util.py b/tensorflow/python/framework/c_api_util.py
index 7bbe3183dfa376776dde8412a3270b1684a02391..f68f30a0a966b9e6031f73bd59634b25fbcb6689 100644
--- a/tensorflow/python/framework/c_api_util.py
+++ b/tensorflow/python/framework/c_api_util.py
@@ -19,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.framework import api_def_pb2
+from tensorflow.core.framework import op_def_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.util import compat
 from tensorflow.python.util import tf_contextlib
@@ -89,6 +91,53 @@ class ScopedTFFunction(object):
       c_api.TF_DeleteFunction(self.func)
 
 
+class ApiDefMap(object):
+  """Wrapper around Tf_ApiDefMap that handles querying and deletion.
+
+  The OpDef protos are also stored in this class so that they could
+  be queried by op name.
+  """
+
+  def __init__(self):
+    op_def_proto = op_def_pb2.OpList()
+    buf = c_api.TF_GetAllOpList()
+    try:
+      op_def_proto.ParseFromString(c_api.TF_GetBuffer(buf))
+      self._api_def_map = c_api.TF_NewApiDefMap(buf)
+    finally:
+      c_api.TF_DeleteBuffer(buf)
+
+    self._op_per_name = {}
+    for op in op_def_proto.op:
+      self._op_per_name[op.name] = op
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api is not None and c_api.TF_DeleteApiDefMap is not None:
+      c_api.TF_DeleteApiDefMap(self._api_def_map)
+
+  def put_api_def(self, text):
+    c_api.TF_ApiDefMapPut(self._api_def_map, text, len(text))
+
+  def get_api_def(self, op_name):
+    api_def_proto = api_def_pb2.ApiDef()
+    buf = c_api.TF_ApiDefMapGet(self._api_def_map, op_name, len(op_name))
+    try:
+      api_def_proto.ParseFromString(c_api.TF_GetBuffer(buf))
+    finally:
+      c_api.TF_DeleteBuffer(buf)
+    return api_def_proto
+
+  def get_op_def(self, op_name):
+    if op_name in self._op_per_name:
+      return self._op_per_name[op_name]
+    raise ValueError("No entry found for " + op_name + ".")
+
+  def op_names(self):
+    return self._op_per_name.keys()
+
+
 @tf_contextlib.contextmanager
 def tf_buffer(data=None):
   """Context manager that creates and deletes TF_Buffer.
diff --git a/tensorflow/python/framework/c_api_util_test.py b/tensorflow/python/framework/c_api_util_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..169abf1bb46d2a6d37d79346a6bb5503b347373f
--- /dev/null
+++ b/tensorflow/python/framework/c_api_util_test.py
@@ -0,0 +1,59 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for c_api utils."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+
+
+class ApiDefMapTest(test_util.TensorFlowTestCase):
+
+  def testApiDefMapOpNames(self):
+    api_def_map = c_api_util.ApiDefMap()
+    self.assertIn("Add", api_def_map.op_names())
+
+  def testApiDefMapGet(self):
+    api_def_map = c_api_util.ApiDefMap()
+    op_def = api_def_map.get_op_def("Add")
+    self.assertEqual(op_def.name, "Add")
+    api_def = api_def_map.get_api_def("Add")
+    self.assertEqual(api_def.graph_op_name, "Add")
+
+  def testApiDefMapPutThenGet(self):
+    api_def_map = c_api_util.ApiDefMap()
+    api_def_text = """
+op {
+  graph_op_name: "Add"
+  summary: "Returns x + y element-wise."
+  description: <<END
+*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
+"""
+    api_def_map.put_api_def(api_def_text)
+    api_def = api_def_map.get_api_def("Add")
+    self.assertEqual(api_def.graph_op_name, "Add")
+    self.assertEqual(api_def.summary, "Returns x + y element-wise.")
+
+
+if __name__ == "__main__":
+  googletest.main()
+
diff --git a/tensorflow/python/framework/fast_tensor_util.pyx b/tensorflow/python/framework/fast_tensor_util.pyx
index 19928314efe143572b36324230f8e8f2ae87648d..17d112a1ece9ae3d121b894d9b246d24b46d84e2 100644
--- a/tensorflow/python/framework/fast_tensor_util.pyx
+++ b/tensorflow/python/framework/fast_tensor_util.pyx
@@ -7,6 +7,18 @@ cimport numpy as np
 from tensorflow.python.util import compat
 
 
+def AppendFloat16ArrayToTensorProto(
+    # For numpy, npy_half is a typedef for npy_uint16,
+    # see: https://github.com/numpy/numpy/blob/master/doc/source/reference/c-api.coremath.rst#half-precision-functions
+    # Because np.float16_t dosen't exist in cython, we use uint16_t here.
+    # TODO: Use np.float16_t when cython supports it.
+    tensor_proto, np.ndarray[np.uint16_t, ndim=1] nparray):
+  cdef long i, n
+  n = nparray.size
+  for i in range(n):
+    tensor_proto.half_val.append(nparray[i])
+
+
 def AppendFloat32ArrayToTensorProto(
     tensor_proto, np.ndarray[np.float32_t, ndim=1] nparray):
   cdef long i, n
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index e7f9e590af8421a5dc43bbf68060752461cef195..067522201694d64027ebcc7ea500e4771a96ed92 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -68,9 +68,10 @@ class Defun(object):
   during the first call to the function. Subsequent function calls will refer to
   the same set of variables.
 
-  Definitions of functions are frozen in a graph as soon as the graph is used to
-  create a session. Therefore, nodes using the function must be created in the
-  graph before the corresponding session is created.
+  Definitions of functions in a graph are frozen as soon as the graph is used to
+  create a session. However, new functions and new calls to existing functions
+  may be added to the graph, with the new functions themselves becoming
+  immediately frozen.
 
   Example, but also see the [How To on functions](link_needed).
 
@@ -248,6 +249,9 @@ class _DefinedFunction(object):
     # Constructed only when C API is enabled, lazily
     self._c_func = None
     self._sub_functions = dict()  # Constructed with _definition or _c_func
+    device_stack = ops.get_default_graph()._device_function_stack  # pylint: disable=protected-access
+    # Get the innermost device if possbile.
+    self._caller_device = device_stack[-1] if device_stack else None
 
     # Cached OpDef for this function. When C API is enabled, this is
     # the only part of FunctionDef that we cache in Python. When C API
@@ -255,12 +259,10 @@ class _DefinedFunction(object):
     # another reference to _definition.signature
     self._op_def = None
 
-    self._args = []
     assert isinstance(input_types, (list, tuple))
-    for i in range(len(input_types)):
-      argname = argnames[i] if i < len(argnames) else ("arg%d" % i)
-      argtype = input_types[i]
-      self._args.append((argname, argtype))
+    self._arg_types = input_types
+    self._arg_names = [argnames[i] if i < len(argnames) else ("arg%d" % i)
+                       for i in range(len(input_types))]
 
   @property
   def name(self):
@@ -313,6 +315,16 @@ class _DefinedFunction(object):
     self._create_definition_if_needed()
     return self._extra_inputs
 
+  @property
+  def stateful_ops(self):
+    """Returns the list of stateful ops in function definition.
+
+    Returns:
+      A list of (op.name, op.type) pairs.
+    """
+    self._create_definition_if_needed()
+    return self._stateful_ops
+
   def _create_definition_if_needed(self):
     """Creates the function definition if it's not created yet."""
     with context.graph_mode():
@@ -323,42 +335,11 @@ class _DefinedFunction(object):
     if self._definition is not None or self._c_func is not None:
       return
 
-    # Create the func_def object.
-    temp_graph = _FuncGraph(capture_by_value=self._capture_by_value)
-    with temp_graph.as_default():
-      # List of placeholders for the function_def.
-      inputs = []
-      for (argname, argtype) in self._args:
-        argholder = array_ops.placeholder(argtype, name=argname)
-        inputs.append(argholder)
-      # Call func and gather the output tensors.
-      with vs.variable_scope("", custom_getter=temp_graph.getvar):
-        outputs = self._func(*inputs)
-
-      # There is no way of distinguishing between a function not returning
-      # anything and a function returning None in Python.
-      # We need to allow the former and ideally want to forbid the latter as
-      # it is most likely user error.
-      # TODO(iga): Consider adding a @NoOutput decorator on top of @Defun to
-      # allow users to explicitly mark the function as not returning anything.
-      # For now, we allow a single None return and interpret it as a function
-      # with no output.
-      if outputs is None:
-        outputs = []
-      else:
-        # If func only returned one value, make it a tuple.
-        if not isinstance(outputs, (list, tuple)):
-          outputs = (outputs,)
-        if any([_ is None for _ in outputs]):
-          raise ValueError("Function can not return None.")
-      # Ensures each output is a Tensor in the function graph.
-      outputs = [ops.convert_to_tensor(t) for t in outputs]
-      outputs = [
-          temp_graph.capture(t) if t.graph is not temp_graph else t
-          for t in outputs
-      ]
+    temp_graph = func_graph_from_py_func(
+        self._func, self._arg_names, self._arg_types, self._func_name,
+        self._capture_by_value, self._caller_device)
+
     self._extra_inputs = temp_graph.extra_inputs
-    inputs.extend(temp_graph.extra_args)
     # pylint: disable=protected-access
     self._sub_functions = temp_graph._functions
     # pylint: enable=protected-access
@@ -377,8 +358,8 @@ class _DefinedFunction(object):
       self._definition = graph_to_function_def.graph_to_function_def(
           temp_graph,
           temp_graph.get_operations(),
-          inputs,
-          outputs,
+          temp_graph.inputs,
+          temp_graph.outputs,
           out_names=self._out_names)
 
       for k in kwargs_attr:
@@ -408,8 +389,8 @@ class _DefinedFunction(object):
           base_func_name,
           self._func_name is None,  # append_hash_to_fn_name
           None,  # opers
-          [t._as_tf_output() for t in inputs],
-          [t._as_tf_output() for t in outputs],
+          [t._as_tf_output() for t in temp_graph.inputs],
+          [t._as_tf_output() for t in temp_graph.outputs],
           output_names,
           None,  # opts
           description)
@@ -424,6 +405,10 @@ class _DefinedFunction(object):
       else:
         self._func_name = compat.as_str(self._op_def.name)
 
+    self._stateful_ops = [(op.name, op.type)
+                          for op in temp_graph.get_operations()
+                          if op.op_def.is_stateful]
+
   def _set_c_attrs(self, attrs):
     """Sets `attrs` as attributes of self._c_func.
 
@@ -636,16 +621,33 @@ class _FuncGraph(ops.Graph):
   function argument and the caller passes in the captured tensor.
   """
 
-  def __init__(self, capture_by_value, *args, **kwargs):
+  def __init__(self, name, capture_by_value, *args, **kwargs):
     super(_FuncGraph, self).__init__(*args, **kwargs)
     self._capture_by_value = capture_by_value
     self._building_function = True
     self._outer_graph = ops.get_default_graph()
     self._vscope = vs.get_variable_scope()
     self._old_custom_getter = self._vscope.custom_getter
+
+    # The name of the function.
+    self.name = name
+    # Placeholder tensors representing the inputs to this function. The tensors
+    # are in this _FuncGraph.
+    self.inputs = []
+    # Tensors that will be returned this function. The tensors are in this
+    # _FuncGraph.
+    self.outputs = []
+    # Maps external tensor -> internal tensor (e.g. input placeholder).
     self._captured = {}
+    # The external tensors that have been captured as inputs and must be passed
+    # to this function (empty if capturing by value, otherwise these are the
+    # keys of _captured).
     self.extra_inputs = []
+    # Input placeholders that been added for captured values (empty if capturing
+    # by value).
     self.extra_args = []
+    # Captured variables.
+    # TODO(skyewm): is this needed?
     self.extra_vars = []
 
   def getvar(
@@ -696,7 +698,7 @@ class _FuncGraph(ops.Graph):
     return super(_FuncGraph, self).create_op(op_type, inputs, data_types,
                                              **kwargs)
 
-  def capture(self, tensor):
+  def capture(self, tensor, name=None):
     """Adds the given tensor to this graph and returns the captured tensor."""
     if tensor in self._captured:
       # Captured already.
@@ -704,15 +706,16 @@ class _FuncGraph(ops.Graph):
     elif self._capture_by_value:
       return self._add_tensor_and_parents(tensor)
     else:
-      return self._capture_tensor_as_extra_input(tensor)
+      return self._capture_tensor_as_extra_input(tensor, name)
 
-  def _capture_tensor_as_extra_input(self, tensor):
+  def _capture_tensor_as_extra_input(self, tensor, name=None):
     # Substitute with a placeholder.
     self.extra_inputs.append(tensor)
     # Hoist the new input placeholder out of any control flow context
     # we're currently in.
     with ops.control_dependencies(None):
-      ph = array_ops.placeholder(tensor.dtype, shape=tensor.get_shape())
+      ph = array_ops.placeholder(
+          tensor.dtype, shape=tensor.get_shape(), name=name)
     # pylint: disable=protected-access
     if ops._USE_C_SHAPES:
       handle_data = c_api.GetResourceHandleShapeAndType(tensor.graph._c_graph,
@@ -724,6 +727,7 @@ class _FuncGraph(ops.Graph):
     else:
       ph._handle_data = tensor._handle_data
     # pylint: enable=protected-access
+    self.inputs.append(ph)
     self._captured[tensor] = ph
     self.extra_args.append(ph)
     if _is_guaranteed_const(tensor):
@@ -762,6 +766,63 @@ class _FuncGraph(ops.Graph):
     return captured_op
 
 
+def func_graph_from_py_func(func, arg_names, arg_types, name=None,
+                            capture_by_value=False, device=None):
+  """Returns a _FuncGraph generated from `func`.
+
+  Args:
+    func: A Python callable which constructs a TF function body. The arguments
+      must correspond to `arg_types`. Returns a value or list/tuple of values.
+      No returned value can be None.
+    arg_names: A sequence of strings for the function argument names.
+    arg_types: A sequence of the function's argument types.
+    name: The function name. If None, the name is derived from `func`.
+    capture_by_value: boolean. If True, captured values will be copied into the
+      function body.
+    device: device name or function.
+
+  Returns:
+    A _FuncGraph.
+
+  Raises:
+    ValueError: if func returns None.
+  """
+  if not name:
+    name = _get_func_name(func)
+  func_graph = _FuncGraph(name, capture_by_value)
+  with func_graph.as_default(), ops.device(device):
+    # Create placeholders for the function arguments.
+    for (argname, argtype) in zip(arg_names, arg_types):
+      argholder = array_ops.placeholder(argtype, name=argname)
+      func_graph.inputs.append(argholder)
+    # Call func and gather the output tensors.
+    with vs.variable_scope("", custom_getter=func_graph.getvar):
+      outputs = func(*func_graph.inputs)
+
+    # There is no way of distinguishing between a function not returning
+    # anything and a function returning None in Python.
+    # We need to allow the former and ideally want to forbid the latter as
+    # it is most likely user error.
+    # TODO(iga): Consider adding a @NoOutput decorator on top of @Defun to
+    # allow users to explicitly mark the function as not returning anything.
+    # For now, we allow a single None return and interpret it as a function
+    # with no output.
+    if outputs is None:
+      outputs = []
+    else:
+      # If func only returned one value, make it a tuple.
+      if not isinstance(outputs, (list, tuple)):
+        outputs = (outputs,)
+      if any([_ is None for _ in outputs]):
+        raise ValueError("Function can not return None.")
+    # Ensures each output is a Tensor in the function graph.
+    outputs = [ops.convert_to_tensor(t) for t in outputs]
+    outputs = [func_graph.capture(t) if t.graph is not func_graph else t
+               for t in outputs]
+    func_graph.outputs = outputs
+  return func_graph
+
+
 def _is_guaranteed_const(tensor):
   """Determines whether `tensor` is guaranteed to be a constant.
 
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index a5c19f189ea5c48a6a80f3bcf0069aba2f2f4125..15e41ba91f9ae121d3d4ea48e3e71eace7cd9a3e 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -36,6 +36,7 @@ from tensorflow.python.framework import graph_to_function_def
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.framework.errors import InvalidArgumentError
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
@@ -182,6 +183,8 @@ class FunctionTest(test.TestCase):
     def APlus2B(a, b):
       return a + b * 2
 
+    # APlus2B is stateless.
+    self.assertEqual([], APlus2B.stateful_ops)
     with ops.Graph().as_default():
       call = APlus2B([1.0], [2.0])
       self.assertEqual("APlus2B", call.op.name)
@@ -428,6 +431,8 @@ class FunctionTest(test.TestCase):
       with ops.control_dependencies([check]):
         return x * 2
 
+    # Foo contains a stateful op (Assert).
+    self.assertEqual([("Assert", "Assert")], Foo.stateful_ops)
     g = ops.Graph()
     with g.as_default(), self.test_session():
       self.assertAllEqual(Foo(constant_op.constant(3.0)).eval(), 6.0)
@@ -809,17 +814,11 @@ class FunctionTest(test.TestCase):
     def Foo(x, y, z):
       return math_ops.tanh(math_ops.matmul(x, y) + z)
 
-    # We added more randomness to function names in C API.
-    # TODO(iga): Remove this if statement when we switch to C API.
-    if ops._USE_C_API:  # pylint: disable=protected-access
-      if sys.byteorder == "big":
-        self.assertEqual("Foo_kEdkAG8SJvg",
-                         Foo.instantiate([dtypes.float32] * 3).name)
-      else:
-        self.assertEqual("Foo_aCYSbwBkR5A",
-                         Foo.instantiate([dtypes.float32] * 3).name)
+    if sys.byteorder == "big":
+      self.assertEqual("Foo_kEdkAG8SJvg",
+                       Foo.instantiate([dtypes.float32] * 3).name)
     else:
-      self.assertEqual("Foo_d643acf7",
+      self.assertEqual("Foo_aCYSbwBkR5A",
                        Foo.instantiate([dtypes.float32] * 3).name)
 
   def testSignatureHash(self):
@@ -1719,5 +1718,91 @@ class VariableHoistingTest(test.TestCase):
     self._testSimpleModel(False, use_resource=True)
 
 
+class DevicePlacementTest(test.TestCase):
+
+  def testNoDeviceGraph(self):
+    with ops.Graph().as_default():
+
+      @function.Defun(*[dtypes.float32] * 2)
+      def Matmul(a, b):
+        return math_ops.matmul(a, b)
+
+      Matmul(1., 2.)
+
+      gdef = ops.get_default_graph().as_graph_def()
+      self.assertAllEqual(len(gdef.library.function), 1)
+      fdef = gdef.library.function[0]
+
+      for node in fdef.node_def:
+        self.assertAllEqual(node.device, "")
+
+  def testNestedDevices(self):
+    with ops.Graph().as_default(), ops.device("CPU:0"):
+
+      @function.Defun(*[dtypes.float32] * 2)
+      def Matmul(a, b):
+        return math_ops.matmul(a, b)
+
+      with ops.device("CPU:1"):
+
+        @function.Defun(*[dtypes.float32] * 2)
+        def Divide(a, b):
+          return math_ops.divide(a, b)
+
+        Divide(Matmul(1., 2.), 3.)
+
+      gdef = ops.get_default_graph().as_graph_def()
+      matmul_fdef = [
+          f for f in gdef.library.function if "Matmul" in f.signature.name
+      ]
+      divide_fdef = [
+          f for f in gdef.library.function if "Divide" in f.signature.name
+      ]
+      self.assertAllEqual(len(matmul_fdef), 1)
+      self.assertAllEqual(len(divide_fdef), 1)
+      for node in matmul_fdef[0].node_def:
+        self.assertAllEqual(node.device, "/device:CPU:0")
+      for node in divide_fdef[0].node_def:
+        self.assertAllEqual(node.device, "/device:CPU:1")
+
+  def _testNestedDeviceWithSameFunction(self, func_name):
+
+    def MatmulWrap(a, b):
+
+      @function.Defun(
+          func_name=func_name, *[dtypes.int32] * 2)
+      def Matmul(a, b):
+        return math_ops.matmul(a, b)
+
+      return Matmul(a, b)
+
+    with ops.Graph().as_default(), ops.device("CPU:0"):
+      c = MatmulWrap(1, 2)
+
+      with ops.device("CPU:1"):
+        MatmulWrap(c, 3)
+
+      gdef = ops.get_default_graph().as_graph_def()
+
+      devices = []
+      for node in gdef.library.function[0].node_def:
+        devices.append(node.device)
+      for node in gdef.library.function[1].node_def:
+        devices.append(node.device)
+
+      self.assertAllEqual(sorted(devices), ["/device:CPU:0", "/device:CPU:1"])
+
+  def testFunctionWithName(self):
+    with self.assertRaises(InvalidArgumentError) as cm:
+      self._testNestedDeviceWithSameFunction("MatmulTest")
+    self.assertEqual(
+        cm.exception.message,
+        "Cannot add function \'MatmulTest\' because a different "
+        "function with the same name already exists.")
+
+  def testFunctionWithoutName(self):
+    self._testNestedDeviceWithSameFunction(None)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 5112bea48b5033e2cd16a555d65993b575f475eb..72eb7e0eeb73fb1f8725ab2cbd4182e543c79b9f 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -17,78 +17,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import contextlib
-import copy
 
-from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import graph_pb2
-from tensorflow.core.framework import types_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.tf_export import tf_export
 
 
-# TODO(josh11b): SWIG the code from node_def_util instead of duplicating
-# the logic here.
-def _GetNodeAttr(node_def, attr_name):
-  if attr_name not in node_def.attr:
-    raise ValueError('Expected one attr with name %r in %s.' % (attr_name,
-                                                                str(node_def)))
-  return node_def.attr[attr_name]
-
-
-def _ArgToTypesNoRef(node_def, arg_def):
-  if arg_def.number_attr:
-    repeats = _GetNodeAttr(node_def, arg_def.number_attr).i
-    if arg_def.type_attr:
-      dtype = _GetNodeAttr(node_def, arg_def.type_attr).type
-    else:
-      assert arg_def.type != types_pb2.DT_INVALID
-      dtype = arg_def.type
-    return [dtype] * repeats
-  elif arg_def.type_attr:
-    return [_GetNodeAttr(node_def, arg_def.type_attr).type]
-  elif arg_def.type_list_attr:
-    return _GetNodeAttr(node_def, arg_def.type_list_attr).list.type
-  else:
-    assert arg_def.type != types_pb2.DT_INVALID
-    return [arg_def.type]
-
-
-def _SingleArgToTypes(node_def, arg_def):
-  types = _ArgToTypesNoRef(node_def, arg_def)
-  if arg_def.is_ref:
-    return [dtypes.as_dtype(dt)._as_ref.as_datatype_enum for dt in types]  # pylint: disable=protected-access
-  return types
-
-
-def _ArgsToTypes(node_def, arg_list):
-  types = []
-  for arg_def in arg_list:
-    types.extend(_SingleArgToTypes(node_def, arg_def))
-  return types
-
-
-def _InputTypes(node_def, op_dict):
-  op_def = op_dict[node_def.op]
-  return _ArgsToTypes(node_def, op_def.input_arg)
-
-
-def _OutputTypes(node_def, op_dict):
-  op_def = op_dict[node_def.op]
-  return _ArgsToTypes(node_def, op_def.output_arg)
-
-
 def _IsControlInput(input_name):
   # Expected format: '^operation_name' (control input).
   return input_name.startswith('^')
@@ -128,18 +71,6 @@ def _ParseTensorName(tensor_name):
     raise ValueError('Cannot convert %r to a tensor name.' % (tensor_name,))
 
 
-def _CanonicalInputName(input_name):
-  input_name = compat.as_str(input_name)
-  if _IsControlInput(input_name):
-    return input_name
-  input_op_name, output_index = _ParseTensorName(input_name)
-  return '%s:%d' % (input_op_name, output_index)
-
-
-def _InvalidNodeMessage(node, message):
-  return 'graph_def is invalid at node %r: %s.' % (node.name, message)
-
-
 @contextlib.contextmanager
 def _MaybeDevice(device):
   """Applies the given device only if device is not None or empty."""
@@ -460,351 +391,70 @@ def import_graph_def(graph_def,
     _RemoveDefaultAttrs(op_dict, producer_op_list, graph_def)
 
   graph = ops.get_default_graph()
-
-  if graph._c_graph:  # pylint: disable=protected-access
-    with ops.name_scope(name, 'import', input_map.values()) as scope:
-      # Save unique prefix generated by name_scope
-      if scope:
-        assert scope.endswith('/')
-        prefix = scope[:-1]
-      else:
-        prefix = ''
-
-      # Generate any input map tensors inside name scope
-      input_map = _ConvertInputMapValues(name, input_map)
-
-    scoped_options = c_api_util.ScopedTFImportGraphDefOptions()
-    options = scoped_options.options
-    _PopulateTFImportGraphDefOptions(options, prefix, input_map,
-                                     return_elements)
-
-    # _ProcessNewOps mutates the new operations. _lock ensures a Session.run
-    # call cannot occur between creating the TF_Operations in the
-    # TF_GraphImportGraphDefWithResults call and mutating the them in
-    # _ProcessNewOps.
-    with graph._lock:  # pylint: disable=protected-access
-      with c_api_util.tf_buffer(graph_def.SerializeToString()) as serialized:
-        try:
-          results = c_api.TF_GraphImportGraphDefWithResults(
-              graph._c_graph, serialized, options)  # pylint: disable=protected-access
-          results = c_api_util.ScopedTFImportGraphDefResults(results)
-        except errors.InvalidArgumentError as e:
-          # Convert to ValueError for backwards compatibility.
-          raise ValueError(str(e))
-
-      # Create _DefinedFunctions for any imported functions.
-      #
-      # We do this by creating _DefinedFunctions directly from `graph_def`, and
-      # adding them to `graph`. Adding an existing function to a TF_Graph is a
-      # no-op, so this only has the effect of updating the Python state (usually
-      # _DefinedFunction.add_to_graph also adds the function to the TF_Graph).
-      #
-      # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
-      # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
-      # TODO(b/74620627): move this after _ProcessNewOps outside the lock once
-      # _USE_C_SHAPES is removed.
-      if graph_def.library and graph_def.library.function:
-        # pylint: disable=protected-access
-        functions = function._from_library(graph_def.library)
-        for f in functions:
-          f.add_to_graph(graph)
-        # pylint: enable=protected-access
-
-      _ProcessNewOps(graph)
-
-    # Treat input mappings that don't appear in the graph as an error, because
-    # they are likely to be due to a typo.
-    missing_unused_input_keys = (
-        c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
-            results.results))
-    if missing_unused_input_keys:
-      missing_unused_input_keys = [
-          compat.as_str(s) for s in missing_unused_input_keys
-      ]
-      raise ValueError(
-          'Attempted to map inputs that were not found in graph_def: [%s]' %
-          ', '.join(missing_unused_input_keys))
-
-    if return_elements is None:
-      return None
+  with ops.name_scope(name, 'import', input_map.values()) as scope:
+    # Save unique prefix generated by name_scope
+    if scope:
+      assert scope.endswith('/')
+      prefix = scope[:-1]
     else:
-      return _GatherReturnElements(return_elements, graph, results.results)
-
-  else:
-    g = graph
-
-    # Use a canonical representation for all tensor names.
-    input_map = {_CanonicalInputName(k): v for k, v in input_map.items()}
-    used_input_keys = set()
-    name_to_op = {}
-
-    # Add any functions defined in `graph_def` to `g`
+      prefix = ''
+
+    # Generate any input map tensors inside name scope
+    input_map = _ConvertInputMapValues(name, input_map)
+
+  scoped_options = c_api_util.ScopedTFImportGraphDefOptions()
+  options = scoped_options.options
+  _PopulateTFImportGraphDefOptions(options, prefix, input_map,
+                                   return_elements)
+
+  # _ProcessNewOps mutates the new operations. _lock ensures a Session.run
+  # call cannot occur between creating the TF_Operations in the
+  # TF_GraphImportGraphDefWithResults call and mutating the them in
+  # _ProcessNewOps.
+  with graph._lock:  # pylint: disable=protected-access
+    with c_api_util.tf_buffer(graph_def.SerializeToString()) as serialized:
+      try:
+        results = c_api.TF_GraphImportGraphDefWithResults(
+            graph._c_graph, serialized, options)  # pylint: disable=protected-access
+        results = c_api_util.ScopedTFImportGraphDefResults(results)
+      except errors.InvalidArgumentError as e:
+        # Convert to ValueError for backwards compatibility.
+        raise ValueError(str(e))
+
+    # Create _DefinedFunctions for any imported functions.
+    #
+    # We do this by creating _DefinedFunctions directly from `graph_def`, and
+    # adding them to `graph`. Adding an existing function to a TF_Graph is a
+    # no-op, so this only has the effect of updating the Python state (usually
+    # _DefinedFunction.add_to_graph also adds the function to the TF_Graph).
+    #
+    # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
+    # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
+    # TODO(b/74620627): move this after _ProcessNewOps outside the lock once
+    # _USE_C_SHAPES is removed.
     if graph_def.library and graph_def.library.function:
-      # Copy op_dict so we don't clobber the original
-      op_dict = copy.copy(op_dict)
       # pylint: disable=protected-access
-      # Note that we do not prepend `name` to the function name. The reasoning
-      # is that function names are similar to op definition names, which
-      # currently do not have a scoped name or namespace scheme.
       functions = function._from_library(graph_def.library)
       for f in functions:
-        f.add_to_graph(g)
-        op_dict[f.name] = f.definition.signature
+        f.add_to_graph(graph)
       # pylint: enable=protected-access
 
-    # LINT.IfChange
-    with ops.name_scope(name, 'import', input_map.values()) as scope:
-      # TODO(ashankar): Should this just copy over or should it do some
-      # more nuanced merging? For example, the graph may already have some
-      # marked "bad versions" and we don't want to lose those because of
-      # what's in graph_def.versions? The C++ ImporGraphDef does something
-      # more nuanced.
-      g.graph_def_versions.CopyFrom(graph_def.versions)
-
-      input_map = _ConvertInputMapValues(name, input_map)
-
-      # NOTE(mrry): We do this in two passes, because there may be a cycle in
-      # `graph_def`.
-
-      # 1. Add operations without their inputs.
-      for node in graph_def.node:
-        # Check to see if this op's name matches a previously seen op
-        if node.name in name_to_op:
-          raise ValueError('Duplicate name \'%s\' in GraphDef.' % node.name)
-        if node.op not in op_dict:
-          raise ValueError(
-              'No op named %s in defined operations. If the Graph you are '
-              'importing uses custom ops or any parts of tf.contrib, you '
-              'should explicitly import the libraries defining those ops '
-              'before loading the Graph. Note that tf.contrib is lazily loaded '
-              'when accessed, so simply referencing (e.g.) '
-              '`tf.contrib.resampler` will cause those ops to be made '
-              'available.' % node.op)
-        op_def = op_dict[node.op]
-
-        output_types = _OutputTypes(node, op_dict)
-        name_to_op[node.name] = g.create_op(
-            node.op, [], output_types, name=node.name, attrs=node.attr,
-            compute_shapes=False, compute_device=False,
-            op_def=op_def)
-
-      # Maps from a node to the ops it is colocated with, if colocation
-      # is specified in the attributes.
-      colocation_pairs = collections.defaultdict(list)
-
-      # 2. Add inputs to the operations.
-      for node in graph_def.node:
-        op = name_to_op[node.name]
-        input_types = _InputTypes(node, op_dict)
-        apply_device_function = True
-
-        # Rewrite the colocation attributes in the graph, since the
-        # names of new ops may have changed.
-        for key, value in op.node_def.attr.items():
-          if key == '_class':
-            class_values = value.list
-            new_class_values = []
-            for class_value in class_values.s:
-              if class_value.startswith(b'loc:@'):
-                op_to_bind_to = class_value[5:].decode()
-                # Find the op by its original name.
-                if op_to_bind_to not in name_to_op:
-                  raise ValueError('Specified colocation to an op that '
-                                   'does not exist during import: %s in %s' % (
-                                       op_to_bind_to, node.name))
-                original_op = name_to_op[op_to_bind_to]
-                new_class_values.append(compat.as_bytes(
-                    'loc:@' + original_op.name))
-                if op_to_bind_to != node.name:
-                  # Keep track of this mapping for a later phase.
-                  colocation_pairs[op].append(original_op)
-                  # Don't apply this op's device function,
-                  # the colocation constraint will ensure
-                  # the proper device gets assigned at runtime.
-                  apply_device_function = False
-
-              else:
-                new_class_values.append(class_value)
-            value.list.CopyFrom(attr_value_pb2.AttrValue.ListValue(
-                s=new_class_values))
-
-        # NOTE(mrry): We cannot use zip here because control inputs do not
-        # appear in the list of input_types.
-        for i, input_name in enumerate(
-            [_CanonicalInputName(x) for x in node.input]):
-
-          if _IsControlInput(input_name):
-            # (a) Input is a control input that should be taken from an op
-            #     in "graph_def".
-            try:
-              source_op = name_to_op[input_name[1:]]
-            except KeyError:
-              raise ValueError(
-                  _InvalidNodeMessage(
-                      node,
-                      'Control input %r not found in graph_def.'
-                      % (input_name,)))
-            # pylint: disable=protected-access
-            op._add_control_input(source_op)
-            # pylint: enable=protected-access
-
-          else:
-            try:
-              input_type = input_types[i]
-            except IndexError:
-              raise ValueError(_InvalidNodeMessage(
-                  node, 'More inputs specified (%r) than the op expects.'
-                  % (input_name,)))
-
-            if input_name in input_map:
-              # (b) Input should be replaced by a tensor from the caller.
-              source_tensor = input_map[input_name]
-              used_input_keys.add(input_name)
-
-            else:
-              # (c) Input should be taken from an op in `graph_def`.
-              operation_name, output_index = _ParseTensorName(input_name)
-              try:
-                source_op = name_to_op[operation_name]
-                source_tensor = list(source_op.values())[output_index]
-              except (KeyError, IndexError):
-                raise ValueError(
-                    _InvalidNodeMessage(
-                        node,
-                        'Input tensor %r not found in graph_def.'
-                        % (input_name,)))
-
-            try:
-              # pylint: disable=protected-access
-              op._add_input(source_tensor, dtype=input_type)
-              # pylint: enable=protected-access
-            except TypeError as te:
-              raise ValueError(_InvalidNodeMessage(
-                  node, 'Input tensor %r %s' % (input_name, te)))
-
-        # pylint: disable=protected-access
-        if op._input_types != input_types:
-          raise ValueError(
-              _InvalidNodeMessage(
-                  node,
-                  'Input types mismatch (expected %r but got %r)'
-                  % (', '.join(dtypes.as_dtype(x).name for x in input_types),
-                     ', '.join(x.name for x in op._input_types))))
-        # pylint: enable=protected-access
-
-        # Execute shape inference for this op.
-        # NOTE(mrry): If the graph contains a cycle, the full shape
-        # information may not be available for this op's inputs.
-        ops.set_shape_and_handle_data_for_outputs(op)
-        # For nodes with _output_shapes set, set the output shapes.
-        if '_output_shapes' in op.node_def.attr:
-          for i, output in enumerate(op.outputs):
-            dims = op.node_def.attr['_output_shapes'].list.shape[i]
-            output_shape = tensor_shape.TensorShape(
-                None if dims.unknown_rank else
-                [dim.size if dim.size >= 0 else None for dim in dims.dim])
-
-            try:
-              output.set_shape(output_shape)
-            except ValueError as e:
-              # If the output shape is incompatible with what is inferred
-              # by the graph for a very specific whitelist of ops, then we
-              # ignore this output shape.  This can happen if there is a
-              # bug in the shape function for some operation, and the
-              # serialized graph def has the incorrect shape set when
-              # running on a newer binary with the fixed shape function.
-              # This is an escape hatch that allows us to correct shape
-              # functions that are not critical to correct execution but
-              # would cause graphs to fail if imported after correcting.
-              #
-              # This can be removed after 2017/03/08.
-              if op.type in ['RandomShuffleQueue', 'PaddingFIFOQueue',
-                             'FIFOQueue', 'PriorityQueue', 'QueueSize',
-                             'Stack', 'Barrier', 'BarrierReadySize',
-                             'BarrierIncompleteSize', 'HashTable',
-                             'MutableHashTable',
-                             'MutableHashTableOfTensors', 'Mutex',
-                             'CuckooTable', 'IndexTable',
-                             'WholeFileReader', 'TextLineReader',
-                             'FixedLengthRecordReader',
-                             'TFRecordReader', 'IdentityReader',
-                             'LMDBReader',
-                             'RefSwitch', 'RefEnter', 'RefNextIteration',
-                             'RefMerge', 'RefIdentity']:
-                pass
-              elif op.type in [
-                  'ConditionalAccumulator', 'SparseConditionalAccumulator',
-                  'Table'
-              ]:
-                # This can be removed after 2017/04/24.
-                pass
-              else:
-                raise e
-
-          del op.node_def.attr['_output_shapes']
-
-        # NOTE(mrry): We do this after configuring the inputs, because
-        # the result of the device functions may depend on the inputs.
-        if apply_device_function:
-          with _MaybeDevice(node.device):
-            g._apply_device_functions(op)  # pylint: disable=protected-access
-
-      # The following loop populates the device field of ops that are
-      # colocated with another op.  This is implied by the colocation
-      # attribute, but we propagate the device field for completeness.
-      for op, coloc_op_list in colocation_pairs.items():
-        coloc_device = None
-        # Find any device in the list of colocated ops that have a
-        # device, if it exists.  We assume that if multiple ops
-        # have devices, they refer to the same device.  Otherwise, a
-        # runtime error will occur since the colocation property
-        # cannot be guaranteed.
-        #
-        # One possible improvement is to try to check for compatibility
-        # of all devices in this list at import time here, which would
-        # require implementing a compatibility function for device specs
-        # in python.
-        for coloc_op in coloc_op_list:
-          if coloc_op.device:
-            coloc_device = pydev.DeviceSpec.from_string(coloc_op.device)
-            break
-        if coloc_device:
-          op._set_device(coloc_device)  # pylint: disable=protected-access
-
-      # Treat input mappings that don't appear in the graph as an error,
-      # because they are likely to be due to a typo.
-      def _IsImportedNodeOutput(tensor_name):
-        operation_name, output_index = _ParseTensorName(tensor_name)
-        try:
-          return output_index < len(name_to_op[operation_name].outputs)
-        except KeyError:
-          return False
-      absent_input_keys = [
-          k for k in frozenset(input_map.keys()).difference(used_input_keys)
-          if not _IsImportedNodeOutput(k)]
-      if absent_input_keys:
-        raise ValueError(
-            'Attempted to map inputs that were not found in graph_def: [%s]'
-            % ', '.join(absent_input_keys))
-
-      if return_elements is None:
-        return None
-      else:
-        ret = []
-        for name in return_elements:
-          name = compat.as_str(name)
-          if ':' in name:
-            try:
-              operation_name, output_index = _ParseTensorName(name)
-              ret.append(name_to_op[operation_name].outputs[output_index])
-            except (ValueError, KeyError, IndexError):
-              raise ValueError(
-                  'Requested return_element %r not found in graph_def.' % name)
-          else:
-            try:
-              ret.append(name_to_op[name])
-            except KeyError:
-              raise ValueError(
-                  'Requested return_element %r not found in graph_def.' % name)
-        return ret
-    # LINT.ThenChange(//tensorflow/core/graph/graph_constructor.cc)
+    _ProcessNewOps(graph)
+
+  # Treat input mappings that don't appear in the graph as an error, because
+  # they are likely to be due to a typo.
+  missing_unused_input_keys = (
+      c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
+          results.results))
+  if missing_unused_input_keys:
+    missing_unused_input_keys = [
+        compat.as_str(s) for s in missing_unused_input_keys
+    ]
+    raise ValueError(
+        'Attempted to map inputs that were not found in graph_def: [%s]' %
+        ', '.join(missing_unused_input_keys))
+
+  if return_elements is None:
+    return None
+  else:
+    return _GatherReturnElements(return_elements, graph, results.results)
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index 2c913d1e028e15e293158fe180e263a78c514ee4..c5a54470d27b5949fd642b057feda7f3f1a4347f 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -31,7 +31,6 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_ops  # pylint: disable=unused-import
-from tensorflow.python.framework import test_util
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -45,7 +44,6 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
-@test_util.with_c_api
 class ImportGraphDefTest(test.TestCase):
 
   def _MakeGraphDef(self,
@@ -231,10 +229,7 @@ class ImportGraphDefTest(test.TestCase):
           return_elements=["foo"],
           name="")
 
-      if ops._USE_C_API:
-        self.assertEqual(op.name, "foo")
-      else:
-        self.assertEqual(op.name, "foo_1")
+      self.assertEqual(op.name, "foo")
 
   def testInputMap(self):
     with ops.Graph().as_default():
@@ -425,14 +420,9 @@ class ImportGraphDefTest(test.TestCase):
         self.assertEqual(sess.run(imported_r), 10)
 
   def testTypeMismatchInGraphDef(self):
-    if ops._USE_C_API:
-      # TODO(skyewm): improve error message
-      error_msg = ("Input 0 of node import/B was passed int32 from import/A:0 "
-                   "incompatible with expected float.")
-    else:
-      error_msg = ("Cannot convert a tensor of type int32 to an input of type "
-                   "float")
-
+    # TODO(skyewm): improve error message
+    error_msg = ("Input 0 of node import/B was passed int32 from import/A:0 "
+                 "incompatible with expected float.")
     with ops.Graph().as_default():
       with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
@@ -476,14 +466,11 @@ class ImportGraphDefTest(test.TestCase):
             "Shapes () and (43,) are not compatible" in str(e.exception))
 
   def testInvalidSignatureTooManyInputsInGraphDef(self):
-    if ops._USE_C_API:
-      # TODO(skyewm): improve error message
-      error_msg = "NodeDef expected inputs '' do not match 1 inputs specified"
-    else:
-      error_msg = r"More inputs specified \('A:0'\) than the op expects"
-
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      # TODO(skyewm): improve error message
+      with self.assertRaisesRegexp(
+          ValueError,
+          "NodeDef expected inputs '' do not match 1 inputs specified"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
@@ -491,16 +478,12 @@ class ImportGraphDefTest(test.TestCase):
             """))
 
   def testInvalidSignatureNotEnoughInputsInGraphDef(self):
-    if ops._USE_C_API:
-      # TODO(skyewm): improve error message
-      error_msg = ("NodeDef expected inputs 'int32, float' do not match 1 "
-                   "inputs specified")
-    else:
-      error_msg = (r"Input types mismatch \(expected 'int32, float32' but "
-                   r"got 'int32'\)")
-
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      # TODO(skyewm): improve error message
+      with self.assertRaisesRegexp(
+          ValueError,
+          "NodeDef expected inputs 'int32, float' do not match 1 inputs "
+          "specified"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
@@ -508,13 +491,9 @@ class ImportGraphDefTest(test.TestCase):
             """))
 
   def testMissingInputOpInGraphDef(self):
-    if ops._USE_C_API:
-      error_msg = "Node 'B': Unknown input node 'A:0'"
-    else:
-      error_msg = "Input tensor 'A:0' not found"
-
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(ValueError,
+                                   "Node 'B': Unknown input node 'A:0'"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'FloatInput' input: 'A:0' }
@@ -532,14 +511,11 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(b.inputs[0], feed_a_0)
 
   def testMissingInputTensorInGraphDef(self):
-    if ops._USE_C_API:
-      error_msg = ("Node 'B': Connecting to invalid output 1 of source node A "
-                   "which has 1 outputs")
-    else:
-      error_msg = "Input tensor 'A:1' not found"
-
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(
+          ValueError,
+          "Node 'B': Connecting to invalid output 1 of source node A "
+          "which has 1 outputs"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'FloatOutput' }
@@ -547,52 +523,36 @@ class ImportGraphDefTest(test.TestCase):
             """))
 
   def testMissingControlInputInGraphDef(self):
-    if ops._USE_C_API:
-      error_msg = r"Node 'B': Unknown input node '\^A'"
-    else:
-      error_msg = r"Control input '\^A' not found"
-
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(ValueError,
+                                   r"Node 'B': Unknown input node '\^A'"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: '^A' }
             """))
 
   def testInvalidTensorNameOutputIndexInGraphDef(self):
-    if ops._USE_C_API:
-      error_msg = "Node 'B': Unknown input node 'A:B'"
-    else:
-      error_msg = "Cannot convert 'A:B' to a tensor name."
-
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(ValueError,
+                                   "Node 'B': Unknown input node 'A:B'"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: 'A:B' }
             """))
 
   def testInvalidTensorNameInGraphDef(self):
-    if ops._USE_C_API:
-      error_msg = "Node 'B': Unknown input node 'A:B:0'"
-    else:
-      error_msg = "Cannot convert 'A:B:0' to a tensor name."
-
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(ValueError,
+                                   "Node 'B': Unknown input node 'A:B:0'"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: 'A:B:0' }
             """))
 
   def testMissingReturnOperation(self):
-    if ops._USE_C_API:
-      error_msg = "Requested return node 'B' not found in graph def"
-    else:
-      error_msg = "return_element 'B' not found in graph_def."
-
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(
+          ValueError, "Requested return node 'B' not found in graph def"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'None' }
@@ -600,38 +560,26 @@ class ImportGraphDefTest(test.TestCase):
             return_elements=["B"])
 
   def testMissingReturnTensor(self):
-    if ops._USE_C_API:
-      error_msg = (r"Invalid return output 1 of node 'A', which has 1 "
-                   r"output\(s\)")
-    else:
-      error_msg = "return_element 'A:1' not found in graph_def."
-
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Invalid return output 1 of node 'A', which has 1 output\(s\)"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             return_elements=["A:1"])
 
-      if ops._USE_C_API:
-        error_msg = "Requested return tensor 'B:0' not found in graph def"
-      else:
-        error_msg = "return_element 'B:0' not found in graph_def."
-
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(
+          ValueError, "Requested return tensor 'B:0' not found in graph def"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             return_elements=["B:0"])
 
-      if ops._USE_C_API:
-        error_msg = "Cannot convert 'A:B:0' to a tensor name."
-      else:
-        error_msg = "return_element 'A:B:0' not found in graph_def."
-
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(ValueError,
+                                   "Cannot convert 'A:B:0' to a tensor name."):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
@@ -669,14 +617,10 @@ class ImportGraphDefTest(test.TestCase):
             input_map={"A:2": constant_op.constant(5.0)})
 
   def testInputMapTypeMismatch(self):
-    if ops._USE_C_API:
-      error_msg = ("Input 0 of node import/B was passed float from Const:0 "
-                   "incompatible with expected int32.")
-    else:
-      error_msg = ("Cannot convert a tensor of type float32 to an input of "
-                   "type int32.")
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(
+          ValueError, "Input 0 of node import/B was passed float from Const:0 "
+          "incompatible with expected int32."):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
@@ -899,13 +843,9 @@ class ImportGraphDefTest(test.TestCase):
             value { list { s: 'loc:@A' } }
           } }""")
 
-    if ops._USE_C_API:
-      error_msg = "Node 'B' expects to be colocated with unknown node 'A'"
-    else:
-      error_msg = "does not exist during import"
-
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(
+          ValueError, "Node 'B' expects to be colocated with unknown node 'A'"):
         importer.import_graph_def(
             original_graph_def, return_elements=["B"], name="imported_graph")
 
@@ -957,28 +897,19 @@ class ImportGraphDefTest(test.TestCase):
           TypeError, "return_elements must be a list of strings."):
         importer.import_graph_def(self._MakeGraphDef(""), return_elements=[7])
 
-      if ops._USE_C_API:
-        error_msg = "Cannot convert 'a:b:c' to a tensor name."
-      else:
-        error_msg = "Requested return_element 'a:b:c' not found in graph_def."
-      with self.assertRaisesRegexp(ValueError, error_msg):
-        importer.import_graph_def(self._MakeGraphDef(""),
-                                  return_elements=["a:b:c"])
+      with self.assertRaisesRegexp(ValueError,
+                                   "Cannot convert 'a:b:c' to a tensor name."):
+        importer.import_graph_def(
+            self._MakeGraphDef(""), return_elements=["a:b:c"])
 
   def testDuplicateOperationNames(self):
-    if ops._USE_C_API:
-      error_msg = "Node 'A' is not unique"
-    else:
-      error_msg = "Duplicate name 'A' in GraphDef."
-
-    with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, error_msg):
-        importer.import_graph_def(
-            self._MakeGraphDef("""
-            node { name: 'A' op: 'IntOutput' }
-            node { name: 'B' op: 'IntOutput' }
-            node { name: 'A' op: 'IntOutput' }
-            """))
+    with self.assertRaisesRegexp(ValueError, "Node 'A' is not unique"):
+      importer.import_graph_def(
+          self._MakeGraphDef("""
+          node { name: 'A' op: 'IntOutput' }
+          node { name: 'B' op: 'IntOutput' }
+          node { name: 'A' op: 'IntOutput' }
+          """))
 
   def testWithExtensionAndAttr(self):
     with ops.Graph().as_default() as g:
@@ -1119,40 +1050,22 @@ class ImportGraphDefTest(test.TestCase):
                            min_consumer)
 
   def testVersionLow(self):
-    with ops.Graph().as_default() as g:
-      pat = (r"GraphDef producer version -1 below min producer %d supported "
-             r"by TensorFlow \S+\.  Please regenerate your graph.$" %
-             versions.GRAPH_DEF_VERSION_MIN_PRODUCER)
-      # C API throws error during import, Python-only throws error during run
-      if ops._USE_C_API:
-        with self.assertRaisesRegexp(Exception, pat):
-          importer.import_graph_def(self._MakeGraphDef("", producer=-1))
-      else:
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          Exception,
+          r"GraphDef producer version -1 below min producer %d supported "
+          r"by TensorFlow \S+\.  Please regenerate your graph.$" %
+          versions.GRAPH_DEF_VERSION_MIN_PRODUCER):
         importer.import_graph_def(self._MakeGraphDef("", producer=-1))
-        x = constant_op.constant(
-            7)  # Need at least one op to get a C++ graph generated
-        with self.test_session(graph=g) as sess:
-          with self.assertRaisesRegexp(Exception, pat):
-            sess.run(x)
 
   def testVersionHigh(self):
-    with ops.Graph().as_default() as g:
-      pat = (r"GraphDef min consumer version %d above current version %d "
-             r"for TensorFlow \S+\.  Please upgrade TensorFlow\.$" %
-             (1 << 30, versions.GRAPH_DEF_VERSION))
-
-      if ops._USE_C_API:
-        with self.assertRaisesRegexp(ValueError, pat):
-          importer.import_graph_def(self._MakeGraphDef("",
-                                                       min_consumer=1 << 30))
-      else:
-        # Python API only throws when graph is run
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"GraphDef min consumer version %d above current version %d "
+          r"for TensorFlow \S+\.  Please upgrade TensorFlow\.$" %
+          (1 << 30, versions.GRAPH_DEF_VERSION)):
         importer.import_graph_def(self._MakeGraphDef("", min_consumer=1 << 30))
-        x = constant_op.constant(
-            7)  # Need at least one op to get a C++ graph generated
-        with self.test_session(graph=g) as sess:
-          with self.assertRaisesRegexp(Exception, pat):
-            sess.run(x)
 
   def testVersionAppliesToOpConstruction(self):
     """These tests rely on shape fns in test_ops.cc."""
@@ -1198,29 +1111,13 @@ class ImportGraphDefTest(test.TestCase):
           """),
           return_elements=["A"],
           producer_op_list=producer_op_list)
-      if ops._USE_C_API:
-        error_msg = "Operation 'import/A' has no attr named 'default_int'."
-      else:
-        error_msg = "No attr named 'default_int'"
-      with self.assertRaisesRegexp(ValueError, error_msg):
+      with self.assertRaisesRegexp(
+          ValueError, "Operation 'import/A' has no attr named 'default_int'."):
         a[0].get_attr("default_int")
 
-    # Unknown attrs cannot be imported using C API. This test will eventually be
-    # deleted.
-    if not ops._USE_C_API:
-      # Attr only in producer_op_list with non-default value is preserved.
-      with ops.Graph().as_default():
-        a = importer.import_graph_def(
-            self._MakeGraphDef("""
-            node { name: 'A' op: 'OpWithFutureDefaultAttr'
-                   attr { key: 'default_int' value { i: 987 } } }
-            """),
-            return_elements=["A"],
-            producer_op_list=producer_op_list)
-        self.assertEqual(987, a[0].get_attr("default_int"))
-
   def testFunctions(self):
     dtype = dtypes.float32
+
     @function.Defun(dtype, dtype, dtype, dtype)
     def Grad(x, y, dout1, dout2):  # pylint: disable=unused-argument
       # Return the inputs for simplicity of testing. The correct return value
@@ -1299,6 +1196,7 @@ class ImportGraphDefTest(test.TestCase):
   def testImportInsideDefun(self):
     g = ops.Graph()
     with g.as_default():
+
       @function.Defun()
       def Add2(x, y):
         return math_ops.add(x, y)
@@ -1322,6 +1220,7 @@ class ImportGraphDefTest(test.TestCase):
   def testImportGraphWithFunctionTwice(self):
     g = ops.Graph()
     with g.as_default():
+
       @function.Defun()
       def Add2(x, y):
         return math_ops.add(x, y)
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 9a8477debb05fdf11d7d8c39adf9e5e55cc2caeb..535c6017f5fd0f8adf9ed091bd4477762e52b0e3 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -58,7 +58,7 @@ def load_op_library(library_filename):
   op_list_str = py_tf.TF_GetOpList(lib_handle)
   op_list = op_def_pb2.OpList()
   op_list.ParseFromString(compat.as_bytes(op_list_str))
-  wrappers = py_tf.GetEagerPythonWrappers(op_list_str)
+  wrappers = py_tf.GetPythonWrappers(op_list_str)
 
   # Delete the library handle to release any memory held in C
   # that are no longer needed.
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index 0532ed464cc7a2d7e1eb2908c360aed1316648ea..5cf86972100bd6f60dbdb0ec8f8239cebbf937e7 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -61,7 +61,6 @@ def _TestDir(test_name):
 # pylint: enable=invalid-name
 
 
-@test_util.with_c_api
 class SimpleMetaGraphTest(test.TestCase):
 
   def testNoVariables(self):
@@ -285,7 +284,6 @@ class SimpleMetaGraphTest(test.TestCase):
       self.assertIs(global_vars[0], trainable_vars[0])
 
 
-@test_util.with_c_api
 class ScopedMetaGraphTest(test.TestCase):
 
   def _testScopedExport(self, test_dir, exported_filenames):
@@ -841,7 +839,6 @@ class ScopedMetaGraphTest(test.TestCase):
     self.assertEqual("", str(graph2.as_graph_element("matmul").device))
 
 
-@test_util.with_c_api
 class MetaGraphWithVariableScopeTest(test.TestCase):
 
   def testMetricsCollection(self):
@@ -899,7 +896,6 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
         initializer = variables.local_variables_initializer()
 
 
-@test_util.with_c_api
 class ExportImportAcrossScopesTest(test.TestCase):
 
   def testPartionedVariables(self):
diff --git a/tensorflow/python/framework/op_def_library_test.py b/tensorflow/python/framework/op_def_library_test.py
index 84ca062ade3b32c37212ba2d5b7eb9c64fb1dfa5..66cfe213b3cc943de4cd423e8e2ffffbe0b49f8b 100644
--- a/tensorflow/python/framework/op_def_library_test.py
+++ b/tensorflow/python/framework/op_def_library_test.py
@@ -36,7 +36,6 @@ def _unknown_shape(op):
   return [tensor_shape.unknown_shape() for _ in op.outputs]
 
 
-@test_util.with_c_api
 class OpDefLibraryTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -1330,7 +1329,6 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
             self.assertEqual(t_c, [x.dtype for x in c])
 
 
-@test_util.with_c_api
 class OpDefLibraryGraphTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 908d0da35e89cb8b91f0a1f0194e198e877f75cf..7fb3a22f5ac77bb54ef37121fd464fa8d66ae9b1 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -1057,13 +1057,19 @@ def internal_convert_to_tensor(value,
 
   """
   if ctx is None: ctx = context.context()
-  if ctx.executing_eagerly():
-    # Fast path for EagerTensors that don't need any conversion.
-    if isinstance(value, EagerTensor):
+  if isinstance(value, EagerTensor):
+    if ctx.executing_eagerly():
+      # Fast path for EagerTensors that don't need any conversion.
       # Note that we don't check that value's dtype matches the dtype
       # argument.  We expect that the C runtime will do that checking
       # when we execute the kernel.
       return value
+    else:
+      graph = get_default_graph()
+      if not graph.building_function:
+        raise RuntimeError("Attempting to capture an EagerTensor without "
+                           "building a function.")
+      return graph.capture(value, name=name)
 
   if dtype is not None:
     dtype = dtypes.as_dtype(dtype)
@@ -1251,7 +1257,10 @@ def internal_convert_to_tensor_or_indexed_slices(value,
   Raises:
     ValueError: If `dtype` does not match the element type of `value`.
   """
-  if isinstance(value, _TensorLike):
+  if isinstance(value, EagerTensor) and not context.executing_eagerly():
+    return internal_convert_to_tensor(
+        value, dtype=dtype, name=name, as_ref=as_ref)
+  elif isinstance(value, _TensorLike):
     if dtype and not dtypes.as_dtype(dtype).is_compatible_with(value.dtype):
       raise ValueError(
           "Tensor conversion requested dtype %s for Tensor with dtype %s: %r" %
@@ -3446,8 +3455,9 @@ class Graph(object):
     # the name will still appear in _names_in_use even though the name hasn't
     # been used. This is ok, just leave _names_in_use as-is in this case.
     # TODO(skyewm): make the C API guarantee no name conflicts.
-    if ret.name not in self._names_in_use:
-      self._names_in_use[ret.name] = 1
+    name_key = ret.name.lower()
+    if name_key not in self._names_in_use:
+      self._names_in_use[name_key] = 1
     self._create_op_helper(ret, compute_device=compute_device)
     return ret
 
@@ -4163,20 +4173,27 @@ class Graph(object):
     """
     if self._name_stack:
       name = self._name_stack + "/" + name
-    i = self._names_in_use.get(name, 0)
-    # Increment the number for "name".
+
+    # For the sake of checking for names in use, we treat names as case
+    # insensitive (e.g. foo = Foo).
+    name_key = name.lower()
+    i = self._names_in_use.get(name_key, 0)
+    # Increment the number for "name_key".
     if mark_as_used:
-      self._names_in_use[name] = i + 1
+      self._names_in_use[name_key] = i + 1
     if i > 0:
-      base_name = name
-      # Make sure the composed name is not already used.
-      while name in self._names_in_use:
-        name = "%s_%d" % (base_name, i)
+      base_name_key = name_key
+      # Make sure the composed name key is not already used.
+      while name_key in self._names_in_use:
+        name_key = "%s_%d" % (base_name_key, i)
         i += 1
-      # Mark the composed name as used in case someone wants
+      # Mark the composed name_key as used in case someone wants
       # to call unique_name("name_1").
       if mark_as_used:
-        self._names_in_use[name] = 1
+        self._names_in_use[name_key] = 1
+
+      # Return the new name with the original capitalization of the given name.
+      name = "%s_%d" % (name, i-1)
     return name
 
   def get_name_scope(self):
@@ -5338,6 +5355,7 @@ def init_scope():
       # Names that end with trailing slashes are treated by `name_scope` as
       # absolute.
       scope = scope + '/'
+    inner_device_stack = default_graph._device_function_stack  # pylint: disable=protected-access
 
     outer_context = None
     if not _default_graph_stack.stack:
@@ -5366,9 +5384,23 @@ def init_scope():
       raise RuntimeError("All graphs are building functions, and no "
                          "eager context was previously active.")
 
-    with outer_context(), name_scope(scope), control_dependencies(
-        None), tape.stop_recording():
-      yield
+    outer_graph = None
+    outer_device_stack = None
+    try:
+      with outer_context(), name_scope(scope), control_dependencies(
+          None), tape.stop_recording():
+        if not context.executing_eagerly():
+          # The device stack is preserved when lifting into a graph. Eager
+          # execution doesn't implement device stacks and in particular it
+          # doesn't support device functions, so in general it's not possible
+          # to do the same when lifting into the eager context.
+          outer_graph = get_default_graph()
+          outer_device_stack = outer_graph._device_function_stack  # pylint: disable=protected-access
+          outer_graph._device_function_stack = inner_device_stack  # pylint: disable=protected-access
+        yield
+    finally:
+      if outer_graph is not None:
+        outer_graph._device_function_stack = outer_device_stack  # pylint: disable=protected-access
 
 
 @tf_export("enable_eager_execution")
@@ -5402,36 +5434,30 @@ def enable_eager_execution(config=None, device_policy=None,
       in which operations are executed. Note that @{tf.ConfigProto} is also
       used to configure graph execution (via @{tf.Session}) and many options
       within `tf.ConfigProto` are not implemented (or are irrelevant) when
-     eager execution is enabled.
+      eager execution is enabled.
     device_policy: (Optional.) Policy controlling how operations requiring
-     inputs on a specific device (e.g., a GPU 0) handle inputs on a different
-     device  (e.g. GPU 1 or CPU). When set to None, an appropriate value will be
-     picked automatically. The value picked may change between TensorFlow
-     releases.
-     Valid values:
-
+      inputs on a specific device (e.g., a GPU 0) handle inputs on a different
+      device  (e.g. GPU 1 or CPU). When set to None, an appropriate value will be
+      picked automatically. The value picked may change between TensorFlow
+      releases.
+      Valid values:
       - tf.contrib.eager.DEVICE_PLACEMENT_EXPLICIT: raises an error if the
         placement is not correct.
-
       - tf.contrib.eager.DEVICE_PLACEMENT_WARN: copies the tensors which are not
         on the right device but logs a warning.
-
       - tf.contrib.eager.DEVICE_PLACEMENT_SILENT: silently copies the tensors.
         Note that this may hide performance problems as there is no notification
         provided when operations are blocked on the tensor being copied between
         devices.
-
       - tf.contrib.eager.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies
         int32 tensors, raising errors on the other ones.
     execution_mode: (Optional.) Policy controlling how operations dispatched are
       actually executed. When set to None, an appropriate value will be picked
       automatically. The value picked may change between TensorFlow releases.
       Valid values:
-
-        - tf.contrib.eager.SYNC: executes each operation synchronously.
-
-        - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
-          operations may return "non-ready" handles.
+      - tf.contrib.eager.SYNC: executes each operation synchronously.
+      - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
+        operations may return "non-ready" handles.
 
   Raises:
     ValueError: If eager execution is enabled after creating/executing a
@@ -5455,8 +5481,8 @@ def enable_eager_execution(config=None, device_policy=None,
   # pylint: disable=protected-access
   if context._default_mode == context.GRAPH_MODE:
     graph_mode_has_been_used = (
-        _default_session_stack.stack or
-        _default_graph_stack._global_default_graph is not None)
+        _default_session_stack.stack
+        or len(get_default_graph().get_operations()) > 0)  # pylint: disable=g-explicit-length-test
     if graph_mode_has_been_used:
       raise ValueError(
           "tf.enable_eager_execution must be called at program startup.")
@@ -5553,6 +5579,10 @@ def get_default_graph():
   """
   return _default_graph_stack.get_default()
 
+def has_default_graph():
+  """Returns True if there is a default graph."""
+  return len(_default_graph_stack.stack) >= 1
+
 
 def get_name_scope():
   """Returns the current name scope in the default_graph.
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index c9c1a3d66be1051859b3dc4eef67803881efcd55..f8501b27aea3e47f37c60ecba3401a2b40ec77b9 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -23,7 +23,6 @@ import threading
 import weakref
 
 from tensorflow.core.framework import attr_value_pb2
-from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
@@ -43,7 +42,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import resources
@@ -56,7 +54,6 @@ from tensorflow.python.util import compat
 ops._set_call_cpp_shape_fn(common_shapes.call_cpp_shape_fn)
 
 
-@test_util.with_c_api
 class ResourceTest(test_util.TensorFlowTestCase):
 
   def testBuildGraph(self):
@@ -82,7 +79,6 @@ class ResourceTest(test_util.TensorFlowTestCase):
                   resources.shared_resources()).eval()), 0)
 
 
-@test_util.with_c_api
 class TensorAndShapeTest(test_util.TensorFlowTestCase):
 
   def testShape(self):
@@ -141,7 +137,6 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
         _ = a + b
 
 
-@test_util.with_c_api
 class IndexedSlicesTest(test_util.TensorFlowTestCase):
 
   def testToTensor(self):
@@ -170,7 +165,6 @@ class IndexedSlicesTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(x.indices.eval(), [0, 2])
 
 
-@test_util.with_c_api
 class NodeDefConstructorTest(test_util.TensorFlowTestCase):
 
   def testNoArgs(self):
@@ -193,7 +187,6 @@ def _apply_op(g, *args, **kwargs):
     return op.outputs
 
 
-@test_util.with_c_api
 class OperationTest(test_util.TensorFlowTestCase):
 
   def testNoInputs(self):
@@ -443,12 +436,8 @@ class OperationTest(test_util.TensorFlowTestCase):
                      attr_value_pb2.NameAttrList(name="MyFunc"))
 
     # Try fetching missing attr
-    if ops._USE_C_API:
-      error_msg = "Operation 'FuncAttr' has no attr named 'FakeAttr'."
-    else:
-      error_msg = "No attr named 'FakeAttr' in name: \"FuncAttr\""
-
-    with self.assertRaisesRegexp(ValueError, error_msg):
+    with self.assertRaisesRegexp(
+        ValueError, "Operation 'FuncAttr' has no attr named 'FakeAttr'."):
       op.get_attr("FakeAttr")
 
   # TODO(b/65162920): remove this test when users who are directly mutating the
@@ -461,23 +450,6 @@ class OperationTest(test_util.TensorFlowTestCase):
 
   # TODO(nolivia): test all error cases
   def testAddControlInput(self):
-    # The C API dedups redundant control edges, pure Python does not
-    if ops._USE_C_API: return
-    with ops.Graph().as_default():
-      x = constant_op.constant(1).op
-      y = constant_op.constant(2).op
-      z = constant_op.constant(3).op
-    z._add_control_input(x)  # pylint: disable=protected-access
-    self.assertEqual(z.control_inputs, [x])
-    z._add_control_input(x)  # pylint: disable=protected-access
-    self.assertEqual(z.control_inputs, [x, x])
-    z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
-    self.assertEqual(z.control_inputs, [x, x, x, y, y])
-    self.assertEqual(x._control_outputs, [z])
-
-  def testAddControlInputC(self):
-    # The C API dedups redundant control edges, pure Python does not
-    if not ops._USE_C_API: return
     with ops.Graph().as_default():
       x = constant_op.constant(1).op
       y = constant_op.constant(2).op
@@ -515,8 +487,6 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(list(f.op.inputs), [d, e])
 
   def testControlInputCycle(self):
-    # Non-C API path has a different error message
-    if not ops._USE_C_API: return
     graph = ops.Graph()
     with graph.as_default():
       z = constant_op.constant(0)
@@ -586,25 +556,6 @@ class OperationTest(test_util.TensorFlowTestCase):
         sess.run(z)
 
   def testUpdateInputShapeError(self):
-    # C-API throws the error differently.
-    if ops._USE_C_API:
-      return
-    g = ops.Graph()
-    with g.as_default():
-      w = constant_op.constant(2, shape=[3, 1])
-      x = constant_op.constant(0, shape=[3, 1])
-      y = constant_op.constant(1, shape=[2, 2])
-      z = w + x
-      z.op._update_input(0, y)  # pylint: disable=protected-access
-
-    with session.Session(graph=g) as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   r"Incompatible shapes: \[2,2\] vs. \[3,1\]"):
-        sess.run(z)
-
-  def testUpdateInputShapeErrorC(self):
-    if not ops._USE_C_API:
-      return
     g = ops.Graph()
     with g.as_default():
       w = constant_op.constant(2, shape=[3, 1])
@@ -617,17 +568,6 @@ class OperationTest(test_util.TensorFlowTestCase):
       z.op._update_input(0, y)  # pylint: disable=protected-access
 
   def testUpdateInputOutOfRange(self):
-    # C-API throws the error differently.
-    if ops._USE_C_API: return
-    g = ops.Graph()
-    with g.as_default():
-      x = constant_op.constant(1)
-    with self.assertRaisesRegexp(IndexError, "list index out of range"):
-      x.op._update_input(1, x)  # pylint: disable=protected-access
-
-  def testUpdateInputOutOfRangeC(self):
-    # C-API throws the error differently.
-    if not ops._USE_C_API: return
     g = ops.Graph()
     with g.as_default():
       x = constant_op.constant(1)
@@ -643,11 +583,9 @@ class OperationTest(test_util.TensorFlowTestCase):
     y = constant_op.constant(1)
     z = x + y
 
-    # Pure Python mode doesn't create OpDefs for constants
-    if ops._USE_C_API:
-      self.assertEqual(x.op.op_def.name, "Const")
-      self.assertEqual(len(x.op.op_def.input_arg), 0)
-      self.assertEqual(len(x.op.op_def.output_arg), 1)
+    self.assertEqual(x.op.op_def.name, "Const")
+    self.assertEqual(len(x.op.op_def.input_arg), 0)
+    self.assertEqual(len(x.op.op_def.output_arg), 1)
 
     self.assertEqual(z.op.op_def.name, "Add")
     self.assertEqual(len(z.op.op_def.input_arg), 2)
@@ -673,7 +611,6 @@ class OperationTest(test_util.TensorFlowTestCase):
       op.inputs.append(None)
 
 
-@test_util.with_c_api
 class CreateOpTest(test_util.TensorFlowTestCase):
 
   def testNodeDefArgs(self):
@@ -738,20 +675,15 @@ class CreateOpTest(test_util.TensorFlowTestCase):
 # the control flow context isn't set properly, but a more complicated use case
 # that might not be obvious to test will fail). Thus we instead explicitly test
 # the low-level behavior.
-@test_util.with_c_api
 class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
     g = ops.Graph()
     with g.as_default():
       x = test_ops.int_output()
-      if ops._USE_C_API:
-        c_op = ops._create_c_op(
-            g, ops._NodeDef("IntInputIntOutput", "myop"), [x], [])
-        op = g._create_op_from_tf_operation(c_op)
-      else:
-        # Test pure-Python version to make sure C API has same behavior.
-        op = test_ops.int_input_int_output(x, name="myop").op
+      c_op = ops._create_c_op(
+          g, ops._NodeDef("IntInputIntOutput", "myop"), [x], [])
+      op = g._create_op_from_tf_operation(c_op)
 
     self.assertEqual(op.name, "myop")
     self.assertEqual(op.type, "IntInputIntOutput")
@@ -770,12 +702,8 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     g = ops.Graph()
     with g.as_default():
       x = constant_op.constant([[1, 2, 3], [4, 5, 6]])
-      if ops._USE_C_API:
-        c_op = ops._create_c_op(g, ops._NodeDef("Identity", "myop"), [x], [])
-        op = g._create_op_from_tf_operation(c_op)
-      else:
-        # Test pure-Python version to make sure C API has same behavior.
-        op = array_ops.identity(x, name="myop").op
+      c_op = ops._create_c_op(g, ops._NodeDef("Identity", "myop"), [x], [])
+      op = g._create_op_from_tf_operation(c_op)
 
     self.assertEqual(op.name, "myop")
     self.assertEqual(op.type, "Identity")
@@ -785,15 +713,10 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
   def testUniqueName(self):
     g = ops.Graph()
     with g.as_default():
-      if ops._USE_C_API:
-        c_op = ops._create_c_op(g, ops._NodeDef("IntOutput", "myop"), [], [])
-        c_op2 = ops._create_c_op(g, ops._NodeDef("IntOutput", "myop_1"), [], [])
-        op = g._create_op_from_tf_operation(c_op)
-        op2 = g._create_op_from_tf_operation(c_op2)
-      else:
-        # Test pure-Python version to make sure C API has same behavior.
-        op = test_ops.int_output(name="myop").op
-        op2 = test_ops.int_output(name="myop_1").op
+      c_op = ops._create_c_op(g, ops._NodeDef("IntOutput", "myop"), [], [])
+      c_op2 = ops._create_c_op(g, ops._NodeDef("IntOutput", "myop_1"), [], [])
+      op = g._create_op_from_tf_operation(c_op)
+      op2 = g._create_op_from_tf_operation(c_op2)
 
       # Create ops with same names as op1 and op2. We expect the new names to be
       # uniquified.
@@ -811,14 +734,10 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
       x = test_ops.int_output()
 
       def true_fn():
-        if ops._USE_C_API:
-          ops._create_c_op(ops.get_default_graph(),
-                           ops._NodeDef("IntInput", "cond/myop"), [x], [])
-          new_ops = g._add_new_tf_operations()
-          self.assertEqual(len(new_ops), 1)
-        else:
-          # Test pure-Python version to make sure C API has same behavior.
-          test_ops.int_input(x, name="myop")
+        ops._create_c_op(ops.get_default_graph(),
+                         ops._NodeDef("IntInput", "cond/myop"), [x], [])
+        new_ops = g._add_new_tf_operations()
+        self.assertEqual(len(new_ops), 1)
         return x
 
       control_flow_ops.cond(x < 10, true_fn, lambda: x)
@@ -844,14 +763,10 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
       x = test_ops.int_output()
 
       def body(i):
-        if ops._USE_C_API:
-          ops._create_c_op(ops.get_default_graph(),
-                           ops._NodeDef("IntInput", "myloop/myop"), [x], [])
-          new_ops = g._add_new_tf_operations()
-          self.assertEqual(len(new_ops), 1)
-        else:
-          # Test pure-Python version to make sure C API has same behavior.
-          test_ops.int_input(x, name="myop")
+        ops._create_c_op(ops.get_default_graph(),
+                         ops._NodeDef("IntInput", "myloop/myop"), [x], [])
+        new_ops = g._add_new_tf_operations()
+        self.assertEqual(len(new_ops), 1)
         return i
 
       control_flow_ops.while_loop(lambda i: i < 10, body, [0], name="myloop")
@@ -878,15 +793,11 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
 
       def body(i):
         c = constant_op.constant(1.0, name="c")
-        if ops._USE_C_API:
-          ops._create_c_op(ops.get_default_graph(),
-                           ops._NodeDef("IntInput", "myloop/myop"), [x], [])
-          with ops.control_dependencies([c]):
-            new_ops = g._add_new_tf_operations()
-            self.assertEqual(len(new_ops), 1)
-        else:
-          with ops.control_dependencies([c]):
-            test_ops.int_input(x, name="myop")
+        ops._create_c_op(ops.get_default_graph(),
+                         ops._NodeDef("IntInput", "myloop/myop"), [x], [])
+        with ops.control_dependencies([c]):
+          new_ops = g._add_new_tf_operations()
+          self.assertEqual(len(new_ops), 1)
         return i
 
       control_flow_ops.while_loop(lambda i: i < 10, body, [0], name="myloop")
@@ -905,15 +816,11 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
       c = constant_op.constant(1.0)
 
       def body(i):
-        if ops._USE_C_API:
-          ops._create_c_op(ops.get_default_graph(),
-                           ops._NodeDef("IntInput", "myloop/myop"), [x], [])
-          with ops.control_dependencies([c]):
-            new_ops = g._add_new_tf_operations()
-            self.assertEqual(len(new_ops), 1)
-        else:
-          with ops.control_dependencies([c]):
-            test_ops.int_input(x, name="myop")
+        ops._create_c_op(ops.get_default_graph(),
+                         ops._NodeDef("IntInput", "myloop/myop"), [x], [])
+        with ops.control_dependencies([c]):
+          new_ops = g._add_new_tf_operations()
+          self.assertEqual(len(new_ops), 1)
         return i
 
       control_flow_ops.while_loop(lambda i: i < 10, body, [0], name="myloop")
@@ -925,7 +832,6 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     self.assertIsNotNone(op.control_inputs[0]._get_control_flow_context())
 
 
-@test_util.with_c_api
 class ApplyOpTest(test_util.TensorFlowTestCase):
 
   def testNodeDefArgs(self):
@@ -979,7 +885,6 @@ class ApplyOpTest(test_util.TensorFlowTestCase):
         out_3.op.node_def)
 
 
-@test_util.with_c_api
 class NameStackTest(test_util.TensorFlowTestCase):
 
   def testBasics(self):
@@ -1063,6 +968,15 @@ class NameStackTest(test_util.TensorFlowTestCase):
     self.assertEqual("foo_1", g.unique_name("foo"))
     self.assertEqual("foo_3", g.unique_name("foo"))
 
+  def testUniqueNameCaseInsensitivity(self):
+    g = ops.Graph()
+    self.assertEqual("foo", g.unique_name("foo"))
+    self.assertEqual("Foo_1", g.unique_name("Foo"))
+    with g.name_scope("bar"):
+      self.assertEqual("bar/foo", g.unique_name("foo"))
+    with g.name_scope("Bar"):
+      self.assertEqual("Bar_1/foo", g.unique_name("foo"))
+
   def testInvalidNameRaisesError(self):
     g = ops.Graph()
     with g.name_scope(""):  # Should not raise
@@ -1078,7 +992,6 @@ class NameStackTest(test_util.TensorFlowTestCase):
         pass
 
 
-@test_util.with_c_api
 class NameTest(test_util.TensorFlowTestCase):
 
   def testGenerateName(self):
@@ -1148,7 +1061,6 @@ class NameTest(test_util.TensorFlowTestCase):
                        g.create_op("FloatOutput", [], [dtypes.float32]).name)
 
 
-@test_util.with_c_api
 class DeviceTest(test_util.TensorFlowTestCase):
 
   def testNoDevice(self):
@@ -1385,7 +1297,6 @@ class DeviceTest(test_util.TensorFlowTestCase):
     """, gd)
 
 
-@test_util.with_c_api
 class MultithreadedGraphStateTest(test_util.TensorFlowTestCase):
 
   class TestThread(threading.Thread):
@@ -1588,7 +1499,6 @@ class MultithreadedGraphStateTest(test_util.TensorFlowTestCase):
       self.assertEquals("foo" + s + "/FloatOutput_1", t.result[1].name)
 
 
-@test_util.with_c_api
 class ObjectWithName(object):
 
   def __init__(self, name):
@@ -1599,7 +1509,6 @@ class ObjectWithName(object):
     return self._name
 
 
-@test_util.with_c_api
 class CollectionTest(test_util.TensorFlowTestCase):
 
   def test_get_collections(self):
@@ -1723,7 +1632,6 @@ def _CopyOverrideGrad(op, x_grad):  # pylint: disable=invalid-name
   return x_grad
 
 
-@test_util.with_c_api
 class RegistrationTest(test_util.TensorFlowTestCase):
 
   def testRegisterGradients(self):
@@ -1751,7 +1659,6 @@ class RegistrationTest(test_util.TensorFlowTestCase):
         ops.get_gradient_function(y.op)
 
 
-@test_util.with_c_api
 class ComparisonTest(test_util.TensorFlowTestCase):
 
   def testMembershipAllowed(self):
@@ -1764,10 +1671,8 @@ class ComparisonTest(test_util.TensorFlowTestCase):
     self.assertTrue(t1 not in [t2])
 
 
-@test_util.with_c_api
 class ControlDependenciesTest(test_util.TensorFlowTestCase):
 
-  @test_util.enable_c_api
   def testBasic(self):
     g = ops.Graph()
     with g.as_default():
@@ -1971,7 +1876,6 @@ class ControlDependenciesTest(test_util.TensorFlowTestCase):
     self.assertEqual(b.op.control_inputs, [])
 
 
-@test_util.with_c_api
 class OpScopeTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -2138,6 +2042,21 @@ class InitScopeTest(test_util.TensorFlowTestCase):
     self.assertEqual(len(g1.get_operations()), 0)
     self.assertEqual(len(g0.get_operations()), 1)
 
+  def testPreservesDevices(self):
+    g0 = ops.Graph()
+    with g0.as_default(), ops.device("CPU:0"):
+      g1 = ops.Graph()
+      g1._building_function = True  # pylint: disable=protected-access
+      with g1.as_default(), ops.device("GPU:0"):
+        with ops.init_scope():
+          # init_scope should preserve device set under `g1`.
+          on_gpu = constant_op.constant(1.0)
+          self.assertEqual(on_gpu.device, "/device:GPU:0")
+        still_on_gpu = constant_op.constant(1.0)
+        self.assertEqual(still_on_gpu.device, "/device:GPU:0")
+      on_cpu = constant_op.constant(1.0)
+      self.assertEqual(on_cpu.device, "/device:CPU:0")
+
   def testComposes(self):
     g0 = ops.Graph()
     g1 = ops.Graph()
@@ -2330,7 +2249,6 @@ class InitScopeTest(test_util.TensorFlowTestCase):
       self.assertEqual(ops.get_name_scope(), "")
 
 
-@test_util.with_c_api
 class GraphTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -2440,7 +2358,6 @@ class GraphTest(test_util.TensorFlowTestCase):
         sess.run(a)
 
 
-@test_util.with_c_api
 class AttrScopeTest(test_util.TensorFlowTestCase):
 
   def _get_test_attrs(self):
@@ -2491,10 +2408,8 @@ class AttrScopeTest(test_util.TensorFlowTestCase):
 ops.RegisterShape("KernelLabel")(common_shapes.scalar_shape)
 
 
-@test_util.with_c_api
 class KernelLabelTest(test_util.TensorFlowTestCase):
 
-  @test_util.enable_c_api
   def testNoLabel(self):
     with self.test_session():
       self.assertAllEqual(b"My label is: default",
@@ -2522,7 +2437,6 @@ class KernelLabelTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(b"My label is: overload_2", overload_2.eval())
 
 
-@test_util.with_c_api
 class AsGraphDefTest(test_util.TensorFlowTestCase):
 
   def testGraphDefVersion(self):
@@ -2589,7 +2503,6 @@ def _calc_a_forward_flops(unused_graph, unused_node):
   return ops.OpStats("flops", 20)
 
 
-@test_util.with_c_api
 class StatisticsTest(test_util.TensorFlowTestCase):
 
   def testRegisteredNode(self):
@@ -2614,7 +2527,6 @@ class StatisticsTest(test_util.TensorFlowTestCase):
     self.assertEqual(3, flops_total.value)
 
 
-@test_util.with_c_api
 class ColocationGroupTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
@@ -2739,15 +2651,11 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual("/device:CPU:0", b.device)
 
 
-@test_util.with_c_api
 class DeprecatedTest(test_util.TensorFlowTestCase):
 
   def testSuccess(self):
-    # TODO(skyewm): make g.graph_def_versions work with the C API enabled
-    if ops._USE_C_API: return
-
     with ops.Graph().as_default() as g:
-      g.graph_def_versions.producer = 7
+      test_util.set_producer_version(g, 7)
       old = test_ops.old()
       with self.test_session(graph=g):
         old.run()
@@ -2762,20 +2670,7 @@ class DeprecatedTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(NotImplementedError, self._error()):
         test_ops.old()
 
-  def testGraphExecutionFail(self):
-    # TODO(skyewm): make g.graph_def_versions work with the C API enabled
-    if ops._USE_C_API: return
 
-    with ops.Graph().as_default() as g:
-      g.graph_def_versions.producer = 7
-      old = test_ops.old()
-      g.graph_def_versions.producer = versions.GRAPH_DEF_VERSION
-      with self.test_session(graph=g):
-        with self.assertRaisesRegexp(errors.UnimplementedError, self._error()):
-          old.run()
-
-
-@test_util.with_c_api
 class DenseTensorLikeTypeTest(test_util.TensorFlowTestCase):
 
   def testSuccess(self):
@@ -2825,7 +2720,6 @@ class DenseTensorLikeTypeTest(test_util.TensorFlowTestCase):
           DenseTensorLikeTypeTest.BadClassBadDtype)
 
 
-@test_util.with_c_api
 class NameScopeTest(test_util.TensorFlowTestCase):
 
   def testStripAndPrependScope(self):
@@ -2876,7 +2770,6 @@ class NameScopeTest(test_util.TensorFlowTestCase):
     self.assertRaisesRegexp(ValueError, "'_' is not a valid scope name", f)
 
 
-@test_util.with_c_api
 class TracebackTest(test_util.TensorFlowTestCase):
 
   def testTracebackWithStartLines(self):
@@ -2898,57 +2791,6 @@ class TracebackTest(test_util.TensorFlowTestCase):
           self.assertEquals(frame, frame_with_start_line[:-1])
 
 
-@test_util.with_c_api
-class OutputTypesTest(test_util.TensorFlowTestCase):
-  """Tests Operation._output_types property.
-
-  This test should not exist as _output_types is a private property.
-  This property is used by util.copy_elements and its tests would normally
-  cover Operation._output_types. However, we can't yet run these tests in C
-  API mode because their use _set_device method. This test will be deleted
-  once we port _set_device and run the copy tests with C API on.
-  """
-  # TODO(iga): Remove this test
-
-  def setUp(self):
-    self.prev_use_c_api = ops._USE_C_API  # pylint: disable=protected-access
-    ops._USE_C_API = True  # pylint: disable=protected-access
-
-  def tearDown(self):
-    ops._USE_C_API = self.prev_use_c_api  # pylint: disable=protected-access
-
-  def testOneOutput(self):
-    g = ops.Graph()
-    with g.as_default():
-      # Using a constant because creating unregistered ops
-      # doesn't work with the C API.
-      op = constant_op.constant(12, dtype=dtypes.uint16).op
-      # pylint: disable=protected-access
-      self.assertEqual([types_pb2.DT_UINT16], op._output_types)
-      # pylint: enable=protected-access
-
-  def testTwoDifferentOutputs(self):
-    g = ops.Graph()
-    with g.as_default():
-      x = constant_op.constant([1, 1, 2, 4, 4, 4, 7, 8, 8],
-                               dtype=dtypes.double)
-      y, _ = gen_array_ops.unique(x)
-      self.assertEqual([types_pb2.DT_DOUBLE, types_pb2.DT_INT32],
-                       y.op._output_types)  # pylint: disable=protected-access
-
-  def testThreeOutputs(self):
-    g = ops.Graph()
-    with g.as_default():
-      # Using a split operationt because creating unregistered ops
-      # doesn't work with the C API.
-      a = constant_op.constant("abc", dtype=dtypes.string, shape=[5, 30])
-      split0, _, _ = array_ops.split(a, [4, 15, 11], 1)
-      # pylint: disable=protected-access
-      self.assertEqual([types_pb2.DT_STRING] * 3, split0.op._output_types)
-      # pylint: enable=protected-access
-
-
-@test_util.with_c_api
 class EnableEagerExecutionTest(test_util.TensorFlowTestCase):
 
   def testBadArgumentsToEnableEagerExecution(self):
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index ad6c36b4b1773ecbbb5be56f7eaf2d78af3d010c..ec3748b40ec53814f036ca3463c1840d31bc1140 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include "tensorflow/python/framework/python_op_gen.h"
 
 #include <stdio.h>
@@ -26,8 +25,6 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/framework/tensor.pb_text.h"
-#include "tensorflow/core/framework/tensor.pb.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -41,792 +38,913 @@ limitations under the License.
 #include "tensorflow/python/framework/python_op_gen_internal.h"
 
 namespace tensorflow {
-namespace python_op_gen_internal {
+namespace {
 
 const int kRightMargin = 78;
 
-bool IsPythonReserved(const string& s) {
-  static const std::set<string>* const kPythonReserved = new std::set<string>(
-      {// Keywords in Python, from:
-       //   import keyword
-       //   print keyword.kwlist
-       "and", "as", "assert", "break", "class", "continue", "def", "del",
-       "elif", "else", "except", "exec", "finally", "for", "from", "global",
-       "if", "import", "in", "is", "lambda", "not", "or", "pass", "print",
-       "raise", "return", "try", "while", "with", "yield",
-       // Built-in functions and types in Python, from:
-       //   [x for x in dir(__builtins__) if not x[0].islower()]
-       "ArithmeticError", "AssertionError", "AttributeError", "BaseException",
-       "BufferError", "BytesWarning", "DeprecationWarning", "EOFError",
-       "Ellipsis", "EnvironmentError", "Exception", "False",
-       "FloatingPointError", "FutureWarning", "GeneratorExit", "IOError",
-       "ImportError", "ImportWarning", "IndentationError", "IndexError",
-       "KeyError", "KeyboardInterrupt", "LookupError", "MemoryError",
-       "NameError", "None", "NotImplemented", "NotImplementedError", "OSError",
-       "OverflowError", "PendingDeprecationWarning", "ReferenceError",
-       "RuntimeError", "RuntimeWarning", "StandardError", "StopIteration",
-       "SyntaxError", "SyntaxWarning", "SystemError", "SystemExit", "TabError",
-       "True", "TypeError", "UnboundLocalError", "UnicodeDecodeError",
-       "UnicodeEncodeError", "UnicodeError", "UnicodeTranslateError",
-       "UnicodeWarning", "UserWarning", "ValueError", "Warning",
-       "ZeroDivisionError", "__debug__", "__doc__", "__import__", "__name__",
-       "__package__"});
-
-  return kPythonReserved->count(s) > 0;
-}
+constexpr char kEagerFallbackSuffix[] = "_eager_fallback";
 
-bool IsOpWithUnderscorePrefix(const string& s) {
-  static const std::set<string>* const kUnderscoreOps = new std::set<string>(
-      {// Lowercase built-in functions and types in Python, from:
-       // [x for x in dir(__builtins__) if x[0].islower()] except "round".
-       // These need to be excluded so they don't conflict with actual built-in
-       // functions since we use '*' imports.
-       "abs", "all", "any", "apply", "bin", "bool", "buffer", "bytearray",
-       "bytes", "callable", "chr", "classmethod", "cmp", "coerce", "compile",
-       "complex", "copyright", "credits", "delattr", "dict", "dir", "divmod",
-       "enumerate", "eval", "execfile", "exit", "file", "filter", "float",
-       "format", "frozenset", "getattr", "globals", "hasattr", "hash", "help",
-       "hex", "id", "input", "int", "intern", "isinstance", "issubclass",
-       "iter", "len", "license", "list", "locals", "long", "map", "max",
-       "memoryview", "min", "next", "object", "oct", "open", "ord", "pow",
-       "print", "property", "quit", "range", "raw_input", "reduce", "reload",
-       "repr", "reversed", "set", "setattr", "slice", "sorted", "staticmethod",
-       "str", "sum", "super", "tuple", "type", "unichr", "unicode", "vars",
-       "xrange", "zip",
-       // These have the same name as ops defined in Python and might be used
-       // incorrectly depending on order of '*' imports.
-       // TODO(annarev): reduce usage of '*' imports and remove these from the
-       // list.
-       "fused_batch_norm", "histogram_fixed_width", "stack",
-       "batch_norm_with_global_normalization", "clip_by_value"});
-  return kUnderscoreOps->count(s) > 0;
+string AttrVarName(const string& attr_name,
+                   std::unordered_map<string, string>* attr_expressions) {
+  const string var = strings::StrCat("_attr_", attr_name);
+  if (attr_expressions != nullptr) (*attr_expressions)[attr_name] = var;
+  return var;
 }
 
-string AvoidPythonReserved(const string& s) {
-  if (IsPythonReserved(s)) return strings::StrCat(s, "_");
-  return s;
+void AddInferredAttr(const string& indentation, const string& attr_name,
+                     const string& value_expression, string* result,
+                     std::unordered_map<string, string>* attr_expressions) {
+  strings::StrAppend(result, indentation,
+                     AttrVarName(attr_name, attr_expressions), " = ",
+                     value_expression, "\n");
 }
 
-// Indent the first line by "initial" spaces and all following lines
-// by "rest" spaces.
-string Indent(int initial, int rest, StringPiece in) {
-  // TODO(josh11b): Also word-wrapping?
-  string copy(in.data(), in.size());
-  str_util::StripTrailingWhitespace(&copy);
-  std::vector<string> v = str_util::Split(copy, '\n');
+string VectorToTuple(const std::vector<string>& l) {
+  if (l.size() == 1) return strings::StrCat("(", l.front(), ",)");
+  string ret = "(";
+  for (int i = 0; i < l.size(); ++i) {
+    if (i > 0) {
+      strings::StrAppend(&ret, ", ");
+    }
+    strings::StrAppend(&ret, l[i]);
+  }
+  strings::StrAppend(&ret, ")");
+  return ret;
+}
 
-  string result;
-  bool first = true;
-  for (const string& line : v) {
-    if (first) {
-      result = strings::StrCat(Spaces(initial), line, "\n");
-      first = false;
-    } else {
-      if (line.empty()) {
-        strings::StrAppend(&result, "\n");
+void Unflatten(const string& prefix, const std::vector<string>& output_sizes,
+               const string& var, string* result) {
+  for (int i = 0; i < output_sizes.size(); ++i) {
+    if (!output_sizes[i].empty()) {
+      strings::StrAppend(result, prefix, var, " = ");
+      if (i > 0) strings::StrAppend(result, var, "[:", i, "] + ");
+      if (i + 1 < output_sizes.size()) {
+        // Special case i == 0 to avoid "0 +" in the generated code.
+        if (i == 0) {
+          strings::StrAppend(result, "[", var, "[:", output_sizes[i], "]] + ",
+                             var, "[", output_sizes[i], ":]");
+        } else {
+          strings::StrAppend(result, "[", var, "[", i, ":", i, " + ",
+                             output_sizes[i], "]] + ", var, "[", i, " + ",
+                             output_sizes[i], ":]");
+        }
       } else {
-        strings::StrAppend(&result, Spaces(rest), line, "\n");
+        strings::StrAppend(result, "[", var, "[", i, ":]]");
       }
+      strings::StrAppend(result, "\n");
     }
   }
-  return result;
 }
 
-// Adds append to *dest, with a space if the first line will be <= width,
-// or a newline otherwise.
-void AppendWithinWidth(string* dest, StringPiece append, int width) {
-  auto first_line = append.find('\n');
-  if (first_line == string::npos) first_line = append.size();
-  if (dest->size() + first_line + 1 /* space */ > static_cast<size_t>(width)) {
-    strings::StrAppend(dest, "\n", append);
-  } else {
-    strings::StrAppend(dest, " ", append);
-  }
+string TensorPBString(const TensorProto& pb) {
+  // Note: This gets used in the argument list, and so must survive naive
+  // word wrapping.
+  return strings::StrCat("\"\"\"", ProtoShortDebugString(pb), "\"\"\"");
 }
 
-// Like DataTypeString() but uses the Python names for the
-// float types.
-string PythonDataTypeString(DataType dtype) {
-  switch (dtype) {
-    case DT_FLOAT:
-      return "float32";
-    case DT_DOUBLE:
-      return "float64";
-    default:
-      return DataTypeString(dtype);
+const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
+  for (int i = 0; i < api_def.in_arg_size(); ++i) {
+    if (api_def.in_arg(i).name() == name) {
+      return &api_def.in_arg(i);
+    }
   }
+  return nullptr;
 }
 
-string TypeString(DataType dtype, bool ref) {
-  if (ref) {
-    return strings::StrCat("mutable `", PythonDataTypeString(dtype), "`");
-  } else {
-    return strings::StrCat("`", PythonDataTypeString(dtype), "`");
+class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
+ public:
+  GenEagerPythonOp(const OpDef& op_def, const ApiDef& api_def,
+                   const string& function_name)
+      : python_op_gen_internal::GenPythonOp(op_def, api_def, function_name) {
+    op_name_ = function_name_;
+    str_util::ConsumePrefix(&op_name_, "_");
   }
-}
-
-string TypeListString(const AttrValue& value) {
-  string ret;
-  for (int t : value.list().type()) {
-    if (!ret.empty()) strings::StrAppend(&ret, ", ");
-    DataType dtype = static_cast<DataType>(t);
-    if (IsRefType(dtype)) {
-      strings::StrAppend(&ret, PythonDataTypeString(RemoveRefType(dtype)),
-                         " mutable");
+  ~GenEagerPythonOp() override {}
+
+  string Code() override;
+
+ protected:
+  void HandleGraphMode(const string& function_setup);
+
+  string GetEagerNotAllowedError();
+  void ExpectListArg(const string& indentation, const string& arg_name,
+                     string* output);
+  bool GetEagerFunctionSetup(const string& indentation, string* function_setup);
+  void GetOutputSizesAndNumOutputsExpr(std::vector<string>* output_sizes,
+                                       string* num_outputs_expr);
+
+  void AddEagerFunctionTeardown(const string& indentation,
+                                const std::vector<string>& output_sizes,
+                                bool execute_record_gradient);
+
+  bool AddEagerFastPathAndGraphCode(const string& parameters,
+                                    const std::vector<string>& output_sizes,
+                                    const string& eager_not_allowed_error);
+  bool AddEagerFallbackCode(const string& parameters,
+                            const std::vector<string>& output_sizes,
+                            const string& num_outputs_expr,
+                            const string& eager_not_allowed_error);
+  void AddEagerFastPathExecute();
+
+  void AddEagerInferredAttrs(const string& indentation);
+  void AddEagerInputCasts(const string& indentation);
+  void AddEagerAttrs(const string& indentation);
+  void AddEagerExecute(const string& indentation,
+                       const string& num_outputs_expr);
+
+  void AddAttrForArg(const string& attr, int arg_index) {
+    gtl::InsertIfNotPresent(&inferred_attrs_, attr,
+                            op_def_.input_arg(arg_index).name());
+    auto iter = attr_to_args_.find(attr);
+    if (iter == attr_to_args_.end()) {
+      attr_to_args_.insert(AttrToArgMap::value_type(attr, {arg_index}));
     } else {
-      strings::StrAppend(&ret, "`", PythonDataTypeString(dtype), "`");
+      iter->second.push_back(arg_index);
     }
   }
-  return ret;
-}
 
-string SingleTensorName(DataType dtype, bool is_ref) {
-  const string type_str = TypeString(dtype, is_ref);
-  return strings::StrCat("A `Tensor` of type ", type_str, ".");
-}
+  // Returns a string expression representing a flattened list of all
+  // the inputs given by `*input_indices` (or all inputs if
+  // `input_indices` is nullptr).  `*output_sizes` can be used to unflatten.
+  string FlattenInputs(const std::vector<int>* input_indices,
+                       std::vector<string>* output_sizes) const;
 
-const char kUnknownTensorType[] = {"A `Tensor`."};
-
-string ArgTypeName(const OpDef& op_def, const OpDef::ArgDef& arg,
-                   const std::unordered_map<string, string>& inferred_attrs,
-                   bool is_output) {
-  if (!arg.number_attr().empty()) {
-    // N Tensors with the same type
-    const string* original_arg =
-        gtl::FindOrNull(inferred_attrs, arg.number_attr());
-    string prefix;
-    if (original_arg == nullptr) {
-      prefix = strings::StrCat("A list of `", arg.number_attr(), "`");
-    } else if (*original_arg == arg.name()) {
-      const OpDef::AttrDef* attr = FindAttr(arg.number_attr(), op_def);
-      if (attr->has_minimum() && attr->minimum() > 0) {
-        prefix = strings::StrCat("A list of at least ", attr->minimum());
-      } else {
-        prefix = "A list of";
-      }
-    } else {
-      prefix = strings::StrCat("A list with the same length as `",
-                               AvoidPythonReserved(*original_arg), "` of");
-    }
+  StringPiece op_name_;
+  typedef std::unordered_map<string, std::vector<int>> AttrToArgMap;
+  AttrToArgMap attr_to_args_;
+  std::unordered_map<string, string> attr_expressions_;
+  // This has all the input args followed by those attrs that don't have
+  // defaults.
+  std::vector<python_op_gen_internal::ParamNames> params_no_default_;
+  // The parameters with defaults (these have to be listed after those without).
+  // No input args are included, just attrs.
+  std::vector<std::pair<python_op_gen_internal::ParamNames, string>>
+      params_with_default_;
+};
 
-    if (arg.type() != DT_INVALID) {
-      return strings::StrCat(prefix, " `Tensor` objects with type ",
-                             TypeString(arg.type(), arg.is_ref()), ".");
-    } else {
-      original_arg = gtl::FindOrNull(inferred_attrs, arg.type_attr());
-      if (arg.is_ref()) {
-        strings::StrAppend(&prefix, " mutable");
+string GetEagerPythonOp(const OpDef& op_def, const ApiDef& api_def,
+                        const string& function_name) {
+  return GenEagerPythonOp(op_def, api_def, function_name).Code();
+}
+
+string GenEagerPythonOp::FlattenInputs(
+    const std::vector<int>* input_indices,
+    std::vector<string>* output_sizes) const {
+  string inputs;
+  enum { STARTING, WAS_LIST_INPUT, WAS_SOLO_INPUT } inputs_state = STARTING;
+  const int n = input_indices != nullptr ? input_indices->size()
+                                         : op_def_.input_arg_size();
+  for (int j = 0; j < n; ++j) {
+    const int i = input_indices ? (*input_indices)[j] : j;
+    const auto& arg(op_def_.input_arg(i));
+    const bool is_list =
+        !arg.type_list_attr().empty() || !arg.number_attr().empty();
+    if (is_list) {
+      if (inputs_state == WAS_SOLO_INPUT) {
+        strings::StrAppend(&inputs, "] + ");
+      } else if (inputs_state == WAS_LIST_INPUT) {
+        strings::StrAppend(&inputs, " + ");
       }
-      if (original_arg == nullptr) {
-        return strings::StrCat(prefix, " `Tensor` objects with type `",
-                               arg.type_attr(), "`.");
-      } else if (*original_arg == arg.name()) {
-        const OpDef::AttrDef* attr = FindAttr(arg.type_attr(), op_def);
-        if (attr->has_allowed_values()) {
-          return strings::StrCat(prefix,
-                                 " `Tensor` objects with the same type in: ",
-                                 TypeListString(attr->allowed_values()), ".");
+      strings::StrAppend(&inputs, "list(", param_names_[i].GetRenameTo(), ")");
+      inputs_state = WAS_LIST_INPUT;
+      if (output_sizes != nullptr) {
+        if (!arg.number_attr().empty()) {
+          output_sizes->emplace_back(AttrVarName(arg.number_attr(), nullptr));
         } else {
-          return strings::StrCat(prefix,
-                                 " `Tensor` objects with the same type.");
+          output_sizes->emplace_back(
+              strings::StrCat("len(", param_names_[i].GetRenameTo(), ")"));
         }
-      } else {
-        return strings::StrCat(prefix,
-                               " `Tensor` objects with the same type as `",
-                               AvoidPythonReserved(*original_arg), "`.");
       }
-    }
-  } else if (!arg.type_attr().empty() || !arg.type_list_attr().empty()) {
-    const bool is_list = !arg.type_list_attr().empty();
-    const string attr_name = is_list ? arg.type_list_attr() : arg.type_attr();
-    const OpDef::AttrDef* attr = FindAttr(attr_name, op_def);
-    const string mutable_str = arg.is_ref() ? "mutable " : "";
-    const string prefix =
-        is_list ? strings::StrCat("A list of ", mutable_str, "`Tensor` objects")
-                : strings::StrCat("A ", mutable_str, "`Tensor`");
-    const string* original_arg = gtl::FindOrNull(inferred_attrs, attr_name);
-    if (original_arg == nullptr) {
-      return strings::StrCat(prefix, " of type `", attr_name, "`.");
-    } else if (*original_arg == arg.name()) {
-      if (attr->has_allowed_values()) {
-        if (is_list) {
-          return strings::StrCat(prefix, " with types from: ",
-                                 TypeListString(attr->allowed_values()), ".");
-        } else {
-          return strings::StrCat(
-              prefix, is_output ? ". Has one of the following types: "
-                                : ". Must be one of the following types: ",
-              TypeListString(attr->allowed_values()), ".");
-        }
+    } else {
+      if (inputs_state == WAS_SOLO_INPUT) {
+        strings::StrAppend(&inputs, ", ");
+      } else if (inputs_state == WAS_LIST_INPUT) {
+        strings::StrAppend(&inputs, " + [");
       } else {
-        return strings::StrCat(prefix, ".");
+        strings::StrAppend(&inputs, "[");
       }
-    } else {
-      return strings::StrCat(prefix,
-                             is_output ? ". Has the same type as `"
-                                       : ". Must have the same type as `",
-                             AvoidPythonReserved(*original_arg), "`.");
+      strings::StrAppend(&inputs, param_names_[i].GetRenameTo());
+      inputs_state = WAS_SOLO_INPUT;
+      if (output_sizes != nullptr) output_sizes->emplace_back();
     }
-  } else {
-    return SingleTensorName(arg.type(), arg.is_ref());
   }
+  if (inputs_state == STARTING) return "[]";
+  if (inputs_state == WAS_SOLO_INPUT) {
+    strings::StrAppend(&inputs, "]");
+  }
+  return inputs;
 }
 
-string GetReturns(const OpDef& op_def,
-                  const std::vector<string>& output_type_string) {
-  string result;
-  DCHECK_EQ(op_def.output_arg_size(), output_type_string.size());
-  const int num_outs = op_def.output_arg_size();
-  strings::StrAppend(&result, "\n  Returns:\n");
-  if (num_outs == 0) {
-    strings::StrAppend(&result, "    The created Operation.\n");
-  } else {
-    if (num_outs == 1) {
-      StringPiece description = op_def.output_arg(0).description();
-      if (ConsumeEquals(&description)) {  // Skip the generated type info.
-        strings::StrAppend(&result, Indent(4, 4, description));
-      } else {
-        // Special case of one output, don't use the name of the output unless
-        // there is no description.
-        string desc = output_type_string.empty() ? kUnknownTensorType
-                                                 : output_type_string[0];
-        if (desc == kUnknownTensorType) {
-          // Special case where we don't understand how the output tensor type
-          // depends on the input tensor types, just use the output arg
-          // description if we can.
-          if (!description.empty()) {
-            desc = op_def.output_arg(0).description();
-          } else if (!op_def.output_arg(0).name().empty()) {
-            desc = strings::StrCat(" The ", op_def.output_arg(0).name(),
-                                   " `Tensor`.");
+string GenEagerPythonOp::Code() {
+  if (api_def_.visibility() == ApiDef::SKIP) {
+    return "";
+  }
+
+  for (int i = 0; i < api_def_.arg_order_size(); ++i) {
+    const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
+    const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
+    params_no_default_.emplace_back(api_def_arg.name(),
+                                    api_def_arg.rename_to());
+    if (!arg.type_attr().empty()) {
+      AddAttrForArg(arg.type_attr(), i);
+    } else if (!arg.type_list_attr().empty()) {
+      AddAttrForArg(arg.type_list_attr(), i);
+    }
+    if (!arg.number_attr().empty()) {
+      AddAttrForArg(arg.number_attr(), i);
+    }
+  }
+  for (int i = 0; i < op_def_.attr_size(); ++i) {
+    const auto& attr(op_def_.attr(i));
+    const auto& api_def_attr(api_def_.attr(i));
+    // Do not add inferred attrs to the Python function signature.
+    if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
+      if (api_def_attr.has_default_value()) {
+        if (attr.type() == "tensor") {
+          params_with_default_.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              strings::StrCat(
+                  "_execute.make_tensor(",
+                  TensorPBString(api_def_attr.default_value().tensor()), ", \"",
+                  api_def_attr.rename_to(), "\")"));
+        } else if (attr.type() == "list(tensor)") {
+          std::vector<string> pbtxt;
+          for (const auto& pb : api_def_attr.default_value().list().tensor()) {
+            pbtxt.emplace_back(TensorPBString(pb));
           }
-        } else if (!description.empty()) {
-          AppendWithinWidth(&desc, description, kRightMargin - 4 /* indent */);
-        }
-        strings::StrAppend(&result, Indent(4, 4, desc));
-      }
-    } else {
-      std::vector<string> out_names(num_outs);
-      for (int i = 0; i < num_outs; ++i) {
-        if (!op_def.output_arg(i).name().empty()) {
-          out_names[i] = op_def.output_arg(i).name();
-        } else {
-          out_names[i] = strings::StrCat("output", i);
-        }
-      }
-      strings::StrAppend(&result, "    A tuple of `Tensor` objects (",
-                         str_util::Join(out_names, ", "), ").\n\n");
-      for (int i = 0; i < num_outs; ++i) {
-        string desc = strings::StrCat(out_names[i], ": ");
-        StringPiece description = op_def.output_arg(i).description();
-        if (ConsumeEquals(&description)) {  // Skip the generated type info.
-          strings::StrAppend(&desc, description);
+          params_with_default_.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              strings::StrCat("[_execute.make_tensor(_pb, \"",
+                              api_def_attr.rename_to(), "\") for _pb in ",
+                              VectorToTuple(pbtxt), "]"));
         } else {
-          const string type = static_cast<size_t>(i) < output_type_string.size()
-                                  ? output_type_string[i]
-                                  : kUnknownTensorType;
-          if (!description.empty()) {
-            if (type == kUnknownTensorType) {
-              // Special case where we don't understand how the output tensor
-              // type depends on the input tensor types, so we just use the
-              // output arg description.
-              strings::StrAppend(&desc, description);
-            } else {
-              strings::StrAppend(&desc, type, " ", description);
-            }
-          } else {
-            strings::StrAppend(&desc, type);
-          }
+          params_with_default_.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              python_op_gen_internal::AttrValueToPython(
+                  attr.type(), api_def_attr.default_value(), "_dtypes."));
         }
-        strings::StrAppend(&result, Indent(4, 6, desc));
+      } else {
+        params_no_default_.emplace_back(api_def_attr.name(),
+                                        api_def_attr.rename_to());
       }
     }
   }
-  return result;
-}
 
-string StringToPython(const string& str) {
-  return strings::StrCat("\"", str_util::CEscape(str), "\"");
-}
+  // Save the list of attr parameters (attrs that won't be inferred),
+  // those with defaults go at the end.
+  // Get the attrs in the order we want by taking the attrs without defaults
+  // from the end of params_no_default_, and adding params_no_default_.
+  attrs_.reserve(params_no_default_.size() - op_def_.input_arg_size() +
+                 params_with_default_.size());
+  for (int i = op_def_.input_arg_size(); i < params_no_default_.size(); ++i) {
+    attrs_.push_back(params_no_default_[i].GetName());
+  }
+  for (const auto& p : params_with_default_) {
+    attrs_.push_back(p.first.GetName());
+  }
 
-string DataTypeToPython(DataType dtype, const string& dtype_module) {
-  return strings::StrCat(dtype_module, PythonDataTypeString(dtype));
-}
+  param_names_.reserve(params_no_default_.size() + params_with_default_.size());
+  param_names_.insert(param_names_.begin(), params_no_default_.begin(),
+                      params_no_default_.end());
+  for (const auto& param_and_default : params_with_default_) {
+    param_names_.push_back(param_and_default.first);
+  }
 
-string ShapeToPython(const TensorShapeProto& shape) {
-  if (shape.unknown_rank()) {
-    return "None";
+  string parameters;
+  for (const auto& param : params_no_default_) {
+    if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
+    strings::StrAppend(&parameters, param.GetRenameTo());
   }
-  string python = "[";
-  for (const auto& dim : shape.dim()) {
-    if (python.size() > 1) strings::StrAppend(&python, ", ");
-    if (!dim.name().empty()) {
-      strings::StrAppend(&python, "(", StringToPython(dim.name()), ", ",
-                         dim.size(), ")");
-    } else {
-      strings::StrAppend(&python, dim.size());
+  for (const auto& param_and_default : params_with_default_) {
+    if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
+    strings::StrAppend(&parameters, param_and_default.first.GetRenameTo(), "=",
+                       param_and_default.second);
+  }
+  if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
+  strings::StrAppend(&parameters, "name=None");
+
+  // Add attr_expressions_ for attrs that are params.
+  for (int i = 0; i < attrs_.size(); ++i) {
+    const string& attr_name = attrs_[i];
+    const string& attr_api_name =
+        param_names_[i + op_def_.input_arg_size()].GetRenameTo();
+    attr_expressions_[attr_name] = attr_api_name;
+  }
+  // Add attr_expressions_ for attrs that are inferred.
+  for (int i = 0; i < op_def_.attr_size(); ++i) {
+    const auto& attr(op_def_.attr(i));
+    if (attr.type() == "int") {
+      auto arg_list = attr_to_args_.find(attr.name());
+      if (arg_list != attr_to_args_.end()) {
+        AttrVarName(attr.name(), &attr_expressions_);
+      }
     }
   }
-  strings::StrAppend(&python, "]");
-  return python;
-}
 
-string TensorToPython(const TensorProto& proto) {
-  return ProtoShortDebugString(proto);
-}
+  string num_outputs_expr;
+  std::vector<string> output_sizes(num_outs_);
+  GetOutputSizesAndNumOutputsExpr(&output_sizes, &num_outputs_expr);
 
-string AttrListToPython(const AttrValue& value,
-                        const string& dtype_module = "tf.") {
-  string ret;
-  if (value.list().s_size() > 0) {
-    for (int i = 0; i < value.list().s_size(); ++i) {
-      if (i > 0) strings::StrAppend(&ret, ", ");
-      strings::StrAppend(&ret, StringToPython(value.list().s(i)));
-    }
-  } else if (value.list().i_size() > 0) {
-    for (int i = 0; i < value.list().i_size(); ++i) {
-      if (i > 0) strings::StrAppend(&ret, ", ");
-      strings::StrAppend(&ret, value.list().i(i));
-    }
-  } else if (value.list().f_size() > 0) {
-    for (int i = 0; i < value.list().f_size(); ++i) {
-      if (i > 0) strings::StrAppend(&ret, ", ");
-      strings::StrAppend(&ret, value.list().f(i));
-    }
-  } else if (value.list().b_size() > 0) {
-    for (int i = 0; i < value.list().b_size(); ++i) {
-      if (i > 0) strings::StrAppend(&ret, ", ");
-      strings::StrAppend(&ret, value.list().b(i) ? "True" : "False");
-    }
-  } else if (value.list().type_size() > 0) {
-    for (int i = 0; i < value.list().type_size(); ++i) {
-      if (i > 0) strings::StrAppend(&ret, ", ");
-      strings::StrAppend(&ret,
-                         DataTypeToPython(value.list().type(i), dtype_module));
-    }
-  } else if (value.list().shape_size() > 0) {
-    for (int i = 0; i < value.list().shape_size(); ++i) {
-      if (i > 0) strings::StrAppend(&ret, ", ");
-      strings::StrAppend(&ret, ShapeToPython(value.list().shape(i)));
-    }
-  } else if (value.list().tensor_size() > 0) {
-    for (int i = 0; i < value.list().tensor_size(); ++i) {
-      if (i > 0) strings::StrAppend(&ret, ", ");
-      strings::StrAppend(&ret, TensorToPython(value.list().tensor(i)));
-    }
-  } else if (value.list().func_size() > 0) {
-    for (int i = 0; i < value.list().func_size(); ++i) {
-      if (i > 0) strings::StrAppend(&ret, ", ");
-      strings::StrAppend(&ret, StringToPython(value.list().func(i).name()));
-    }
+  string eager_not_allowed_error = GetEagerNotAllowedError();
+
+  if (!AddEagerFastPathAndGraphCode(parameters, output_sizes,
+                                    eager_not_allowed_error)) {
+    return result_;
   }
-  return ret;
+
+  if (!AddEagerFallbackCode(parameters, output_sizes, num_outputs_expr,
+                            eager_not_allowed_error)) {
+    return result_;
+  }
+
+  return prelude_ + result_;
 }
 
-// NOTE: The return value may contain spaces (for example, it could be
-// a string "foo bar" with an embedded space) and is not safe to pass
-// to WordWrap().
-string AttrValueToPython(const string& type, const AttrValue& value,
-                         const string& dtype_module) {
-  if (type == "string") {
-    return StringToPython(value.s());
-  } else if (type == "int") {
-    return strings::StrCat(value.i());
-  } else if (type == "float") {
-    if (std::isnan(value.f()) || std::isinf(value.f())) {
-      return strings::StrCat("float('", value.f(), "')");
+void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
+  // Handle graph-mode case
+  strings::StrAppend(&result_,
+                     "  _ctx = _context._context\n"
+                     "  if _ctx is None or not _ctx._eager_context.is_eager:\n",
+                     function_setup,
+                     "    _, _, _op = _op_def_lib._apply_op_helper(\n");
+  AddBodyNoReturn("        ");
+  if (num_outs_ > 0) {
+    strings::StrAppend(&result_, "    _result = _op.outputs[:]\n");
+    // Special case handling for stateful op with single list output
+    // that might be empty.
+    if (num_outs_ == 1 && op_def_.is_stateful() &&
+        (!op_def_.output_arg(0).number_attr().empty() ||
+         !op_def_.output_arg(0).type_list_attr().empty())) {
+      // TODO(josh11b): Can skip this if the number_attr/type_list_attr has
+      // a constraint indicating that this can never be empty.
+      strings::StrAppend(&result_,
+                         "    if not _result:\n"
+                         "      return _op\n");
+    }
+    strings::StrAppend(&result_, "    _inputs_flat = _op.inputs\n");
+
+    // Compute graph-mode attrs.
+    if (op_def_.attr_size() > 0) {
+      string attr_values;
+      for (int i = 0; i < op_def_.attr_size(); ++i) {
+        if (i > 0) strings::StrAppend(&attr_values, ", ");
+        const auto& attr_name(op_def_.attr(i).name());
+        strings::StrAppend(&attr_values, "\"", attr_name, "\", _op.get_attr(\"",
+                           attr_name, "\")");
+      }
+      strings::StrAppend(&attr_values, ")");
+      strings::StrAppend(&result_,
+                         WordWrap("    _attrs = (", attr_values, kRightMargin),
+                         "\n");
     } else {
-      return strings::StrCat(value.f());
+      strings::StrAppend(&result_, "    _attrs = None\n");
     }
-  } else if (type == "bool") {
-    return value.b() ? "True" : "False";
-  } else if (type == "type") {
-    return DataTypeToPython(value.type(), dtype_module);
-  } else if (type == "shape") {
-    return ShapeToPython(value.shape());
-  } else if (type == "tensor") {
-    return TensorToPython(value.tensor());
-  } else if (type == "func") {
-    return StringToPython(value.func().name());
-  } else if (str_util::StartsWith(type, "list(")) {
-    return strings::StrCat("[", AttrListToPython(value, dtype_module), "]");
   } else {
-    return "?";
+    strings::StrAppend(&result_, "    return _op\n");
   }
 }
 
-void GenerateLowerCaseOpName(const string& str, string* result) {
-  const char joiner = '_';
-  const int last_index = str.size() - 1;
-  for (int i = 0; i <= last_index; ++i) {
-    const char c = str[i];
-    // Emit a joiner only if a previous-lower-to-now-upper or a
-    // now-upper-to-next-lower transition happens.
-    if (isupper(c) && (i > 0)) {
-      if (islower(str[i - 1]) || ((i < last_index) && islower(str[i + 1]))) {
-        result->push_back(joiner);
-      }
+string GenEagerPythonOp::GetEagerNotAllowedError() {
+  bool eager_allowed = true;
+  string ref_arg;
+  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
+    const auto& arg = op_def_.input_arg(i);
+    if (arg.is_ref()) {
+      eager_allowed = false;
+      DCHECK_EQ(op_def_.input_arg(i).name(), api_def_.in_arg(i).name());
+      ref_arg = api_def_.in_arg(i).rename_to();
+    }
+  }
+  for (int i = 0; i < op_def_.output_arg_size(); ++i) {
+    const auto& arg = op_def_.output_arg(i);
+    if (arg.is_ref()) {
+      eager_allowed = false;
+      DCHECK_EQ(op_def_.output_arg(i).name(), api_def_.out_arg(i).name());
+      ref_arg = api_def_.out_arg(i).rename_to();
     }
-    result->push_back(tolower(c));
   }
+
+  if (eager_allowed) return "";
+
+  return strings::StrCat("raise RuntimeError(\"", op_name_,
+                         " op does not support eager execution. ", "Arg '",
+                         ref_arg, "' is a ref.\")\n");
 }
 
-static void AddDelimiter(string* append_to, const string& delim) {
-  if (!append_to->empty()) strings::StrAppend(append_to, delim);
+void GenEagerPythonOp::ExpectListArg(const string& indentation,
+                                     const string& arg_name, string* output) {
+  strings::StrAppend(output, indentation, "if not isinstance(", arg_name,
+                     ", (list, tuple)):\n", indentation, "  raise TypeError(\n",
+                     indentation, "      \"Expected list for '", arg_name,
+                     "' argument to \"\n", indentation, "      \"'", op_name_,
+                     "' Op, not %r.\" % ", arg_name, ")\n");
 }
 
-const ApiDef::Attr* FindAttr(StringPiece name, const ApiDef& api_def) {
-  for (int i = 0; i < api_def.attr_size(); ++i) {
-    if (api_def.attr(i).name() == name) {
-      return &api_def.attr(i);
+bool GenEagerPythonOp::GetEagerFunctionSetup(const string& indentation,
+                                             string* function_setup) {
+  // Validate list inputs, infer length attrs.
+  for (int i = 0; i < op_def_.attr_size(); ++i) {
+    const auto& attr(op_def_.attr(i));
+    if (attr.type() == "int") {
+      auto arg_list = attr_to_args_.find(attr.name());
+      if (arg_list != attr_to_args_.end()) {
+        // Inferred int attrs are the lengths of inputs. Validate those
+        // inputs are lists and have the same length.
+        for (auto iter = arg_list->second.begin();
+             iter != arg_list->second.end(); ++iter) {
+          const string& arg_api_name = param_names_[*iter].GetRenameTo();
+          ExpectListArg(indentation, arg_api_name, function_setup);
+          if (iter == arg_list->second.begin()) {
+            AddInferredAttr(indentation, attr.name(),
+                            strings::StrCat("len(", arg_api_name, ")"),
+                            function_setup, &attr_expressions_);
+          } else {
+            const auto& attr_var = attr_expressions_[attr.name()];
+            strings::StrAppend(
+                function_setup, indentation, "if len(", arg_api_name,
+                ") != ", attr_var, ":\n", indentation, "  raise ValueError(\n",
+                indentation, "      \"List argument '", arg_api_name, "' to '",
+                op_name_, "' Op with length %d \"\n", indentation,
+                "      \"must match length %d of argument '",
+                inferred_attrs_[attr.name()], "'.\" %\n", indentation,
+                "      (len(", arg_api_name, "), ", attr_var, "))\n");
+          }
+        }
+      }
     }
   }
-  return nullptr;
-}
 
-const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
-  for (int i = 0; i < api_def.in_arg_size(); ++i) {
-    if (api_def.in_arg(i).name() == name) {
-      return &api_def.in_arg(i);
+  for (int i = 0; i < attrs_.size(); ++i) {
+    const string& attr_name = attrs_[i];
+    const auto& param = param_names_[i + op_def_.input_arg_size()];
+    const auto& attr = *FindAttr(attr_name, op_def_);
+    const string& attr_api_name = param.GetRenameTo();
+    StringPiece attr_type = attr.type();
+    attr_expressions_[attr_name] = attr_api_name;
+    const int default_index = i - (attrs_.size() - params_with_default_.size());
+    if (default_index >= 0) {
+      const string& default_value = params_with_default_[default_index].second;
+      strings::StrAppend(function_setup, indentation, "if ", attr_api_name,
+                         " is None:\n");
+      strings::StrAppend(function_setup, indentation, "  ", attr_api_name,
+                         " = ", default_value, "\n");
+    }
+    if (str_util::StartsWith(attr_type, "list(")) {
+      ExpectListArg(indentation, attr_api_name, function_setup);
+    }
+
+    if (attr_type == "string") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_str(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(string)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_str(_s, \"", attr_api_name,
+                         "\") for _s in ", attr_api_name, "]\n");
+    } else if (attr_type == "int") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_int(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(int)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_int(_i, \"", attr_api_name,
+                         "\") for _i in ", attr_api_name, "]\n");
+    } else if (attr_type == "float") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_float(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(float)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_float(_f, \"", attr_api_name,
+                         "\") for _f in ", attr_api_name, "]\n");
+    } else if (attr_type == "bool") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_bool(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(bool)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_bool(_b, \"", attr_api_name,
+                         "\") for _b in ", attr_api_name, "]\n");
+    } else if (attr_type == "type") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_type(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(type)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_type(_t, \"", attr_api_name,
+                         "\") for _t in ", attr_api_name, "]\n");
+    } else if (attr_type == "shape") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_shape(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(shape)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_shape(_s, \"", attr_api_name,
+                         "\") for _s in ", attr_api_name, "]\n");
+    } else if (attr_type == "tensor") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_tensor(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(tensor)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_tensor(_t, \"", attr_api_name,
+                         "\") for _t in ", attr_api_name, "]\n");
+    } else if (attr_type != "func") {
+      *function_setup =
+          strings::StrCat("# No definition for ", function_name_,
+                          " since we don't support attrs with type\n"
+                          "# '",
+                          attr_type, "' right now.\n\n");
+      return false;
     }
   }
-  return nullptr;
+  return true;
 }
 
-GenPythonOp::GenPythonOp(const OpDef& op_def, const ApiDef& api_def,
-                         const string& function_name)
-    : op_def_(op_def),
-      api_def_(api_def),
-      function_name_(function_name),
-      num_outs_(op_def.output_arg_size()) {}
-
-GenPythonOp::~GenPythonOp() {}
-
-string GenPythonOp::Code() {
-  // This has all the input args followed by those attrs that don't have
-  // defaults.
-  std::vector<ParamNames> params_no_default;
-  // The parameters with defaults (these have to be listed after those without).
-  // No input args are included, just attrs.
-  std::vector<ParamNames> params_with_default;
-
-  for (int i = 0; i < api_def_.arg_order_size(); ++i) {
-    const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
-    const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
-    params_no_default.emplace_back(api_def_arg.name(), api_def_arg.rename_to());
-    if (!arg.type_attr().empty()) {
-      gtl::InsertIfNotPresent(&inferred_attrs_, arg.type_attr(), arg.name());
-    } else if (!arg.type_list_attr().empty()) {
-      gtl::InsertIfNotPresent(&inferred_attrs_, arg.type_list_attr(),
-                              arg.name());
-    }
+// If output i is list output, output_sizes[i] will be set to a
+// string with the python expression that will evaluate to its
+// length. output_sizes[i] is empty for non-list outputs.
+void GenEagerPythonOp::GetOutputSizesAndNumOutputsExpr(
+    std::vector<string>* output_sizes, string* num_outputs_expr) {
+  // Expression representing the number of outputs.
+  int num_fixed_outputs = 0;
+  for (int i = 0; i < num_outs_; ++i) {
+    const auto& arg(op_def_.output_arg(i));
     if (!arg.number_attr().empty()) {
-      gtl::InsertIfNotPresent(&inferred_attrs_, arg.number_attr(), arg.name());
-    }
-  }
-  for (int i = 0; i < api_def_.attr_size(); ++i) {
-    const auto& attr(api_def_.attr(i));
-    // Do not add inferred attrs to the Python function signature.
-    if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
-      if (attr.has_default_value()) {
-        params_with_default.emplace_back(attr.name(), attr.rename_to());
+      if (!num_outputs_expr->empty()) {
+        strings::StrAppend(num_outputs_expr, " + ");
+      }
+      (*output_sizes)[i] = attr_expressions_[arg.number_attr()];
+      strings::StrAppend(num_outputs_expr, (*output_sizes)[i]);
+    } else if (!arg.type_list_attr().empty()) {
+      if (!num_outputs_expr->empty()) {
+        strings::StrAppend(num_outputs_expr, " + ");
+      }
+      // Have to be careful to use an expression that works in both
+      // graph and eager paths here.
+      const auto iter = inferred_attrs_.find(arg.type_list_attr());
+      if (iter == inferred_attrs_.end()) {
+        (*output_sizes)[i] = strings::StrCat(
+            "len(", attr_expressions_[arg.type_list_attr()], ")");
       } else {
-        params_no_default.emplace_back(attr.name(), attr.rename_to());
+        (*output_sizes)[i] = strings::StrCat("len(", iter->second, ")");
       }
+      strings::StrAppend(num_outputs_expr, (*output_sizes)[i]);
+    } else {
+      ++num_fixed_outputs;
     }
   }
-
-  // Save the list of attr parameters (attrs that won't be inferred),
-  // those with defaults go at the end.
-  // Get the attrs in the order we want by taking the attrs without defaults
-  // from the end of args_no_default, and adding args_no_default.
-  attrs_.reserve(params_no_default.size() - op_def_.input_arg_size() +
-                 params_with_default.size());
-  for (int i = op_def_.input_arg_size(); i < params_no_default.size(); ++i) {
-    attrs_.push_back(params_no_default[i].GetName());
-  }
-  for (int i = 0; i < params_with_default.size(); ++i) {
-    attrs_.push_back(params_with_default[i].GetName());
-  }
-
-  param_names_.reserve(params_no_default.size() + params_with_default.size());
-  param_names_.insert(param_names_.begin(), params_no_default.begin(),
-                      params_no_default.end());
-  for (const auto& param : params_with_default) {
-    param_names_.push_back(param);
+  if (num_fixed_outputs > 0) {
+    if (!num_outputs_expr->empty()) {
+      strings::StrAppend(num_outputs_expr, " + ");
+    }
+    strings::StrAppend(num_outputs_expr, num_fixed_outputs);
+  } else if (num_outputs_expr->empty()) {
+    *num_outputs_expr = "0";
   }
+}
 
-  string parameters;
-  for (const auto& param : params_no_default) {
-    AddDelimiter(&parameters, ", ");
-    strings::StrAppend(&parameters, param.GetRenameTo());
-  }
-  for (const auto& param_and_default : params_with_default) {
-    AddDelimiter(&parameters, ", ");
-    strings::StrAppend(&parameters, param_and_default.GetRenameTo(), "=None");
+void GenEagerPythonOp::AddEagerFunctionTeardown(
+    const string& indentation, const std::vector<string>& output_sizes,
+    bool execute_record_gradient) {
+  if (num_outs_ > 0) {
+    if (execute_record_gradient) {
+      strings::StrAppend(&result_, indentation, "_execute.record_gradient(\n",
+                         "      \"", op_def_.name(),
+                         "\", _inputs_flat, _attrs, _result, name)\n");
+    }
+    if (num_outs_ == 1 && !output_sizes[0].empty()) {
+      // Single list result.
+    } else if (num_outs_ == 1) {
+      // Execute returns a single-element list which we need to destructure.
+      strings::StrAppend(&result_, indentation, "_result, = _result\n");
+    } else {
+      // Have multiple outputs, so we will need to reformat the return
+      // value of execute() to be a list with one entry per op output
+      // (that entry will be a list of tensors if that output is of list
+      // type).
+      // For list outputs, convert the right subrange of _result into a list.
+      Unflatten(indentation, output_sizes, "_result", &result_);
+      // Convert to a named tuple.
+      strings::StrAppend(&result_, indentation, "_result = _", op_def_.name(),
+                         "Output._make(_result)\n");
+    }
+  } else {
+    strings::StrAppend(&result_, indentation, "_result = None\n");
   }
-  AddDelimiter(&parameters, ", ");
-  strings::StrAppend(&parameters, "name=None");
+  strings::StrAppend(&result_, indentation, "return _result\n\n");
+}
 
+bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
+    const string& parameters, const std::vector<string>& output_sizes,
+    const string& eager_not_allowed_error) {
   AddExport();
-  AddDefLine(parameters);
+  AddDefLine(function_name_, parameters);
   AddDocStringDescription();
   AddDocStringArgs();
   AddDocStringInputs();
   AddDocStringAttrs();
   AddDocStringNameArg();
-  AddOutputGlobals();
+  AddOutputGlobals();  // Added to prelude_
   AddDocStringOutputs();
   strings::StrAppend(&result_, "  \"\"\"\n");
-  AddBody("  ");
-  strings::StrAppend(&result_, "\n\n");
 
-  return prelude_ + result_;
+  // Handle graph-mode case
+  string function_setup;
+  if (!GetEagerFunctionSetup("    ", &function_setup)) {
+    result_ = function_setup;
+    return false;
+  }
+  HandleGraphMode(function_setup);
+  AddEagerFunctionTeardown("    ", output_sizes,
+                           true /* execute_record_gradient */);
+
+  // Handle eager-mode case
+  strings::StrAppend(&result_, "  else:\n");
+
+  if (eager_not_allowed_error.empty()) {
+    AddEagerFastPathExecute();
+  } else {
+    strings::StrAppend(&result_, "    ", eager_not_allowed_error);
+  }
+
+  strings::StrAppend(&result_, "\n\n");
+  return true;
 }
 
-void GenPythonOp::AddExport() {
-  if (api_def_.visibility() != ApiDef::VISIBLE) {
-    return;
+bool GenEagerPythonOp::AddEagerFallbackCode(
+    const string& parameters, const std::vector<string>& output_sizes,
+    const string& num_outputs_expr, const string& eager_not_allowed_error) {
+  if (!eager_not_allowed_error.empty()) {
+    strings::StrAppend(&result_, "  ", eager_not_allowed_error);
+    return true;
   }
 
-  strings::StrAppend(&result_, "@tf_export(");
+  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix),
+             strings::StrCat(parameters, ", ctx=None"));
+  strings::StrAppend(
+      &result_, "  r\"\"\"This is the slowpath function for Eager mode.\n");
+  strings::StrAppend(&result_, "  This is for function ", function_name_,
+                     "\n  \"\"\"\n");
 
-  // Add all endpoint names to tf_export.
-  bool first_endpoint = true;
-  for (const auto& endpoint : api_def_.endpoint()) {
-    if (!first_endpoint) {
-      strings::StrAppend(&result_, ", ");
-    } else {
-      first_endpoint = false;
-    }
-    string endpoint_name;
-    python_op_gen_internal::GenerateLowerCaseOpName(endpoint.name(),
-                                                    &endpoint_name);
-    strings::StrAppend(&result_, "'", endpoint_name, "'");
+  strings::StrAppend(&result_, "  _ctx = ctx if ctx else _context.context()\n");
+
+  string function_setup;
+  if (!GetEagerFunctionSetup("  ", &function_setup)) {
+    result_ = function_setup;
+    return false;
   }
-  strings::StrAppend(&result_, ")\n");
-}
+  strings::StrAppend(&result_, function_setup);
 
-void GenPythonOp::AddDefLine(const string& function_name,
-                             const string& parameters) {
-  strings::StrAppend(&result_, "def ", function_name, "(", parameters, "):\n");
-}
+  AddEagerInferredAttrs("  ");
+  AddEagerInputCasts("  ");
+  strings::StrAppend(
+      &result_, "  _inputs_flat = ", FlattenInputs(nullptr, nullptr), "\n");
+  AddEagerAttrs("  ");
+  AddEagerExecute("  ", num_outputs_expr);
 
-void GenPythonOp::AddDefLine(const string& parameters) {
-  AddDefLine(function_name_, parameters);
+  AddEagerFunctionTeardown("  ", output_sizes,
+                           true /* execute_record_gradient */);
+
+  return true;
 }
 
-void GenPythonOp::AddDocStringDescription() {
-  string comment;
-  if (api_def_.summary().empty()) {
-    comment = "TODO: add doc.\n";
-  } else {
-    comment = strings::StrCat(api_def_.summary(), "\n");
-    if (!api_def_.description().empty()) {
-      strings::StrAppend(&comment, "\n", Indent(2, 2, api_def_.description()));
-    }
+void GenEagerPythonOp::AddEagerFastPathExecute() {
+  string fastpath_execute_params = strings::StrCat(
+      "_ctx._context_handle, _ctx._eager_context.device_name, \"",
+      op_def_.name(), "\", ", "name, _ctx._post_execution_callbacks");
+  string fallback_params;
+
+  for (int i = 0; i < api_def_.in_arg_size(); i++) {
+    const string param_name = param_names_[i].GetRenameTo();
+    strings::StrAppend(&fastpath_execute_params, ", ", param_name);
+    if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+    strings::StrAppend(&fallback_params, param_name);
   }
-  strings::StrAppend(&result_, "  r\"\"\"", comment, "\n");
-}
 
-void GenPythonOp::AddDocStringArgs() {
-  strings::StrAppend(&result_, "  Args:\n");
-}
+  for (const auto& attr : api_def_.attr()) {
+    if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
+      strings::StrAppend(&fastpath_execute_params, ", \"", attr.name(), "\", ",
+                         attr.rename_to());
 
-void GenPythonOp::AddDocStringInputs() {
-  for (int i = 0; i < api_def_.arg_order_size(); ++i) {
-    const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
-    const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
-    StringPiece description = api_def_arg.description();
-    string desc;
-    if (ConsumeEquals(&description)) {  // Skip the generated type info.
-      desc = strings::StrCat(param_names_[i].GetRenameTo(), ": ");
-    } else {
-      desc = strings::StrCat(param_names_[i].GetRenameTo(), ": ",
-                             ArgTypeName(op_def_, arg, inferred_attrs_, false));
+      if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+      strings::StrAppend(&fallback_params, attr.rename_to(), "=",
+                         attr.rename_to());
     }
-    if (!description.empty()) {
-      AppendWithinWidth(&desc, description, kRightMargin - 4 /* indent */);
-    }
-    strings::StrAppend(&result_, Indent(4, 6, desc));
   }
+
+  if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+  strings::StrAppend(&fallback_params, "name=name");
+
+  strings::StrAppend(&result_, "    try:\n");
+  strings::StrAppend(
+      &result_, "      ",
+      "_result = _pywrap_tensorflow.TFE_Py_FastPathExecute(\n",
+      WordWrap(strings::StrCat("        "),
+               strings::StrCat(fastpath_execute_params, ")"), kRightMargin),
+      "\n");
+
+  if (op_def_.output_arg_size() > 1) {
+    const string output_tuple_name =
+        strings::StrCat("_", op_def_.name(), "Output");
+    strings::StrAppend(&result_, "      ", "_result = ", output_tuple_name,
+                       "._make(_result)\n");
+  }
+  strings::StrAppend(&result_, "      ", "return _result\n");
+
+  // Handle fallback.
+  if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+  strings::StrAppend(&fallback_params, "ctx=_ctx");
+  strings::StrAppend(&result_, "    ", "except _core._FallbackException:\n");
+  strings::StrAppend(
+      &result_, "      ", "return ", function_name_, kEagerFallbackSuffix,
+      "(\n",
+      WordWrap(strings::StrCat("          "),
+               strings::StrCat(fallback_params, ")"), kRightMargin),
+      "\n");
+
+  // Any errors thrown from execute need to be unwrapped from
+  // _NotOkStatusException.
+  strings::StrAppend(&result_, "    ",
+                     "except _core._NotOkStatusException as e:\n");
+  strings::StrAppend(&result_, "      ", "if name is not None:\n");
+  strings::StrAppend(&result_, "        ",
+                     "message = e.message + \" name: \" + name\n");
+  strings::StrAppend(&result_, "      ", "else:\n");
+  strings::StrAppend(&result_, "        ", "message = e.message\n");
+  strings::StrAppend(
+      &result_, "      ",
+      "_six.raise_from(_core._status_to_exception(e.code, message), None)\n");
 }
 
-void GenPythonOp::AddDocStringAttrs() {
-  for (const string& name : attrs_) {
-    const auto& attr = *FindAttr(name, op_def_);
-    const auto& api_def_attr = *FindAttr(name, api_def_);
-    string desc =
-        strings::StrCat(AvoidPythonReserved(api_def_attr.rename_to()), ": ");
-
-    static const char* const kAttrTypeName[][2] = {
-        {"string", "`string`"},
-        {"list(string)", "list of `strings`"},
-        {"int", "`int`"},
-        {"list(int)", "list of `ints`"},
-        {"float", "`float`"},
-        {"list(float)", "list of `floats`"},
-        {"bool", "`bool`"},
-        {"list(bool)", "list of `bools`"},
-        {"type", "`tf.DType`"},
-        {"list(type)", "list of `tf.DTypes`"},
-        {"shape", "`tf.TensorShape` or list of `ints`"},
-        {"list(shape)",
-         "list of shapes (each a `tf.TensorShape` or list of `ints`)"},
-        {"tensor", "`tf.TensorProto`"},
-        {"list(tensor)", "list of `tf.TensorProto` objects"},
-        {"func", "function decorated with @Defun"},
-        {"list(func)", "list of functions decorated with @Defun"},
-    };
-    for (size_t i = 0; i < TF_ARRAYSIZE(kAttrTypeName); ++i) {
-      if (attr.type() == kAttrTypeName[i][0]) {
-        string s;
-        if (api_def_attr.has_default_value()) {
-          s = strings::StrCat("optional ", kAttrTypeName[i][1]);
+void GenEagerPythonOp::AddEagerInferredAttrs(const string& indentation) {
+  // Figure out values for inferred attrs, and cast to eager tensors.
+  for (int i = 0; i < op_def_.attr_size(); ++i) {
+    const auto& attr(op_def_.attr(i));
+    const auto& api_def_attr(api_def_.attr(i));
+    auto arg_list = attr_to_args_.find(attr.name());
+    if (arg_list != attr_to_args_.end()) {
+      if (attr.type() == "type") {
+        std::vector<string> output_sizes;
+        const string flattened =
+            FlattenInputs(&arg_list->second, &output_sizes);
+        string conversion = strings::StrCat("_execute.args_to_matching_eager(",
+                                            flattened, ", _ctx");
+        if (attr.has_default_value()) {
+          strings::StrAppend(
+              &conversion, ", ",
+              python_op_gen_internal::AttrValueToPython(
+                  attr.type(), api_def_attr.default_value(), "_dtypes."));
+        }
+        strings::StrAppend(&conversion, ")");
+        const string var_name = AttrVarName(attr.name(), &attr_expressions_);
+        if (output_sizes.size() == 1) {
+          // Avoid creating a temporary variable in the case where
+          // we can easily assign to the right value directly.
+          const string inputs_var =
+              param_names_[arg_list->second.front()].GetRenameTo();
+          if (output_sizes.front().empty()) {
+            strings::StrAppend(&result_, indentation, var_name, ", (",
+                               inputs_var, ",) = ", conversion, "\n");
+          } else {
+            strings::StrAppend(&result_, indentation, var_name, ", ",
+                               inputs_var, " = ", conversion, "\n");
+          }
         } else {
-          s = kAttrTypeName[i][1];
+          const string inputs_var = strings::StrCat("_inputs_", attr.name());
+          strings::StrAppend(&result_, indentation, var_name, ", ", inputs_var,
+                             " = ", conversion, "\n");
+          // Convert from a flat list of eager tensors back to the
+          // parameter variables.
+          Unflatten(indentation, output_sizes, inputs_var, &result_);
+          std::vector<string> p;
+          for (int j : arg_list->second) {
+            p.emplace_back(param_names_[j].GetRenameTo());
+          }
+          strings::StrAppend(&result_, indentation, VectorToTuple(p), " = ",
+                             inputs_var, "\n");
         }
-        if (s[0] == 'o' || (s[0] == '`' && (s[1] == 'i' || s[1] == 'o'))) {
-          strings::StrAppend(&desc, "An ", s);
+      } else if (attr.type() == "list(type)") {
+        // NOTE: We ignore default values for these attrs, since it is
+        // unclear how you would use it, and the one use case is
+        // parse_single_sequence_example which only needs it for
+        // backwards compatibility.
+        const string var_name = AttrVarName(attr.name(), &attr_expressions_);
+        string inputs_var;
+        string conversion;
+        if (arg_list->second.size() > 1) {
+          // If you have more than one list(tensor) argument, their types
+          // have to match.
+          std::vector<string> lists;
+          for (auto iter = arg_list->second.begin();
+               iter != arg_list->second.end(); ++iter) {
+            lists.push_back(param_names_[*iter].GetRenameTo());
+          }
+          inputs_var = VectorToTuple(lists);
+          conversion = "_execute.args_to_mixed_eager_tensors";
         } else {
-          strings::StrAppend(&desc, "A ", s);
+          // For one list(tensor) argument, we just convert every
+          // element of the list to an eager tensor.
+          inputs_var = param_names_[arg_list->second.front()].GetRenameTo();
+          conversion = "_execute.convert_to_mixed_eager_tensors";
         }
-        break;
+        strings::StrAppend(&result_, indentation, var_name, ", ", inputs_var,
+                           " = ", conversion, "(", inputs_var, ", _ctx)\n");
       }
     }
-
-    if (attr.has_allowed_values()) {
-      strings::StrAppend(&desc, " from: `",
-                         AttrListToPython(attr.allowed_values()), "`");
-    }
-
-    if (attr.has_minimum()) {
-      if (attr.type() == "int") {
-        strings::StrAppend(&desc, " that is `>= ", attr.minimum(), "`");
-      } else if (attr.minimum() > 0) {
-        strings::StrAppend(&desc, " that has length `>= ", attr.minimum(), "`");
-      }
-    }
-
-    strings::StrAppend(&desc, ".");
-
-    if (api_def_attr.has_default_value()) {
-      strings::StrAppend(
-          &desc, " Defaults to `",
-          AttrValueToPython(attr.type(), api_def_attr.default_value()), "`.");
-    }
-    if (!api_def_attr.description().empty()) {
-      AppendWithinWidth(&desc, api_def_attr.description(),
-                        kRightMargin - 4 /* indent */);
-    }
-    strings::StrAppend(&result_, Indent(4, 6, desc));
   }
 }
 
-void GenPythonOp::AddDocStringNameArg() {
-  strings::StrAppend(&result_,
-                     "    name: A name for the operation (optional).\n");
+void GenEagerPythonOp::AddEagerInputCasts(const string& indentation) {
+  // Cast remaining args to eager tensors
+  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
+    const auto& arg(op_def_.input_arg(i));
+    if (!arg.type_attr().empty() || !arg.type_list_attr().empty()) continue;
+    const string& param = param_names_[i].GetRenameTo();
+    const string fn = arg.number_attr().empty() ? "" : "n_";
+    const string dtype =
+        python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes.");
+    strings::StrAppend(&result_, indentation, param, " = _ops.convert_", fn,
+                       "to_tensor(", param, ", ", dtype, ")\n");
+  }
 }
 
-void GenPythonOp::AddOutputGlobals() {
-  // Prepare a NamedTuple type to hold the outputs, if there are multiple
-  if (num_outs_ > 1) {
-    // Prepare the list of output names
-    std::vector<string> out_names(num_outs_);
-    for (int i = 0; i < num_outs_; ++i) {
-      if (!api_def_.out_arg(i).rename_to().empty()) {
-        out_names[i] = api_def_.out_arg(i).rename_to();
-      } else {
-        out_names[i] = strings::StrCat("output", i);
-      }
+void GenEagerPythonOp::AddEagerAttrs(const string& indentation) {
+  // Compute eager attrs
+  if (op_def_.attr_size() > 0) {
+    string attr_values;
+    for (int i = 0; i < op_def_.attr_size(); ++i) {
+      if (i > 0) strings::StrAppend(&attr_values, ", ");
+      const auto& attr_name(op_def_.attr(i).name());
+      strings::StrAppend(&attr_values, "\"", attr_name, "\", ",
+                         attr_expressions_[attr_name]);
     }
-    string out_names_list =
-        strings::StrCat("[\"", str_util::Join(out_names, "\", \""), "\"]");
-
-    // Provide the output names as a Python list
-    string lower_op_name_outputs =
-        strings::StrCat("_", function_name_, "_outputs");
-    const string outputs_prefix = strings::StrCat(lower_op_name_outputs, " = ");
-    strings::StrAppend(&prelude_, "\n",
-                       WordWrap(outputs_prefix, out_names_list, kRightMargin),
-                       "\n");
-
-    strings::StrAppend(&prelude_, "_", op_def_.name(),
-                       "Output = _collections.namedtuple(\n");
-    const string tuple_type_prefix = "    ";
-    const string tuple_type_suffix = strings::StrCat(
-        "\"", op_def_.name(), "\", ", lower_op_name_outputs, ")");
+    strings::StrAppend(&attr_values, ")");
     strings::StrAppend(
-        &prelude_, WordWrap(tuple_type_prefix, tuple_type_suffix, kRightMargin),
-        "\n\n");
-  }
-  strings::StrAppend(&prelude_, "\n");
-}
-
-void GenPythonOp::AddDocStringOutputs() {
-  std::vector<string> output_type_string;
-  output_type_string.reserve(num_outs_);
-  for (int i = 0; i < num_outs_; ++i) {
-    output_type_string.push_back(
-        ArgTypeName(op_def_, op_def_.output_arg(i), inferred_attrs_, true));
-  }
-  strings::StrAppend(&result_, GetReturns(op_def_, output_type_string));
-}
-
-void GenPythonOp::AddBody(const string& prefix) {
-  const string apply_prefix =
-      strings::StrCat(prefix, "_result = _op_def_lib.apply_op(");
-  AddBodyNoReturn(apply_prefix);
-  if (num_outs_ > 1) {
-    strings::StrAppend(&result_, prefix, "_result = _", op_def_.name(),
-                       "Output._make(_result)\n");
+        &result_,
+        WordWrap(indentation, strings::StrCat("_attrs = (", attr_values),
+                 kRightMargin),
+        "\n");
+  } else {
+    strings::StrAppend(&result_, indentation, "_attrs = None\n");
   }
-  strings::StrAppend(&result_, prefix, "return _result\n");
 }
 
-void GenPythonOp::AddBodyNoReturn(const string& apply_prefix) {
-  string args = strings::StrCat("\"", op_def_.name(), "\", ");
-  for (size_t i = 0; i < param_names_.size(); ++i) {
-    strings::StrAppend(&args, AvoidPythonReserved(param_names_[i].GetName()),
-                       "=", param_names_[i].GetRenameTo(), ", ");
-  }
-  strings::StrAppend(&args, "name=name)");
-
+void GenEagerPythonOp::AddEagerExecute(const string& indentation,
+                                       const string& num_outputs_expr) {
+  const string return_prefix =
+      strings::StrCat(indentation, "_result = _execute.execute(");
+  const string return_args = strings::StrCat(
+      "b\"", op_def_.name(), "\", ", num_outputs_expr,
+      ", inputs=_inputs_flat, attrs=_attrs, ctx=_ctx, name=name)");
   strings::StrAppend(&result_,
                      // Wrap the arguments, and indent to the (.
-                     WordWrap(apply_prefix, args, kRightMargin), "\n");
-}
-
-}  // namespace python_op_gen_internal
-
-string GetPythonOp(const OpDef& op_def, const ApiDef& api_def,
-                   const string& function_name) {
-  return python_op_gen_internal::GenPythonOp(op_def, api_def, function_name)
-      .Code();
+                     WordWrap(return_prefix, return_args, kRightMargin), "\n");
 }
 
 string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs,
-                    const std::vector<string>& hidden_ops,
-                    bool require_shapes) {
+                    const std::vector<string>& hidden_ops, bool require_shapes,
+                    const string& source_file_name = "") {
   string result;
   // Header
   // TODO(josh11b): Mention the library for which wrappers are being generated.
   strings::StrAppend(&result, R"("""Python wrappers around TensorFlow ops.
 
 This file is MACHINE GENERATED! Do not edit.
-"""
+)");
+
+  // Mention the original source file so someone tracing back through
+  // generated Python code will know where to look next.
+  if (!source_file_name.empty()) {
+    strings::StrAppend(&result, "Original C++ source file: ");
+    strings::StrAppend(&result, source_file_name);
+    strings::StrAppend(&result, "\n");
+  }
+
+  strings::StrAppend(&result, R"("""
 
 import collections as _collections
+import six as _six
 
-from tensorflow.core.framework import op_def_pb2 as _op_def_pb2
+from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow
+from tensorflow.python.eager import context as _context
+from tensorflow.python.eager import core as _core
+from tensorflow.python.eager import execute as _execute
+from tensorflow.python.framework import dtypes as _dtypes
+from tensorflow.python.framework import errors as _errors
+from tensorflow.python.framework import tensor_shape as _tensor_shape
 
+from tensorflow.core.framework import op_def_pb2 as _op_def_pb2
 # Needed to trigger the call to _set_call_cpp_shape_fn.
 from tensorflow.python.framework import common_shapes as _common_shapes
-
 from tensorflow.python.framework import op_def_registry as _op_def_registry
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.framework import op_def_library as _op_def_library
 from tensorflow.python.util.tf_export import tf_export
+
 )");
 
   // We'll make a copy of ops that filters out descriptions.
@@ -839,7 +957,6 @@ from tensorflow.python.util.tf_export import tf_export
     if (api_def->visibility() == ApiDef::SKIP) {
       continue;
     }
-
     // An op is hidden if either its ApiDef visibility is HIDDEN
     // or it is in the hidden_ops list.
     bool is_hidden = api_def->visibility() == ApiDef::HIDDEN;
@@ -875,11 +992,12 @@ from tensorflow.python.util.tf_export import tf_export
       continue;
     }
 
-    strings::StrAppend(&result, GetPythonOp(op_def, *api_def, function_name));
+    strings::StrAppend(&result,
+                       GetEagerPythonOp(op_def, *api_def, function_name));
 
     if (!require_shapes) {
       strings::StrAppend(&result, "_ops.RegisterShape(\"", op_def.name(),
-                         "\")(None)\n");
+                         "\")(None)\n\n");
     }
 
     auto added = out->Add();
@@ -894,8 +1012,6 @@ from tensorflow.python.util.tf_export import tf_export
   op_def_lib = _op_def_library.OpDefLibrary()
   op_def_lib.add_op_list(op_list)
   return op_def_lib
-
-
 )");
 
   result.append("# ");
@@ -908,16 +1024,21 @@ from tensorflow.python.util.tf_export import tf_export
   return result;
 }
 
+}  // namespace
+
 void PrintPythonOps(const OpList& ops, const ApiDefMap& api_defs,
-                    const std::vector<string>& hidden_ops,
-                    bool require_shapes) {
-  printf("%s", GetPythonOps(ops, api_defs, hidden_ops, require_shapes).c_str());
+                    const std::vector<string>& hidden_ops, bool require_shapes,
+                    const string& source_file_name) {
+  printf("%s", GetPythonOps(ops, api_defs, hidden_ops, require_shapes,
+                            source_file_name)
+                   .c_str());
 }
 
 string GetPythonWrappers(const char* op_list_buf, size_t op_list_len) {
   string op_list_str(op_list_buf, op_list_len);
   OpList ops;
   ops.ParseFromString(op_list_str);
+
   ApiDefMap api_def_map(ops);
   return GetPythonOps(ops, api_def_map, {}, false);
 }
diff --git a/tensorflow/python/framework/python_op_gen.h b/tensorflow/python/framework/python_op_gen.h
index 4d20888dc634620515b17c4824341cdab6d6bb02..7e754fd12246bdd2c0d04232af154f44980dbfdf 100644
--- a/tensorflow/python/framework/python_op_gen.h
+++ b/tensorflow/python/framework/python_op_gen.h
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,29 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #ifndef TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_H_
 #define TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_H_
 
 #include <string>
 #include <vector>
-#include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
-// hidden_ops should be a vector of Op names that should get a leading _ in the
-// output.
-// The Print* version prints the output to stdout, Get* version returns the
-// output as a string.
+// hidden_ops should be a list of Op names that should get a leading _
+// in the output. Prints the output to stdout.
+// Optional fourth argument is the name of the original C++ source file
+// where the ops' REGISTER_OP() calls reside.
 void PrintPythonOps(const OpList& ops, const ApiDefMap& api_defs,
-                    const std::vector<string>& hidden_ops, bool require_shapes);
-string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs,
-                    const std::vector<string>& hidden_ops, bool require_shapes);
-string GetPythonOp(const OpDef& op_def, const ApiDef& api_def,
-                   const string& function_name);
+                    const std::vector<string>& hidden_ops, bool require_shapes,
+                    const string& source_file_name = "");
 
 // Get the python wrappers for a list of ops in a OpList.
 // `op_list_buf` should be a pointer to a buffer containing
diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i
index efcce2f20941790571199d358dfcfcc3c0283d08..26ec4e8e66b5d4e3be433c9e59f9b6034109d153 100644
--- a/tensorflow/python/framework/python_op_gen.i
+++ b/tensorflow/python/framework/python_op_gen.i
@@ -16,10 +16,10 @@ limitations under the License.
 %include "tensorflow/python/platform/base.i"
 
 %{
-#include "tensorflow/python/eager/python_eager_op_gen.h"
+#include "tensorflow/python/framework/python_op_gen.h"
 %}
 
-// Input typemap for GetEagerPythonWrappers.
+// Input typemap for GetPythonWrappers.
 // Accepts a python object of 'bytes' type, and converts it to
 // a const char* pointer and size_t length. The default typemap
 // going from python bytes to const char* tries to decode the
@@ -37,5 +37,5 @@ limitations under the License.
 
 
 %ignoreall;
-%unignore tensorflow::GetEagerPythonWrappers;
-%include "tensorflow/python/eager/python_eager_op_gen.h"
+%unignore tensorflow::GetPythonWrappers;
+%include "tensorflow/python/framework/python_op_gen.h"
diff --git a/tensorflow/python/framework/python_op_gen_internal.cc b/tensorflow/python/framework/python_op_gen_internal.cc
new file mode 100644
index 0000000000000000000000000000000000000000..940bffb906db753f3699b6a8d2401741bc50a517
--- /dev/null
+++ b/tensorflow/python/framework/python_op_gen_internal.cc
@@ -0,0 +1,800 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/python/framework/python_op_gen_internal.h"
+
+#include <stdio.h>
+#include <sstream>
+#include <unordered_map>
+#include "tensorflow/core/framework/api_def.pb.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def.pb_text.h"
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_def_util.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/framework/tensor.pb_text.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace python_op_gen_internal {
+
+const int kRightMargin = 78;
+
+bool IsPythonReserved(const string& s) {
+  static const std::set<string>* const kPythonReserved = new std::set<string>(
+      {// Keywords in Python, from:
+       //   import keyword
+       //   print keyword.kwlist
+       "and", "as", "assert", "break", "class", "continue", "def", "del",
+       "elif", "else", "except", "exec", "finally", "for", "from", "global",
+       "if", "import", "in", "is", "lambda", "not", "or", "pass", "print",
+       "raise", "return", "try", "while", "with", "yield",
+       // Built-in functions and types in Python, from:
+       //   [x for x in dir(__builtins__) if not x[0].islower()]
+       "ArithmeticError", "AssertionError", "AttributeError", "BaseException",
+       "BufferError", "BytesWarning", "DeprecationWarning", "EOFError",
+       "Ellipsis", "EnvironmentError", "Exception", "False",
+       "FloatingPointError", "FutureWarning", "GeneratorExit", "IOError",
+       "ImportError", "ImportWarning", "IndentationError", "IndexError",
+       "KeyError", "KeyboardInterrupt", "LookupError", "MemoryError",
+       "NameError", "None", "NotImplemented", "NotImplementedError", "OSError",
+       "OverflowError", "PendingDeprecationWarning", "ReferenceError",
+       "RuntimeError", "RuntimeWarning", "StandardError", "StopIteration",
+       "SyntaxError", "SyntaxWarning", "SystemError", "SystemExit", "TabError",
+       "True", "TypeError", "UnboundLocalError", "UnicodeDecodeError",
+       "UnicodeEncodeError", "UnicodeError", "UnicodeTranslateError",
+       "UnicodeWarning", "UserWarning", "ValueError", "Warning",
+       "ZeroDivisionError", "__debug__", "__doc__", "__import__", "__name__",
+       "__package__"});
+
+  return kPythonReserved->count(s) > 0;
+}
+
+bool IsOpWithUnderscorePrefix(const string& s) {
+  static const std::set<string>* const kUnderscoreOps = new std::set<string>(
+      {// Lowercase built-in functions and types in Python, from:
+       // [x for x in dir(__builtins__) if x[0].islower()] except "round".
+       // These need to be excluded so they don't conflict with actual built-in
+       // functions since we use '*' imports.
+       "abs", "all", "any", "apply", "bin", "bool", "buffer", "bytearray",
+       "bytes", "callable", "chr", "classmethod", "cmp", "coerce", "compile",
+       "complex", "copyright", "credits", "delattr", "dict", "dir", "divmod",
+       "enumerate", "eval", "execfile", "exit", "file", "filter", "float",
+       "format", "frozenset", "getattr", "globals", "hasattr", "hash", "help",
+       "hex", "id", "input", "int", "intern", "isinstance", "issubclass",
+       "iter", "len", "license", "list", "locals", "long", "map", "max",
+       "memoryview", "min", "next", "object", "oct", "open", "ord", "pow",
+       "print", "property", "quit", "range", "raw_input", "reduce", "reload",
+       "repr", "reversed", "set", "setattr", "slice", "sorted", "staticmethod",
+       "str", "sum", "super", "tuple", "type", "unichr", "unicode", "vars",
+       "xrange", "zip",
+       // These have the same name as ops defined in Python and might be used
+       // incorrectly depending on order of '*' imports.
+       // TODO(annarev): reduce usage of '*' imports and remove these from the
+       // list.
+       "fused_batch_norm", "histogram_fixed_width", "stack",
+       "batch_norm_with_global_normalization", "clip_by_value"});
+  return kUnderscoreOps->count(s) > 0;
+}
+
+string AvoidPythonReserved(const string& s) {
+  if (IsPythonReserved(s)) return strings::StrCat(s, "_");
+  return s;
+}
+
+// Indent the first line by "initial" spaces and all following lines
+// by "rest" spaces.
+string Indent(int initial, int rest, StringPiece in) {
+  // TODO(josh11b): Also word-wrapping?
+  string copy(in.data(), in.size());
+  str_util::StripTrailingWhitespace(&copy);
+  std::vector<string> v = str_util::Split(copy, '\n');
+
+  string result;
+  bool first = true;
+  for (const string& line : v) {
+    if (first) {
+      result = strings::StrCat(Spaces(initial), line, "\n");
+      first = false;
+    } else {
+      if (line.empty()) {
+        strings::StrAppend(&result, "\n");
+      } else {
+        strings::StrAppend(&result, Spaces(rest), line, "\n");
+      }
+    }
+  }
+  return result;
+}
+
+// Adds append to *dest, with a space if the first line will be <= width,
+// or a newline otherwise.
+void AppendWithinWidth(string* dest, StringPiece append, int width) {
+  auto first_line = append.find('\n');
+  if (first_line == string::npos) first_line = append.size();
+  if (dest->size() + first_line + 1 /* space */ > static_cast<size_t>(width)) {
+    strings::StrAppend(dest, "\n", append);
+  } else {
+    strings::StrAppend(dest, " ", append);
+  }
+}
+
+// Like DataTypeString() but uses the Python names for the
+// float types.
+string PythonDataTypeString(DataType dtype) {
+  switch (dtype) {
+    case DT_FLOAT:
+      return "float32";
+    case DT_DOUBLE:
+      return "float64";
+    default:
+      return DataTypeString(dtype);
+  }
+}
+
+string TypeString(DataType dtype, bool ref) {
+  if (ref) {
+    return strings::StrCat("mutable `", PythonDataTypeString(dtype), "`");
+  } else {
+    return strings::StrCat("`", PythonDataTypeString(dtype), "`");
+  }
+}
+
+string TypeListString(const AttrValue& value) {
+  string ret;
+  for (int t : value.list().type()) {
+    if (!ret.empty()) strings::StrAppend(&ret, ", ");
+    DataType dtype = static_cast<DataType>(t);
+    if (IsRefType(dtype)) {
+      strings::StrAppend(&ret, PythonDataTypeString(RemoveRefType(dtype)),
+                         " mutable");
+    } else {
+      strings::StrAppend(&ret, "`", PythonDataTypeString(dtype), "`");
+    }
+  }
+  return ret;
+}
+
+string SingleTensorName(DataType dtype, bool is_ref) {
+  const string type_str = TypeString(dtype, is_ref);
+  return strings::StrCat("A `Tensor` of type ", type_str, ".");
+}
+
+const char kUnknownTensorType[] = {"A `Tensor`."};
+
+string ArgTypeName(const OpDef& op_def, const OpDef::ArgDef& arg,
+                   const std::unordered_map<string, string>& inferred_attrs,
+                   bool is_output) {
+  if (!arg.number_attr().empty()) {
+    // N Tensors with the same type
+    const string* original_arg =
+        gtl::FindOrNull(inferred_attrs, arg.number_attr());
+    string prefix;
+    if (original_arg == nullptr) {
+      prefix = strings::StrCat("A list of `", arg.number_attr(), "`");
+    } else if (*original_arg == arg.name()) {
+      const OpDef::AttrDef* attr = FindAttr(arg.number_attr(), op_def);
+      if (attr->has_minimum() && attr->minimum() > 0) {
+        prefix = strings::StrCat("A list of at least ", attr->minimum());
+      } else {
+        prefix = "A list of";
+      }
+    } else {
+      prefix = strings::StrCat("A list with the same length as `",
+                               AvoidPythonReserved(*original_arg), "` of");
+    }
+
+    if (arg.type() != DT_INVALID) {
+      return strings::StrCat(prefix, " `Tensor` objects with type ",
+                             TypeString(arg.type(), arg.is_ref()), ".");
+    } else {
+      original_arg = gtl::FindOrNull(inferred_attrs, arg.type_attr());
+      if (arg.is_ref()) {
+        strings::StrAppend(&prefix, " mutable");
+      }
+      if (original_arg == nullptr) {
+        return strings::StrCat(prefix, " `Tensor` objects with type `",
+                               arg.type_attr(), "`.");
+      } else if (*original_arg == arg.name()) {
+        const OpDef::AttrDef* attr = FindAttr(arg.type_attr(), op_def);
+        if (attr->has_allowed_values()) {
+          return strings::StrCat(prefix,
+                                 " `Tensor` objects with the same type in: ",
+                                 TypeListString(attr->allowed_values()), ".");
+        } else {
+          return strings::StrCat(prefix,
+                                 " `Tensor` objects with the same type.");
+        }
+      } else {
+        return strings::StrCat(prefix,
+                               " `Tensor` objects with the same type as `",
+                               AvoidPythonReserved(*original_arg), "`.");
+      }
+    }
+  } else if (!arg.type_attr().empty() || !arg.type_list_attr().empty()) {
+    const bool is_list = !arg.type_list_attr().empty();
+    const string attr_name = is_list ? arg.type_list_attr() : arg.type_attr();
+    const OpDef::AttrDef* attr = FindAttr(attr_name, op_def);
+    const string mutable_str = arg.is_ref() ? "mutable " : "";
+    const string prefix =
+        is_list ? strings::StrCat("A list of ", mutable_str, "`Tensor` objects")
+                : strings::StrCat("A ", mutable_str, "`Tensor`");
+    const string* original_arg = gtl::FindOrNull(inferred_attrs, attr_name);
+    if (original_arg == nullptr) {
+      return strings::StrCat(prefix, " of type `", attr_name, "`.");
+    } else if (*original_arg == arg.name()) {
+      if (attr->has_allowed_values()) {
+        if (is_list) {
+          return strings::StrCat(prefix, " with types from: ",
+                                 TypeListString(attr->allowed_values()), ".");
+        } else {
+          return strings::StrCat(
+              prefix, is_output ? ". Has one of the following types: "
+                                : ". Must be one of the following types: ",
+              TypeListString(attr->allowed_values()), ".");
+        }
+      } else {
+        return strings::StrCat(prefix, ".");
+      }
+    } else {
+      return strings::StrCat(prefix,
+                             is_output ? ". Has the same type as `"
+                                       : ". Must have the same type as `",
+                             AvoidPythonReserved(*original_arg), "`.");
+    }
+  } else {
+    return SingleTensorName(arg.type(), arg.is_ref());
+  }
+}
+
+string GetReturns(const OpDef& op_def,
+                  const std::vector<string>& output_type_string) {
+  string result;
+  DCHECK_EQ(op_def.output_arg_size(), output_type_string.size());
+  const int num_outs = op_def.output_arg_size();
+  strings::StrAppend(&result, "\n  Returns:\n");
+  if (num_outs == 0) {
+    strings::StrAppend(&result, "    The created Operation.\n");
+  } else {
+    if (num_outs == 1) {
+      StringPiece description = op_def.output_arg(0).description();
+      if (ConsumeEquals(&description)) {  // Skip the generated type info.
+        strings::StrAppend(&result, Indent(4, 4, description));
+      } else {
+        // Special case of one output, don't use the name of the output unless
+        // there is no description.
+        string desc = output_type_string.empty() ? kUnknownTensorType
+                                                 : output_type_string[0];
+        if (desc == kUnknownTensorType) {
+          // Special case where we don't understand how the output tensor type
+          // depends on the input tensor types, just use the output arg
+          // description if we can.
+          if (!description.empty()) {
+            desc = op_def.output_arg(0).description();
+          } else if (!op_def.output_arg(0).name().empty()) {
+            desc = strings::StrCat(" The ", op_def.output_arg(0).name(),
+                                   " `Tensor`.");
+          }
+        } else if (!description.empty()) {
+          AppendWithinWidth(&desc, description, kRightMargin - 4 /* indent */);
+        }
+        strings::StrAppend(&result, Indent(4, 4, desc));
+      }
+    } else {
+      std::vector<string> out_names(num_outs);
+      for (int i = 0; i < num_outs; ++i) {
+        if (!op_def.output_arg(i).name().empty()) {
+          out_names[i] = op_def.output_arg(i).name();
+        } else {
+          out_names[i] = strings::StrCat("output", i);
+        }
+      }
+      strings::StrAppend(&result, "    A tuple of `Tensor` objects (",
+                         str_util::Join(out_names, ", "), ").\n\n");
+      for (int i = 0; i < num_outs; ++i) {
+        string desc = strings::StrCat(out_names[i], ": ");
+        StringPiece description = op_def.output_arg(i).description();
+        if (ConsumeEquals(&description)) {  // Skip the generated type info.
+          strings::StrAppend(&desc, description);
+        } else {
+          const string type = static_cast<size_t>(i) < output_type_string.size()
+                                  ? output_type_string[i]
+                                  : kUnknownTensorType;
+          if (!description.empty()) {
+            if (type == kUnknownTensorType) {
+              // Special case where we don't understand how the output tensor
+              // type depends on the input tensor types, so we just use the
+              // output arg description.
+              strings::StrAppend(&desc, description);
+            } else {
+              strings::StrAppend(&desc, type, " ", description);
+            }
+          } else {
+            strings::StrAppend(&desc, type);
+          }
+        }
+        strings::StrAppend(&result, Indent(4, 6, desc));
+      }
+    }
+  }
+  return result;
+}
+
+string StringToPython(const string& str) {
+  return strings::StrCat("\"", str_util::CEscape(str), "\"");
+}
+
+string DataTypeToPython(DataType dtype, const string& dtype_module) {
+  return strings::StrCat(dtype_module, PythonDataTypeString(dtype));
+}
+
+string ShapeToPython(const TensorShapeProto& shape) {
+  if (shape.unknown_rank()) {
+    return "None";
+  }
+  string python = "[";
+  for (const auto& dim : shape.dim()) {
+    if (python.size() > 1) strings::StrAppend(&python, ", ");
+    if (!dim.name().empty()) {
+      strings::StrAppend(&python, "(", StringToPython(dim.name()), ", ",
+                         dim.size(), ")");
+    } else {
+      strings::StrAppend(&python, dim.size());
+    }
+  }
+  strings::StrAppend(&python, "]");
+  return python;
+}
+
+string TensorToPython(const TensorProto& proto) {
+  return ProtoShortDebugString(proto);
+}
+
+string AttrListToPython(const AttrValue& value,
+                        const string& dtype_module = "tf.") {
+  string ret;
+  if (value.list().s_size() > 0) {
+    for (int i = 0; i < value.list().s_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret, StringToPython(value.list().s(i)));
+    }
+  } else if (value.list().i_size() > 0) {
+    for (int i = 0; i < value.list().i_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret, value.list().i(i));
+    }
+  } else if (value.list().f_size() > 0) {
+    for (int i = 0; i < value.list().f_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret, value.list().f(i));
+    }
+  } else if (value.list().b_size() > 0) {
+    for (int i = 0; i < value.list().b_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret, value.list().b(i) ? "True" : "False");
+    }
+  } else if (value.list().type_size() > 0) {
+    for (int i = 0; i < value.list().type_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret,
+                         DataTypeToPython(value.list().type(i), dtype_module));
+    }
+  } else if (value.list().shape_size() > 0) {
+    for (int i = 0; i < value.list().shape_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret, ShapeToPython(value.list().shape(i)));
+    }
+  } else if (value.list().tensor_size() > 0) {
+    for (int i = 0; i < value.list().tensor_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret, TensorToPython(value.list().tensor(i)));
+    }
+  } else if (value.list().func_size() > 0) {
+    for (int i = 0; i < value.list().func_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret, StringToPython(value.list().func(i).name()));
+    }
+  }
+  return ret;
+}
+
+// NOTE: The return value may contain spaces (for example, it could be
+// a string "foo bar" with an embedded space) and is not safe to pass
+// to WordWrap().
+string AttrValueToPython(const string& type, const AttrValue& value,
+                         const string& dtype_module) {
+  if (type == "string") {
+    return StringToPython(value.s());
+  } else if (type == "int") {
+    return strings::StrCat(value.i());
+  } else if (type == "float") {
+    if (std::isnan(value.f()) || std::isinf(value.f())) {
+      return strings::StrCat("float('", value.f(), "')");
+    } else {
+      return strings::StrCat(value.f());
+    }
+  } else if (type == "bool") {
+    return value.b() ? "True" : "False";
+  } else if (type == "type") {
+    return DataTypeToPython(value.type(), dtype_module);
+  } else if (type == "shape") {
+    return ShapeToPython(value.shape());
+  } else if (type == "tensor") {
+    return TensorToPython(value.tensor());
+  } else if (type == "func") {
+    return StringToPython(value.func().name());
+  } else if (str_util::StartsWith(type, "list(")) {
+    return strings::StrCat("[", AttrListToPython(value, dtype_module), "]");
+  } else {
+    return "?";
+  }
+}
+
+void GenerateLowerCaseOpName(const string& str, string* result) {
+  const char joiner = '_';
+  const int last_index = str.size() - 1;
+  for (int i = 0; i <= last_index; ++i) {
+    const char c = str[i];
+    // Emit a joiner only if a previous-lower-to-now-upper or a
+    // now-upper-to-next-lower transition happens.
+    if (isupper(c) && (i > 0)) {
+      if (islower(str[i - 1]) || ((i < last_index) && islower(str[i + 1]))) {
+        result->push_back(joiner);
+      }
+    }
+    result->push_back(tolower(c));
+  }
+}
+
+static void AddDelimiter(string* append_to, const string& delim) {
+  if (!append_to->empty()) strings::StrAppend(append_to, delim);
+}
+
+const ApiDef::Attr* FindAttr(StringPiece name, const ApiDef& api_def) {
+  for (int i = 0; i < api_def.attr_size(); ++i) {
+    if (api_def.attr(i).name() == name) {
+      return &api_def.attr(i);
+    }
+  }
+  return nullptr;
+}
+
+const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
+  for (int i = 0; i < api_def.in_arg_size(); ++i) {
+    if (api_def.in_arg(i).name() == name) {
+      return &api_def.in_arg(i);
+    }
+  }
+  return nullptr;
+}
+
+GenPythonOp::GenPythonOp(const OpDef& op_def, const ApiDef& api_def,
+                         const string& function_name)
+    : op_def_(op_def),
+      api_def_(api_def),
+      function_name_(function_name),
+      num_outs_(op_def.output_arg_size()) {}
+
+GenPythonOp::~GenPythonOp() {}
+
+string GenPythonOp::Code() {
+  // This has all the input args followed by those attrs that don't have
+  // defaults.
+  std::vector<ParamNames> params_no_default;
+  // The parameters with defaults (these have to be listed after those without).
+  // No input args are included, just attrs.
+  std::vector<ParamNames> params_with_default;
+
+  for (int i = 0; i < api_def_.arg_order_size(); ++i) {
+    const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
+    const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
+    params_no_default.emplace_back(api_def_arg.name(), api_def_arg.rename_to());
+    if (!arg.type_attr().empty()) {
+      gtl::InsertIfNotPresent(&inferred_attrs_, arg.type_attr(), arg.name());
+    } else if (!arg.type_list_attr().empty()) {
+      gtl::InsertIfNotPresent(&inferred_attrs_, arg.type_list_attr(),
+                              arg.name());
+    }
+    if (!arg.number_attr().empty()) {
+      gtl::InsertIfNotPresent(&inferred_attrs_, arg.number_attr(), arg.name());
+    }
+  }
+  for (int i = 0; i < api_def_.attr_size(); ++i) {
+    const auto& attr(api_def_.attr(i));
+    // Do not add inferred attrs to the Python function signature.
+    if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
+      if (attr.has_default_value()) {
+        params_with_default.emplace_back(attr.name(), attr.rename_to());
+      } else {
+        params_no_default.emplace_back(attr.name(), attr.rename_to());
+      }
+    }
+  }
+
+  // Save the list of attr parameters (attrs that won't be inferred),
+  // those with defaults go at the end.
+  // Get the attrs in the order we want by taking the attrs without defaults
+  // from the end of args_no_default, and adding args_no_default.
+  attrs_.reserve(params_no_default.size() - op_def_.input_arg_size() +
+                 params_with_default.size());
+  for (int i = op_def_.input_arg_size(); i < params_no_default.size(); ++i) {
+    attrs_.push_back(params_no_default[i].GetName());
+  }
+  for (int i = 0; i < params_with_default.size(); ++i) {
+    attrs_.push_back(params_with_default[i].GetName());
+  }
+
+  param_names_.reserve(params_no_default.size() + params_with_default.size());
+  param_names_.insert(param_names_.begin(), params_no_default.begin(),
+                      params_no_default.end());
+  for (const auto& param : params_with_default) {
+    param_names_.push_back(param);
+  }
+
+  string parameters;
+  for (const auto& param : params_no_default) {
+    AddDelimiter(&parameters, ", ");
+    strings::StrAppend(&parameters, param.GetRenameTo());
+  }
+  for (const auto& param_and_default : params_with_default) {
+    AddDelimiter(&parameters, ", ");
+    strings::StrAppend(&parameters, param_and_default.GetRenameTo(), "=None");
+  }
+  AddDelimiter(&parameters, ", ");
+  strings::StrAppend(&parameters, "name=None");
+
+  AddExport();
+  AddDefLine(parameters);
+  AddDocStringDescription();
+  AddDocStringArgs();
+  AddDocStringInputs();
+  AddDocStringAttrs();
+  AddDocStringNameArg();
+  AddOutputGlobals();
+  AddDocStringOutputs();
+  strings::StrAppend(&result_, "  \"\"\"\n");
+  AddBody("  ");
+  strings::StrAppend(&result_, "\n\n");
+
+  return prelude_ + result_;
+}
+
+void GenPythonOp::AddExport() {
+  if (api_def_.visibility() != ApiDef::VISIBLE) {
+    return;
+  }
+
+  strings::StrAppend(&result_, "@tf_export(");
+
+  // Add all endpoint names to tf_export.
+  bool first_endpoint = true;
+  for (const auto& endpoint : api_def_.endpoint()) {
+    if (!first_endpoint) {
+      strings::StrAppend(&result_, ", ");
+    } else {
+      first_endpoint = false;
+    }
+    string endpoint_name;
+    python_op_gen_internal::GenerateLowerCaseOpName(endpoint.name(),
+                                                    &endpoint_name);
+    strings::StrAppend(&result_, "'", endpoint_name, "'");
+  }
+  strings::StrAppend(&result_, ")\n");
+}
+
+void GenPythonOp::AddDefLine(const string& function_name,
+                             const string& parameters) {
+  strings::StrAppend(&result_, "def ", function_name, "(", parameters, "):\n");
+}
+
+void GenPythonOp::AddDefLine(const string& parameters) {
+  AddDefLine(function_name_, parameters);
+}
+
+void GenPythonOp::AddDocStringDescription() {
+  string comment;
+  if (api_def_.summary().empty()) {
+    comment = "TODO: add doc.\n";
+  } else {
+    comment = strings::StrCat(api_def_.summary(), "\n");
+    if (!api_def_.description().empty()) {
+      strings::StrAppend(&comment, "\n", Indent(2, 2, api_def_.description()));
+    }
+  }
+  strings::StrAppend(&result_, "  r\"\"\"", comment, "\n");
+}
+
+void GenPythonOp::AddDocStringArgs() {
+  strings::StrAppend(&result_, "  Args:\n");
+}
+
+void GenPythonOp::AddDocStringInputs() {
+  for (int i = 0; i < api_def_.arg_order_size(); ++i) {
+    const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
+    const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
+    StringPiece description = api_def_arg.description();
+    string desc;
+    if (ConsumeEquals(&description)) {  // Skip the generated type info.
+      desc = strings::StrCat(param_names_[i].GetRenameTo(), ": ");
+    } else {
+      desc = strings::StrCat(param_names_[i].GetRenameTo(), ": ",
+                             ArgTypeName(op_def_, arg, inferred_attrs_, false));
+    }
+    if (!description.empty()) {
+      AppendWithinWidth(&desc, description, kRightMargin - 4 /* indent */);
+    }
+    strings::StrAppend(&result_, Indent(4, 6, desc));
+  }
+}
+
+void GenPythonOp::AddDocStringAttrs() {
+  for (const string& name : attrs_) {
+    const auto& attr = *FindAttr(name, op_def_);
+    const auto& api_def_attr = *FindAttr(name, api_def_);
+    string desc =
+        strings::StrCat(AvoidPythonReserved(api_def_attr.rename_to()), ": ");
+
+    static const char* const kAttrTypeName[][2] = {
+        {"string", "`string`"},
+        {"list(string)", "list of `strings`"},
+        {"int", "`int`"},
+        {"list(int)", "list of `ints`"},
+        {"float", "`float`"},
+        {"list(float)", "list of `floats`"},
+        {"bool", "`bool`"},
+        {"list(bool)", "list of `bools`"},
+        {"type", "`tf.DType`"},
+        {"list(type)", "list of `tf.DTypes`"},
+        {"shape", "`tf.TensorShape` or list of `ints`"},
+        {"list(shape)",
+         "list of shapes (each a `tf.TensorShape` or list of `ints`)"},
+        {"tensor", "`tf.TensorProto`"},
+        {"list(tensor)", "list of `tf.TensorProto` objects"},
+        {"func", "function decorated with @Defun"},
+        {"list(func)", "list of functions decorated with @Defun"},
+    };
+    for (size_t i = 0; i < TF_ARRAYSIZE(kAttrTypeName); ++i) {
+      if (attr.type() == kAttrTypeName[i][0]) {
+        string s;
+        if (api_def_attr.has_default_value()) {
+          s = strings::StrCat("optional ", kAttrTypeName[i][1]);
+        } else {
+          s = kAttrTypeName[i][1];
+        }
+        if (s[0] == 'o' || (s[0] == '`' && (s[1] == 'i' || s[1] == 'o'))) {
+          strings::StrAppend(&desc, "An ", s);
+        } else {
+          strings::StrAppend(&desc, "A ", s);
+        }
+        break;
+      }
+    }
+
+    if (attr.has_allowed_values()) {
+      strings::StrAppend(&desc, " from: `",
+                         AttrListToPython(attr.allowed_values()), "`");
+    }
+
+    if (attr.has_minimum()) {
+      if (attr.type() == "int") {
+        strings::StrAppend(&desc, " that is `>= ", attr.minimum(), "`");
+      } else if (attr.minimum() > 0) {
+        strings::StrAppend(&desc, " that has length `>= ", attr.minimum(), "`");
+      }
+    }
+
+    strings::StrAppend(&desc, ".");
+
+    if (api_def_attr.has_default_value()) {
+      strings::StrAppend(
+          &desc, " Defaults to `",
+          AttrValueToPython(attr.type(), api_def_attr.default_value()), "`.");
+    }
+    if (!api_def_attr.description().empty()) {
+      AppendWithinWidth(&desc, api_def_attr.description(),
+                        kRightMargin - 4 /* indent */);
+    }
+    strings::StrAppend(&result_, Indent(4, 6, desc));
+  }
+}
+
+void GenPythonOp::AddDocStringNameArg() {
+  strings::StrAppend(&result_,
+                     "    name: A name for the operation (optional).\n");
+}
+
+void GenPythonOp::AddOutputGlobals() {
+  // Prepare a NamedTuple type to hold the outputs, if there are multiple
+  if (num_outs_ > 1) {
+    // Prepare the list of output names
+    std::vector<string> out_names(num_outs_);
+    for (int i = 0; i < num_outs_; ++i) {
+      if (!api_def_.out_arg(i).rename_to().empty()) {
+        out_names[i] = api_def_.out_arg(i).rename_to();
+      } else {
+        out_names[i] = strings::StrCat("output", i);
+      }
+    }
+    string out_names_list =
+        strings::StrCat("[\"", str_util::Join(out_names, "\", \""), "\"]");
+
+    // Provide the output names as a Python list
+    string lower_op_name_outputs =
+        strings::StrCat("_", function_name_, "_outputs");
+    const string outputs_prefix = strings::StrCat(lower_op_name_outputs, " = ");
+    strings::StrAppend(&prelude_, "\n",
+                       WordWrap(outputs_prefix, out_names_list, kRightMargin),
+                       "\n");
+
+    strings::StrAppend(&prelude_, "_", op_def_.name(),
+                       "Output = _collections.namedtuple(\n");
+    const string tuple_type_prefix = "    ";
+    const string tuple_type_suffix = strings::StrCat(
+        "\"", op_def_.name(), "\", ", lower_op_name_outputs, ")");
+    strings::StrAppend(
+        &prelude_, WordWrap(tuple_type_prefix, tuple_type_suffix, kRightMargin),
+        "\n\n");
+  }
+  strings::StrAppend(&prelude_, "\n");
+}
+
+void GenPythonOp::AddDocStringOutputs() {
+  std::vector<string> output_type_string;
+  output_type_string.reserve(num_outs_);
+  for (int i = 0; i < num_outs_; ++i) {
+    output_type_string.push_back(
+        ArgTypeName(op_def_, op_def_.output_arg(i), inferred_attrs_, true));
+  }
+  strings::StrAppend(&result_, GetReturns(op_def_, output_type_string));
+}
+
+void GenPythonOp::AddBody(const string& prefix) {
+  const string apply_prefix =
+      strings::StrCat(prefix, "_result = _op_def_lib.apply_op(");
+  AddBodyNoReturn(apply_prefix);
+  if (num_outs_ > 1) {
+    strings::StrAppend(&result_, prefix, "_result = _", op_def_.name(),
+                       "Output._make(_result)\n");
+  }
+  strings::StrAppend(&result_, prefix, "return _result\n");
+}
+
+void GenPythonOp::AddBodyNoReturn(const string& apply_prefix) {
+  string args = strings::StrCat("\"", op_def_.name(), "\", ");
+  for (size_t i = 0; i < param_names_.size(); ++i) {
+    strings::StrAppend(&args, AvoidPythonReserved(param_names_[i].GetName()),
+                       "=", param_names_[i].GetRenameTo(), ", ");
+  }
+  strings::StrAppend(&args, "name=name)");
+
+  strings::StrAppend(&result_,
+                     // Wrap the arguments, and indent to the (.
+                     WordWrap(apply_prefix, args, kRightMargin), "\n");
+}
+
+}  // namespace python_op_gen_internal
+}  // namespace tensorflow
diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc
index ca6ed42beec4a3d5ff70d5a605ab006265d1cce9..8eb943b960800e6d82a39ac96afb78ae73b77c77 100644
--- a/tensorflow/python/framework/python_op_gen_main.cc
+++ b/tensorflow/python/framework/python_op_gen_main.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/python/eager/python_eager_op_gen.h"
+#include "tensorflow/python/framework/python_op_gen.h"
 
 #include <memory>
 #include <string>
@@ -133,11 +133,10 @@ void PrintAllPythonOps(const std::vector<string>& op_list,
         *pruned_ops.mutable_op()->Add() = op_def;
       }
     }
-    PrintEagerPythonOps(pruned_ops, api_def_map, {}, require_shapes,
-                        source_file_name);
+    PrintPythonOps(pruned_ops, api_def_map, {}, require_shapes,
+                   source_file_name);
   } else {
-    PrintEagerPythonOps(ops, api_def_map, op_list, require_shapes,
-                        source_file_name);
+    PrintPythonOps(ops, api_def_map, op_list, require_shapes, source_file_name);
   }
 }
 
diff --git a/tensorflow/python/framework/smart_cond_test.py b/tensorflow/python/framework/smart_cond_test.py
index 1170a41c99995ae875e58a2d5491e05bc1e40df6..b8a9672b06da9b24d567a9779fb703ac7178d411 100644
--- a/tensorflow/python/framework/smart_cond_test.py
+++ b/tensorflow/python/framework/smart_cond_test.py
@@ -33,7 +33,6 @@ def raise_exception():
   raise RuntimeError("did not expect to be called")
 
 
-@test_util.with_c_api
 class SmartCondTest(test_util.TensorFlowTestCase):
 
   def testTrue(self):
@@ -64,9 +63,6 @@ class SmartCondTest(test_util.TensorFlowTestCase):
         self.assertEqual(y.eval(feed_dict={x: -1}), 2)
 
   def testEval(self):
-    # Constant expression evaluation only works with the C API enabled.
-    if not ops._USE_C_API: return
-
     with ops.Graph().as_default():
       with session.Session():
         x = constant_op.constant(1)
@@ -101,7 +97,6 @@ class SmartCondTest(test_util.TensorFlowTestCase):
           smart_cond.smart_cond(True, lambda: x)
 
 
-@test_util.with_c_api
 class SmartCaseTest(test_util.TensorFlowTestCase):
 
   def testTrue(self):
@@ -130,9 +125,6 @@ class SmartCaseTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(z), 1)
 
   def testMix(self):
-    # Constant expression evaluation only works with the C API enabled.
-    if not ops._USE_C_API: return
-
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     y = constant_op.constant(10)
     conditions = [(x > 1, lambda: constant_op.constant(1)),
@@ -145,7 +137,6 @@ class SmartCaseTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(z, feed_dict={x: 0}), 3)
 
 
-@test_util.with_c_api
 class SmartConstantValueTest(test_util.TensorFlowTestCase):
 
   # TODO(skyewm): this is essentially a regression test for
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 1fe81e5f17a7de0a113596d920d63e5d9474c7c1..6a5c6468f77382b2b7e62a6a49d4fb637fed4dc0 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import collections
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -225,6 +226,7 @@ class SparseTensor(_TensorLike):
 SparseTensorValue = collections.namedtuple(
     "SparseTensorValue", ["indices", "values", "dense_shape"])
 tf_export("SparseTensorValue")(SparseTensorValue)
+pywrap_tensorflow.RegisterSparseTensorValueClass(SparseTensorValue)
 
 
 @tf_export("convert_to_tensor_or_sparse_tensor")
diff --git a/tensorflow/python/framework/subscribe_test.py b/tensorflow/python/framework/subscribe_test.py
index 8b95b25e82a1886c43e08f47a612300750643fb1..d6de45fdc41d8ea8f07458e37374f36d62d0fc4b 100644
--- a/tensorflow/python/framework/subscribe_test.py
+++ b/tensorflow/python/framework/subscribe_test.py
@@ -36,7 +36,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
-@test_util.with_c_api
 class SubscribeTest(test_util.TensorFlowTestCase):
 
   def _ExpectSubscribedIdentities(self, container):
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 8cf24206edab8be807eca1d067662a57585e2bda..ca63efbc84dab20850845841e9e212a681b6bb06 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -50,6 +50,13 @@ def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
       [ExtractBitsFromFloat16(x) for x in proto_values])
 
 
+def _MediumAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
+  # TODO: Remove the conversion if cython supports np.float16_t
+  fast_tensor_util.AppendFloat16ArrayToTensorProto(
+      tensor_proto,
+      np.asarray(proto_values, dtype=np.float16).view(np.uint16))
+
+
 def ExtractBitsFromBFloat16(x):
   return np.asscalar(
       np.asarray(x, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16))
@@ -64,11 +71,8 @@ if _FAST_TENSOR_UTIL_AVAILABLE:
   _NP_TO_APPEND_FN = {
       dtypes.bfloat16.as_numpy_dtype:
           SlowAppendBFloat16ArrayToTensorProto,
-      # TODO(sesse): We should have a
-      # fast_tensor_util.AppendFloat16ArrayToTensorProto,
-      # but it seems np.float16_t doesn't exist?
       np.float16:
-          SlowAppendFloat16ArrayToTensorProto,
+          _MediumAppendFloat16ArrayToTensorProto,
       np.float32:
           fast_tensor_util.AppendFloat32ArrayToTensorProto,
       np.float64:
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 5e02e7e3ec605e41deffad5e3faf1c9f8a2679ef..5b01df48fea19fa473658103341998b29920170c 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -639,6 +639,14 @@ def assert_no_garbage_created(f):
   return decorator
 
 
+def run_all_in_graph_and_eager_modes(cls):
+  base_decorator = run_in_graph_and_eager_modes()
+  for name, value in cls.__dict__.copy().items():
+    if callable(value) and name.startswith("test"):
+      setattr(cls, name, base_decorator(value))
+  return cls
+
+
 def run_in_graph_and_eager_modes(__unused__=None,
                                  config=None,
                                  use_gpu=True,
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index 8d492256aac17d48dc5ac801a4eaf019fbe277de..0f53762f6fab16f3fef2f5511a11acf53467e250 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -44,7 +44,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
-@test_util.with_c_api
 class TestUtilTest(test_util.TensorFlowTestCase):
 
   def test_assert_ops_in_graph(self):
@@ -597,7 +596,6 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self.assertIsNone(test_util.get_node_def_from_graph("bar", graph_def))
 
 
-@test_util.with_c_api
 class GarbageCollectionTest(test_util.TensorFlowTestCase):
 
   def test_no_reference_cycle_decorator(self):
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index 3f9d8864a2b4b750a33d14161fd18d764f68d7bf..7ed4b128e495c484d294ece40541427f21856cf1 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -25,7 +25,6 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
@@ -35,7 +34,6 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import training as train
 
 
-@test_util.with_c_api
 class MemoryOptimizerSwapTest(test.TestCase):
   """Tests the Grappler memory optimizer."""
 
@@ -96,7 +94,6 @@ class MemoryOptimizerSwapTest(test.TestCase):
         self.assertEqual('c', node.input[1])
 
 
-@test_util.with_c_api
 class MemoryOptimizerRecomputeTest(test.TestCase):
   """Tests the Python interface to recomputation rewrites.
 
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 1b66f589397527d62155c456b22f4c67ff470158..5d730695b9f818137da6f6ced821a941a4518f71 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -19,76 +19,37 @@ py_library(
     name = "keras",
     srcs = [
         "__init__.py",
-        "_impl/keras/__init__.py",
-        "_impl/keras/applications/__init__.py",
-        "_impl/keras/applications/densenet.py",
-        "_impl/keras/applications/imagenet_utils.py",
-        "_impl/keras/applications/inception_resnet_v2.py",
-        "_impl/keras/applications/inception_v3.py",
-        "_impl/keras/applications/mobilenet.py",
-        "_impl/keras/applications/nasnet.py",
-        "_impl/keras/applications/resnet50.py",
-        "_impl/keras/applications/vgg16.py",
-        "_impl/keras/applications/vgg19.py",
-        "_impl/keras/applications/xception.py",
-        "_impl/keras/datasets/__init__.py",
-        "_impl/keras/datasets/boston_housing.py",
-        "_impl/keras/datasets/cifar.py",
-        "_impl/keras/datasets/cifar10.py",
-        "_impl/keras/datasets/cifar100.py",
-        "_impl/keras/datasets/fashion_mnist.py",
-        "_impl/keras/datasets/imdb.py",
-        "_impl/keras/datasets/mnist.py",
-        "_impl/keras/datasets/reuters.py",
-        "_impl/keras/estimator.py",
-        "_impl/keras/preprocessing/__init__.py",
-        "_impl/keras/preprocessing/image.py",
-        "_impl/keras/preprocessing/sequence.py",
-        "_impl/keras/preprocessing/text.py",
-        "_impl/keras/testing_utils.py",
-        "_impl/keras/utils/__init__.py",
-        "_impl/keras/utils/multi_gpu_utils.py",
-        "_impl/keras/utils/np_utils.py",
-        "_impl/keras/utils/vis_utils.py",
-        "_impl/keras/wrappers/__init__.py",
-        "_impl/keras/wrappers/scikit_learn.py",
-        "activations/__init__.py",
         "applications/__init__.py",
-        "applications/densenet/__init__.py",
-        "applications/inception_resnet_v2/__init__.py",
-        "applications/inception_v3/__init__.py",
-        "applications/mobilenet/__init__.py",
-        "applications/nasnet/__init__.py",
-        "applications/resnet50/__init__.py",
-        "applications/vgg16/__init__.py",
-        "applications/vgg19/__init__.py",
-        "applications/xception/__init__.py",
-        "backend/__init__.py",
-        "callbacks/__init__.py",
-        "constraints/__init__.py",
+        "applications/densenet.py",
+        "applications/imagenet_utils.py",
+        "applications/inception_resnet_v2.py",
+        "applications/inception_v3.py",
+        "applications/mobilenet.py",
+        "applications/nasnet.py",
+        "applications/resnet50.py",
+        "applications/vgg16.py",
+        "applications/vgg19.py",
+        "applications/xception.py",
         "datasets/__init__.py",
-        "datasets/boston_housing/__init__.py",
-        "datasets/cifar10/__init__.py",
-        "datasets/cifar100/__init__.py",
-        "datasets/fashion_mnist/__init__.py",
-        "datasets/imdb/__init__.py",
-        "datasets/mnist/__init__.py",
-        "datasets/reuters/__init__.py",
-        "estimator/__init__.py",
-        "initializers/__init__.py",
-        "layers/__init__.py",
-        "losses/__init__.py",
-        "metrics/__init__.py",
-        "models/__init__.py",
-        "optimizers/__init__.py",
+        "datasets/boston_housing.py",
+        "datasets/cifar.py",
+        "datasets/cifar10.py",
+        "datasets/cifar100.py",
+        "datasets/fashion_mnist.py",
+        "datasets/imdb.py",
+        "datasets/mnist.py",
+        "datasets/reuters.py",
         "preprocessing/__init__.py",
-        "preprocessing/image/__init__.py",
-        "preprocessing/sequence/__init__.py",
-        "preprocessing/text/__init__.py",
-        "regularizers/__init__.py",
+        "preprocessing/image.py",
+        "preprocessing/sequence.py",
+        "preprocessing/text.py",
+        "testing_utils.py",
         "utils/__init__.py",
+        "utils/multi_gpu_utils.py",
+        "utils/np_utils.py",
+        "utils/vis_utils.py",
         "wrappers/__init__.py",
-        "wrappers/scikit_learn/__init__.py",
+        "wrappers/scikit_learn.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
@@ -99,8 +60,6 @@ py_library(
         ":backend",
         ":engine",
         ":layers",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/saved_model",
         "//tensorflow/python:training",
     ],
@@ -108,7 +67,7 @@ py_library(
 
 py_library(
     name = "backend",
-    srcs = ["_impl/keras/backend.py"],
+    srcs = ["backend.py"],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/core:protos_all_py",
@@ -149,28 +108,28 @@ py_library(
 py_library(
     name = "engine",
     srcs = [
-        "_impl/keras/activations.py",
-        "_impl/keras/callbacks.py",
-        "_impl/keras/constraints.py",
-        "_impl/keras/engine/__init__.py",
-        "_impl/keras/engine/base_layer.py",
-        "_impl/keras/engine/input_layer.py",
-        "_impl/keras/engine/network.py",
-        "_impl/keras/engine/saving.py",
-        "_impl/keras/engine/sequential.py",
-        "_impl/keras/engine/training.py",
-        "_impl/keras/engine/training_arrays.py",
-        "_impl/keras/engine/training_eager.py",
-        "_impl/keras/engine/training_generator.py",
-        "_impl/keras/engine/training_utils.py",
-        "_impl/keras/initializers.py",
-        "_impl/keras/losses.py",
-        "_impl/keras/metrics.py",
-        "_impl/keras/models.py",
-        "_impl/keras/optimizers.py",
-        "_impl/keras/regularizers.py",
-        "_impl/keras/utils/data_utils.py",
-        "_impl/keras/utils/io_utils.py",
+        "activations.py",
+        "callbacks.py",
+        "constraints.py",
+        "engine/__init__.py",
+        "engine/base_layer.py",
+        "engine/input_layer.py",
+        "engine/network.py",
+        "engine/saving.py",
+        "engine/sequential.py",
+        "engine/training.py",
+        "engine/training_arrays.py",
+        "engine/training_eager.py",
+        "engine/training_generator.py",
+        "engine/training_utils.py",
+        "initializers.py",
+        "losses.py",
+        "metrics.py",
+        "models.py",
+        "optimizers.py",
+        "regularizers.py",
+        "utils/data_utils.py",
+        "utils/io_utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -183,25 +142,25 @@ py_library(
 py_library(
     name = "layers",
     srcs = [
-        "_impl/keras/layers/__init__.py",
-        "_impl/keras/layers/advanced_activations.py",
-        "_impl/keras/layers/convolutional.py",
-        "_impl/keras/layers/convolutional_recurrent.py",
-        "_impl/keras/layers/core.py",
-        "_impl/keras/layers/cudnn_recurrent.py",
-        "_impl/keras/layers/embeddings.py",
-        "_impl/keras/layers/local.py",
-        "_impl/keras/layers/merge.py",
-        "_impl/keras/layers/noise.py",
-        "_impl/keras/layers/normalization.py",
-        "_impl/keras/layers/pooling.py",
-        "_impl/keras/layers/recurrent.py",
-        "_impl/keras/layers/serialization.py",
-        "_impl/keras/layers/wrappers.py",
-        "_impl/keras/utils/conv_utils.py",
-        "_impl/keras/utils/generic_utils.py",
-        "_impl/keras/utils/layer_utils.py",
-        "_impl/keras/utils/tf_utils.py",
+        "layers/__init__.py",
+        "layers/advanced_activations.py",
+        "layers/convolutional.py",
+        "layers/convolutional_recurrent.py",
+        "layers/core.py",
+        "layers/cudnn_recurrent.py",
+        "layers/embeddings.py",
+        "layers/local.py",
+        "layers/merge.py",
+        "layers/noise.py",
+        "layers/normalization.py",
+        "layers/pooling.py",
+        "layers/recurrent.py",
+        "layers/serialization.py",
+        "layers/wrappers.py",
+        "utils/conv_utils.py",
+        "utils/generic_utils.py",
+        "utils/layer_utils.py",
+        "utils/tf_utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -228,7 +187,7 @@ py_library(
 py_test(
     name = "integration_test",
     size = "medium",
-    srcs = ["_impl/keras/integration_test.py"],
+    srcs = ["integration_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -243,7 +202,7 @@ py_test(
 py_test(
     name = "activations_test",
     size = "small",
-    srcs = ["_impl/keras/activations_test.py"],
+    srcs = ["activations_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -255,7 +214,7 @@ py_test(
 py_test(
     name = "constraints_test",
     size = "small",
-    srcs = ["_impl/keras/constraints_test.py"],
+    srcs = ["constraints_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -267,7 +226,7 @@ py_test(
 py_test(
     name = "initializers_test",
     size = "small",
-    srcs = ["_impl/keras/initializers_test.py"],
+    srcs = ["initializers_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -280,7 +239,7 @@ py_test(
 py_test(
     name = "regularizers_test",
     size = "small",
-    srcs = ["_impl/keras/regularizers_test.py"],
+    srcs = ["regularizers_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -291,7 +250,7 @@ py_test(
 py_test(
     name = "optimizers_test",
     size = "medium",
-    srcs = ["_impl/keras/optimizers_test.py"],
+    srcs = ["optimizers_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -305,7 +264,7 @@ py_test(
 py_test(
     name = "losses_test",
     size = "small",
-    srcs = ["_impl/keras/losses_test.py"],
+    srcs = ["losses_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -316,8 +275,8 @@ py_test(
 
 py_test(
     name = "metrics_test",
-    size = "small",
-    srcs = ["_impl/keras/metrics_test.py"],
+    size = "medium",
+    srcs = ["metrics_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "manual",
@@ -334,7 +293,7 @@ py_test(
 py_test(
     name = "densenet_test",
     size = "large",
-    srcs = ["_impl/keras/applications/densenet_test.py"],
+    srcs = ["applications/densenet_test.py"],
     srcs_version = "PY2AND3",
     tags = ["nomsan"],  # times out, http://b/78650237
     deps = [
@@ -347,7 +306,7 @@ py_test(
 py_test(
     name = "inception_resnet_v2_test",
     size = "medium",
-    srcs = ["_impl/keras/applications/inception_resnet_v2_test.py"],
+    srcs = ["applications/inception_resnet_v2_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -359,7 +318,7 @@ py_test(
 py_test(
     name = "inception_v3_test",
     size = "medium",
-    srcs = ["_impl/keras/applications/inception_v3_test.py"],
+    srcs = ["applications/inception_v3_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -371,7 +330,7 @@ py_test(
 py_test(
     name = "mobilenet_test",
     size = "medium",
-    srcs = ["_impl/keras/applications/mobilenet_test.py"],
+    srcs = ["applications/mobilenet_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -383,7 +342,7 @@ py_test(
 py_test(
     name = "nasnet_test",
     size = "large",
-    srcs = ["_impl/keras/applications/nasnet_test.py"],
+    srcs = ["applications/nasnet_test.py"],
     srcs_version = "PY2AND3",
     tags = ["nomsan"],  # times out, http://b/78573625
     deps = [
@@ -395,8 +354,8 @@ py_test(
 
 py_test(
     name = "resnet50_test",
-    size = "small",
-    srcs = ["_impl/keras/applications/resnet50_test.py"],
+    size = "medium",
+    srcs = ["applications/resnet50_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -407,7 +366,7 @@ py_test(
 py_test(
     name = "vgg16_test",
     size = "small",
-    srcs = ["_impl/keras/applications/vgg16_test.py"],
+    srcs = ["applications/vgg16_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -418,7 +377,7 @@ py_test(
 py_test(
     name = "vgg19_test",
     size = "small",
-    srcs = ["_impl/keras/applications/vgg19_test.py"],
+    srcs = ["applications/vgg19_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -429,7 +388,7 @@ py_test(
 py_test(
     name = "xception_test",
     size = "medium",
-    srcs = ["_impl/keras/applications/xception_test.py"],
+    srcs = ["applications/xception_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -441,7 +400,7 @@ py_test(
 py_test(
     name = "advanced_activations_test",
     size = "small",
-    srcs = ["_impl/keras/layers/advanced_activations_test.py"],
+    srcs = ["layers/advanced_activations_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -452,7 +411,7 @@ py_test(
 py_test(
     name = "convolutional_recurrent_test",
     size = "large",
-    srcs = ["_impl/keras/layers/convolutional_recurrent_test.py"],
+    srcs = ["layers/convolutional_recurrent_test.py"],
     shard_count = 2,
     srcs_version = "PY2AND3",
     deps = [
@@ -465,7 +424,7 @@ py_test(
 py_test(
     name = "convolutional_test",
     size = "large",
-    srcs = ["_impl/keras/layers/convolutional_test.py"],
+    srcs = ["layers/convolutional_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "manual",
@@ -482,7 +441,7 @@ py_test(
 cuda_py_test(
     name = "cudnn_recurrent_test",
     size = "large",
-    srcs = ["_impl/keras/layers/cudnn_recurrent_test.py"],
+    srcs = ["layers/cudnn_recurrent_test.py"],
     additional_deps = [
         ":keras",
         "@absl_py//absl/testing:parameterized",
@@ -495,7 +454,7 @@ cuda_py_test(
 py_test(
     name = "pooling_test",
     size = "small",
-    srcs = ["_impl/keras/layers/pooling_test.py"],
+    srcs = ["layers/pooling_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -506,7 +465,7 @@ py_test(
 py_test(
     name = "core_test",
     size = "medium",
-    srcs = ["_impl/keras/layers/core_test.py"],
+    srcs = ["layers/core_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -518,7 +477,7 @@ py_test(
 py_test(
     name = "embeddings_test",
     size = "small",
-    srcs = ["_impl/keras/layers/embeddings_test.py"],
+    srcs = ["layers/embeddings_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -529,7 +488,7 @@ py_test(
 py_test(
     name = "local_test",
     size = "medium",
-    srcs = ["_impl/keras/layers/local_test.py"],
+    srcs = ["layers/local_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -541,7 +500,7 @@ py_test(
 py_test(
     name = "merge_test",
     size = "small",
-    srcs = ["_impl/keras/layers/merge_test.py"],
+    srcs = ["layers/merge_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -553,7 +512,7 @@ py_test(
 py_test(
     name = "noise_test",
     size = "small",
-    srcs = ["_impl/keras/layers/noise_test.py"],
+    srcs = ["layers/noise_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -563,8 +522,8 @@ py_test(
 
 py_test(
     name = "normalization_test",
-    size = "small",
-    srcs = ["_impl/keras/layers/normalization_test.py"],
+    size = "medium",
+    srcs = ["layers/normalization_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -577,7 +536,7 @@ py_test(
 py_test(
     name = "simplernn_test",
     size = "medium",
-    srcs = ["_impl/keras/layers/simplernn_test.py"],
+    srcs = ["layers/simplernn_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -590,7 +549,7 @@ py_test(
 py_test(
     name = "gru_test",
     size = "medium",
-    srcs = ["_impl/keras/layers/gru_test.py"],
+    srcs = ["layers/gru_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],  # http://b/62136390
     deps = [
@@ -603,7 +562,8 @@ py_test(
 py_test(
     name = "lstm_test",
     size = "medium",
-    srcs = ["_impl/keras/layers/lstm_test.py"],
+    srcs = ["layers/lstm_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
         "noasan",  # times out b/63678675
@@ -619,7 +579,7 @@ py_test(
 py_test(
     name = "recurrent_test",
     size = "medium",
-    srcs = ["_impl/keras/layers/recurrent_test.py"],
+    srcs = ["layers/recurrent_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -631,7 +591,7 @@ py_test(
 py_test(
     name = "serialization_test",
     size = "small",
-    srcs = ["_impl/keras/layers/serialization_test.py"],
+    srcs = ["layers/serialization_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -642,7 +602,8 @@ py_test(
 py_test(
     name = "wrappers_test",
     size = "medium",
-    srcs = ["_impl/keras/layers/wrappers_test.py"],
+    srcs = ["layers/wrappers_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
         "noasan",  # http://b/78599823
@@ -658,7 +619,7 @@ py_test(
 py_test(
     name = "scikit_learn_test",
     size = "small",
-    srcs = ["_impl/keras/wrappers/scikit_learn_test.py"],
+    srcs = ["wrappers/scikit_learn_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -671,7 +632,7 @@ py_test(
 py_test(
     name = "data_utils_test",
     size = "large",
-    srcs = ["_impl/keras/utils/data_utils_test.py"],
+    srcs = ["utils/data_utils_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "no_oss",
@@ -690,7 +651,7 @@ py_test(
 py_test(
     name = "generic_utils_test",
     size = "small",
-    srcs = ["_impl/keras/utils/generic_utils_test.py"],
+    srcs = ["utils/generic_utils_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -701,7 +662,7 @@ py_test(
 py_test(
     name = "io_utils_test",
     size = "small",
-    srcs = ["_impl/keras/utils/io_utils_test.py"],
+    srcs = ["utils/io_utils_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "no_windows",  # TODO: needs investigation on Windows
@@ -717,7 +678,7 @@ py_test(
 py_test(
     name = "np_utils_test",
     size = "small",
-    srcs = ["_impl/keras/utils/np_utils_test.py"],
+    srcs = ["utils/np_utils_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -728,7 +689,7 @@ py_test(
 
 cuda_py_test(
     name = "multi_gpu_utils_test",
-    srcs = ["_impl/keras/utils/multi_gpu_utils_test.py"],
+    srcs = ["utils/multi_gpu_utils_test.py"],
     additional_deps = [
         ":keras",
         "//third_party/py/numpy",
@@ -743,7 +704,7 @@ cuda_py_test(
 py_test(
     name = "imagenet_utils_test",
     size = "small",
-    srcs = ["_impl/keras/applications/imagenet_utils_test.py"],
+    srcs = ["applications/imagenet_utils_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -755,7 +716,7 @@ py_test(
 py_test(
     name = "image_test",
     size = "medium",
-    srcs = ["_impl/keras/preprocessing/image_test.py"],
+    srcs = ["preprocessing/image_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -767,7 +728,7 @@ py_test(
 py_test(
     name = "sequence_test",
     size = "small",
-    srcs = ["_impl/keras/preprocessing/sequence_test.py"],
+    srcs = ["preprocessing/sequence_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -779,7 +740,7 @@ py_test(
 py_test(
     name = "text_test",
     size = "small",
-    srcs = ["_impl/keras/preprocessing/text_test.py"],
+    srcs = ["preprocessing/text_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -791,7 +752,7 @@ py_test(
 py_test(
     name = "callbacks_test",
     size = "medium",
-    srcs = ["_impl/keras/callbacks_test.py"],
+    srcs = ["callbacks_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -804,7 +765,7 @@ py_test(
 py_test(
     name = "training_test",
     size = "medium",
-    srcs = ["_impl/keras/engine/training_test.py"],
+    srcs = ["engine/training_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -817,7 +778,7 @@ py_test(
 py_test(
     name = "training_eager_test",
     size = "medium",
-    srcs = ["_impl/keras/engine/training_eager_test.py"],
+    srcs = ["engine/training_eager_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -830,7 +791,7 @@ py_test(
 py_test(
     name = "model_subclassing_test",
     size = "medium",
-    srcs = ["_impl/keras/model_subclassing_test.py"],
+    srcs = ["model_subclassing_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -843,7 +804,7 @@ py_test(
 py_test(
     name = "topology_test",
     size = "small",
-    srcs = ["_impl/keras/engine/topology_test.py"],
+    srcs = ["engine/topology_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "no-internal-py3",
@@ -858,7 +819,7 @@ py_test(
 py_test(
     name = "saving_test",
     size = "medium",
-    srcs = ["_impl/keras/engine/saving_test.py"],
+    srcs = ["engine/saving_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -871,7 +832,7 @@ py_test(
 py_test(
     name = "sequential_test",
     size = "small",
-    srcs = ["_impl/keras/engine/sequential_test.py"],
+    srcs = ["engine/sequential_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -883,7 +844,7 @@ py_test(
 py_test(
     name = "models_test",
     size = "small",
-    srcs = ["_impl/keras/models_test.py"],
+    srcs = ["models_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],  # b/67509773
     deps = [
@@ -894,29 +855,10 @@ py_test(
     ],
 )
 
-py_test(
-    name = "estimator_test",
-    size = "large",
-    srcs = ["_impl/keras/estimator_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform",
-        "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/estimator:run_config",
-        "//third_party/py/numpy",
-    ],
-)
-
 py_test(
     name = "backend_test",
     size = "small",
-    srcs = ["_impl/keras/backend_test.py"],
+    srcs = ["backend_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -929,7 +871,7 @@ py_test(
 py_library(
     name = "testing_utils",
     srcs = [
-        "_impl/keras/testing_utils.py",
+        "testing_utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/python/keras/__init__.py b/tensorflow/python/keras/__init__.py
index f56be967ff5c066c1b119aed1ed8813490d2b045..197f3060970743d9bd245dc7018fa0503500d176 100644
--- a/tensorflow/python/keras/__init__.py
+++ b/tensorflow/python/keras/__init__.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,14 +21,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=wildcard-import
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import applications
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import datasets
-from tensorflow.python.keras import estimator
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import layers
 from tensorflow.python.keras import losses
@@ -40,11 +37,12 @@ from tensorflow.python.keras import preprocessing
 from tensorflow.python.keras import regularizers
 from tensorflow.python.keras import utils
 from tensorflow.python.keras import wrappers
-from tensorflow.python.keras._impl.keras import __version__
 from tensorflow.python.keras.layers import Input
 from tensorflow.python.keras.models import Model
 from tensorflow.python.keras.models import Sequential
 
+__version__ = '2.1.6-tf'
+
 del absolute_import
 del division
 del print_function
diff --git a/tensorflow/python/keras/_impl/keras/__init__.py b/tensorflow/python/keras/_impl/keras/__init__.py
deleted file mode 100644
index 53f5d31e9c5b861c551a7a9ca3700c383ea679d7..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/_impl/keras/__init__.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""The Keras API.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras import activations
-from tensorflow.python.keras._impl.keras import applications
-from tensorflow.python.keras._impl.keras import backend
-from tensorflow.python.keras._impl.keras import callbacks
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import datasets
-from tensorflow.python.keras._impl.keras import engine
-from tensorflow.python.keras._impl.keras import estimator
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import layers
-from tensorflow.python.keras._impl.keras import losses
-from tensorflow.python.keras._impl.keras import metrics
-from tensorflow.python.keras._impl.keras import models
-from tensorflow.python.keras._impl.keras import optimizers
-from tensorflow.python.keras._impl.keras import preprocessing
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras import utils
-from tensorflow.python.keras._impl.keras import wrappers
-from tensorflow.python.keras._impl.keras.layers import Input
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.models import Sequential
-
-__version__ = '2.1.5-tf'
diff --git a/tensorflow/python/keras/_impl/keras/applications/__init__.py b/tensorflow/python/keras/_impl/keras/applications/__init__.py
deleted file mode 100644
index 206a769b377483c65a78b76fe44055eb50bdc7c4..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/_impl/keras/applications/__init__.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras Applications: models with automatic loading of pre-trained weights.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet121
-from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet169
-from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet201
-from tensorflow.python.keras._impl.keras.applications.inception_resnet_v2 import InceptionResNetV2
-from tensorflow.python.keras._impl.keras.applications.inception_v3 import InceptionV3
-from tensorflow.python.keras._impl.keras.applications.mobilenet import MobileNet
-from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetLarge
-from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetMobile
-from tensorflow.python.keras._impl.keras.applications.resnet50 import ResNet50
-from tensorflow.python.keras._impl.keras.applications.vgg16 import VGG16
-from tensorflow.python.keras._impl.keras.applications.vgg19 import VGG19
-from tensorflow.python.keras._impl.keras.applications.xception import Xception
diff --git a/tensorflow/python/keras/_impl/keras/datasets/__init__.py b/tensorflow/python/keras/_impl/keras/datasets/__init__.py
deleted file mode 100644
index 60db3766fbce859269cecb92a537084ef18c0da5..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/_impl/keras/datasets/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras datasets: utilities for downloading and pre-processing common datasets.
-
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.datasets import boston_housing
-from tensorflow.python.keras._impl.keras.datasets import cifar10
-from tensorflow.python.keras._impl.keras.datasets import cifar100
-from tensorflow.python.keras._impl.keras.datasets import fashion_mnist
-from tensorflow.python.keras._impl.keras.datasets import imdb
-from tensorflow.python.keras._impl.keras.datasets import mnist
-from tensorflow.python.keras._impl.keras.datasets import reuters
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
deleted file mode 100644
index b9c99b22224480cb2d30f25e7969026fe39dc642..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py
+++ /dev/null
@@ -1,689 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras training and evaluation routines for eager execution.
-"""
-# pylint: disable=protected-access
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import copy
-
-import numpy as np
-
-from tensorflow.python.eager.backprop import GradientTape
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras._impl.keras import backend
-from tensorflow.python.keras._impl.keras import callbacks as cbks
-from tensorflow.python.keras._impl.keras import losses
-from tensorflow.python.keras._impl.keras import metrics as metrics_module
-from tensorflow.python.keras._impl.keras.engine import training_utils
-from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import tf_logging as logging
-
-
-def _get_metrics_info(metric, internal_output_shapes=None, loss_func=None):
-  if metric == 'accuracy' or metric == 'acc':
-    # custom handling of accuracy
-    # (because of class mode duality)
-    output_shape = internal_output_shapes
-    if output_shape[-1] == 1 or loss_func == losses.binary_crossentropy:
-      # case: binary accuracy
-      acc_fn = metrics_module.binary_accuracy
-    elif loss_func == losses.sparse_categorical_crossentropy:
-      # case: categorical accuracy with sparse targets
-      acc_fn = metrics_module.sparse_categorical_accuracy
-    else:
-      acc_fn = metrics_module.categorical_accuracy
-
-    metric_name = 'acc'
-    return metric_name, acc_fn
-  else:
-    metric_fn = metrics_module.get(metric)
-    metric_name = metric_fn.__name__
-    return metric_name, metric_fn
-
-
-def _eager_loss_fn(outputs, targets, loss_fn, output_name):
-  with backend.name_scope(output_name + '_loss'):
-    loss = loss_fn(targets, outputs)
-  return loss
-
-
-def _eager_metrics_fn(model, outputs, targets):
-  """Calculates the metrics for each output of the given model.
-
-  Arguments:
-      model: The model on which metrics are being calculated.
-      outputs: The outputs of the given model.
-      targets: The predictions or targets of the given model.
-
-  Returns:
-      Returns the metric names and metric results for each output of the model.
-  """
-  metric_names = []
-  metric_results = []
-  if not isinstance(outputs, list):
-    outputs = [outputs]
-
-  if not isinstance(targets, list):
-    targets = [targets]
-
-  for i in range(len(model.outputs)):
-    output_metrics = model.nested_metrics[i]
-    for nested_output_metric in output_metrics:
-      metric_name, metric_fn = _get_metrics_info(
-          nested_output_metric, backend.int_shape(model.outputs[i]),
-          model.loss_functions[i])
-
-      if len(model.output_names) > 1:
-        metric_name = model.output_names[i] + '_' + metric_name
-        if metric_name not in model.metrics_names:
-          model.metrics_names.append(metric_name)
-
-      with backend.name_scope(metric_name):
-        metric_result = metric_fn(targets[i], outputs[i])
-        metric_names.append(metric_name)
-        metric_results.append(backend.mean(metric_result))
-
-  return metric_results
-
-
-def _model_loss(model, inputs, targets, sample_weights=None, training=False):
-  """Calculates the loss for a given model.
-
-  Arguments:
-      model: The model on which metrics are being calculated.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      training: Whether the model should be run in inference or training mode.
-
-  Returns:
-     Returns the model output, total loss and loss value calculated using the
-     specified loss function. The total loss includes regularization losses and
-     applies masking and sample weighting to the loss value.
-  """
-  total_loss = 0
-  if len(inputs) == 1:
-    if model._expects_training_arg:
-      outs = model.call(inputs[0], training=training)
-    else:
-      outs = model.call(inputs[0])
-  else:
-    if model._expects_training_arg:
-      outs = model.call(inputs, training=training)
-    else:
-      outs = model.call(inputs)
-  if not isinstance(outs, list):
-    outs = [outs]
-
-  if not isinstance(targets, list):
-    targets = [targets]
-
-  loss_metrics = []
-  with backend.name_scope('loss'):
-    for i, loss_fn in enumerate(model.loss_functions):
-      if sample_weights:
-        weights = sample_weights[i]
-      else:
-        weights = None
-
-      # TODO(fchollet): support masking; in practice `_keras_mask` is never
-      # set in this context currently.
-      mask = outs[i]._keras_mask
-
-      weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
-      with backend.name_scope(model.output_names[i] + '_loss'):
-        output_loss = weighted_masked_fn(
-            targets[i], outs[i], weights, mask=mask)
-      # If the number of outputs is 1 then we don't append the loss metric
-      # associated with each model output. When there are multiple outputs
-      # associated with a model, each output's loss is calculated and returned
-      # as part of the loss_metrics.
-      if len(model.outputs) > 1:
-        loss_metrics.append(backend.mean(output_loss))
-
-      loss_weight = model.loss_weights_list[i]
-      if total_loss is None:
-        total_loss = loss_weight * output_loss
-      else:
-        total_loss += loss_weight * output_loss
-
-    total_loss = backend.mean(total_loss)
-    # Add regularization losses
-    custom_losses = []
-    for layer in model.layers:
-      if layer.losses:
-        custom_losses += layer.losses
-
-    if custom_losses:
-      total_loss += sum(custom_losses)
-
-  return outs, total_loss, loss_metrics
-
-
-def slice_arrays(arrays, indices, contiguous=True):
-  """Slices batches out of provided arrays (workaround for eager tensors).
-
-  Unfortunately eager tensors don't have the same slicing behavior as
-  Numpy arrays (they follow the same slicing behavior as symbolic TF tensors),
-  hence we cannot use `generic_utils.slice_arrays` directly
-  and we have to implement this workaround based on `concat`. This has a
-  performance cost.
-
-  Arguments:
-    arrays: Single array or list of arrays.
-    indices: List of indices in the array that should be included in the output
-      batch.
-    contiguous: Boolean flag indicating whether the indices are contiguous.
-
-  Returns:
-    Slice of data (either single array or list of arrays).
-  """
-  if any(tensor_util.is_tensor(x) for x in arrays):
-    converted_to_list = False
-    if not isinstance(arrays, list):
-      converted_to_list = True
-      arrays = [arrays]
-    if not contiguous:
-      entries = [[x[i:i + 1] for i in indices] for x in arrays]
-      slices = [array_ops.concat(x, axis=0) for x in entries]
-    else:
-      slices = [x[indices[0]:indices[-1] + 1] for x in arrays]
-    if converted_to_list:
-      slices = slices[0]
-    return slices
-  else:
-    return generic_utils.slice_arrays(arrays, indices)
-
-
-def _process_single_batch(model,
-                          inputs,
-                          targets,
-                          sample_weights=None,
-                          training=False):
-  """Calculate the loss and gradient for one input batch.
-
-     The model weights are updated if training is set to True.
-
-  Arguments:
-      model: Model whose loss has to be calculated.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      training: The boolean represents if the weights of the model are updated.
-              'fit' methods will set this to True while 'evaluate' methods will
-              set this to False.
-
-  Returns:
-      output of the model, total loss and the loss associated with each output.
-
-  Raises:
-      ValueError: If the model has no loss to optimize.
-  """
-  with backend.learning_phase_scope(1 if training else 0):
-    with GradientTape() as tape:
-      outs, loss, loss_metrics = _model_loss(model, inputs, targets,
-                                             sample_weights=sample_weights,
-                                             training=training)
-      if loss is None:
-        raise ValueError('The model cannot be run '
-                         'because it has no loss to optimize.')
-    if training:
-      if not model._collected_trainable_weights:
-        logging.warning('The list of trainable weights is empty. Make sure that'
-                        ' you are not setting model.trainable to False before '
-                        'compiling the model.')
-      else:
-        grads = tape.gradient(loss, model._collected_trainable_weights)
-        model.optimizer.apply_gradients(zip(grads,
-                                            model._collected_trainable_weights))
-    return outs, loss, loss_metrics
-
-
-def train_on_batch(model, inputs, targets, sample_weights=None):
-  """Calculates the loss and gradient updates for one input batch.
-
-  Arguments:
-      model: Model whose loss has to be calculated.
-      inputs: Input batch data.
-      targets: Target batch data.
-      sample_weights: Sample weight batch data.
-
-  Returns:
-      total loss and the loss associated with each output.
-  """
-  inputs = [
-      ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs]
-  targets = [
-      ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets]
-  sample_weights = [
-      ops.convert_to_tensor(val, dtype=backend.floatx())
-      if val is not None else None for val in sample_weights]
-  outs, loss, _ = _process_single_batch(
-      model, inputs, targets, sample_weights=sample_weights, training=True)
-  if not isinstance(outs, list):
-    outs = [outs]
-  metrics_results = _eager_metrics_fn(
-      model, outs, targets)
-  if not isinstance(loss, list):
-    loss = [loss]
-  return loss + metrics_results
-
-
-def test_on_batch(model, inputs, targets, sample_weights=None):
-  """Calculates the loss for one input batch.
-
-  Arguments:
-      model: Model whose loss has to be calculated.
-      inputs: Input batch data.
-      targets: Target batch data.
-      sample_weights: Sample weight batch data.
-
-  Returns:
-      total loss, loss and metrics associated with each output.
-  """
-  inputs = [
-      ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs]
-  targets = [
-      ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets]
-  sample_weights = [
-      ops.convert_to_tensor(val, dtype=backend.floatx())
-      if val is not None else None for val in sample_weights]
-  outs, loss, loss_metrics = _process_single_batch(
-      model, inputs, targets, sample_weights=sample_weights, training=False)
-  if not isinstance(outs, list):
-    outs = [outs]
-  metrics_results = _eager_metrics_fn(
-      model, outs, targets)
-  if not isinstance(loss, list):
-    loss = [loss]
-  return loss + loss_metrics + metrics_results
-
-
-def fit_loop(
-    model,
-    inputs,
-    targets,
-    sample_weights=None,
-    val_inputs=None,
-    val_targets=None,
-    val_sample_weights=None,
-    batch_size=None,
-    epochs=100,
-    verbose=1,
-    callbacks=None,
-    shuffle=True,
-    callback_metrics=None,
-    initial_epoch=0,
-    steps_per_epoch=None,
-    validation_steps=None):
-  """Abstract fit function for eager execution.
-
-  Arguments:
-      model: Instance of the model that is being executed in Eager mode.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      val_inputs: Input data for validation.
-      val_targets: Target data for validation.
-      val_sample_weights: Sample weight data for validation.
-      batch_size: Integer batch size or None if unknown.
-      epochs: Number of times to iterate over the data
-      verbose: Verbosity mode, 0, 1 or 2
-      callbacks: List of callbacks to be called during training
-      shuffle: Whether to shuffle the data at the beginning of each epoch
-      callback_metrics: List of strings, the display names of the metrics
-          passed to the callbacks. They should be the
-          concatenation of list the display names of the outputs of
-           `f` and the list of display names of the outputs of `f_val`.
-      initial_epoch: Epoch at which to start training
-          (useful for resuming a previous training run)
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch. Ignored with the default value of `None`.
-      validation_steps: Number of steps to run validation for (only if doing
-        validation from data tensors). Ignored with default value of `None`.
-
-  Returns:
-      `History` object.
-
-  Raises:
-    ValueError: In case of invalid argument values.
-  """
-  if not batch_size:
-    raise ValueError('With eager execution, `batch_size` should be specified.')
-  if steps_per_epoch or validation_steps:
-    raise ValueError('With eager execution, `steps_per_epoch` and '
-                     '`validation_steps` are not valid arguments '
-                     '(set `batch_size` instead).')
-  # Required for Eager mode
-  with backend.learning_phase_scope(1):
-    do_validation = False
-    if val_inputs:
-      do_validation = True
-      if (verbose and inputs and hasattr(inputs[0], 'shape') and
-          hasattr(val_inputs[0], 'shape')):
-        print('Train on %d samples, validate on %d samples' %
-              (inputs[0].shape[0], val_inputs[0].shape[0]))
-    if validation_steps:
-      if steps_per_epoch is None:
-        raise ValueError('Can only use `validation_steps` when doing step-wise '
-                         'training, i.e. `steps_per_epoch` must be set.')
-      do_validation = True
-
-    out_labels = model.metrics_names
-    if do_validation:
-      callback_metrics = copy.copy(out_labels) + [
-          'val_' + n for n in out_labels
-      ]
-    else:
-      callback_metrics = copy.copy(out_labels)
-
-    if sample_weights:
-      feed_data = inputs + targets + sample_weights
-    else:
-      feed_data = inputs + targets
-    num_train_samples = training_utils.check_num_samples(
-        feed_data,
-        batch_size=batch_size,
-        steps=steps_per_epoch,
-        steps_name='steps_per_epoch')
-
-    if num_train_samples is not None:
-      index_array = np.arange(num_train_samples)
-
-    model.history = cbks.History()
-    callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history]
-    if verbose:
-      if steps_per_epoch is not None:
-        count_mode = 'steps'
-      else:
-        count_mode = 'samples'
-      callbacks += [cbks.ProgbarLogger(count_mode)]
-    callbacks = cbks.CallbackList(callbacks)
-
-    # it's possible to callback a different model than self
-    # (used by Sequential models)
-    if hasattr(model, 'callback_model') and model.callback_model:
-      callback_model = model.callback_model
-    else:
-      callback_model = model
-
-    callbacks.set_model(callback_model)
-
-    callbacks.set_params({
-        'batch_size': batch_size,
-        'epochs': epochs,
-        'steps': steps_per_epoch,
-        'samples': num_train_samples,
-        'verbose': verbose,
-        'do_validation': do_validation,
-        'metrics': callback_metrics or [],
-    })
-    callbacks.on_train_begin()
-    callback_model.stop_training = False
-    for cbk in callbacks:
-      if not val_inputs:
-        cbk.validation_data = []
-      elif val_sample_weights:
-        cbk.validation_data = val_inputs + val_targets + val_sample_weights
-      else:
-        cbk.validation_data = val_inputs + val_targets
-
-    for epoch in range(initial_epoch, epochs):
-      callbacks.on_epoch_begin(epoch)
-      epoch_logs = {}
-      if shuffle == 'batch':
-        index_array = model._batch_shuffle(index_array, batch_size)
-      elif shuffle:
-        np.random.shuffle(index_array)
-
-      batches = generic_utils.make_batches(num_train_samples, batch_size)
-
-      for batch_index, (batch_start, batch_end) in enumerate(batches):
-        batch_ids = index_array[batch_start:batch_end]
-        try:
-          inputs_batch = slice_arrays(inputs, batch_ids,
-                                      contiguous=not shuffle)
-          targets_batch = slice_arrays(targets, batch_ids,
-                                       contiguous=not shuffle)
-          if sample_weights:
-            sample_weights_batch = slice_arrays(sample_weights, batch_ids,
-                                                contiguous=not shuffle)
-          else:
-            sample_weights_batch = None
-        except TypeError:
-          raise TypeError('TypeError while preparing batch. '
-                          'If using HDF5 input data, '
-                          'pass shuffle="batch".')
-        batch_logs = {}
-        batch_logs['batch'] = batch_index
-        batch_logs['size'] = len(batch_ids)
-
-        callbacks.on_batch_begin(batch_index, batch_logs)
-
-        inputs_batch = [
-            ops.convert_to_tensor(val, dtype=backend.floatx())
-            for val in inputs_batch]
-        targets_batch = [
-            ops.convert_to_tensor(val, dtype=backend.floatx())
-            for val in targets_batch]
-        if sample_weights:
-          sample_weights_batch = [
-              ops.convert_to_tensor(val, dtype=backend.floatx())
-              if val is not None else None
-              for val in sample_weights_batch]
-
-        outs, loss, loss_metrics = _process_single_batch(
-            model,
-            inputs_batch,
-            targets_batch,
-            sample_weights=sample_weights_batch,
-            training=True)
-
-        if not isinstance(outs, list):
-          outs = [outs]
-
-        for l, o in zip(out_labels, outs):
-          batch_logs[l] = o
-        # Required for Eager mode
-        metrics_results = _eager_metrics_fn(model, outs, targets_batch)
-        batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
-
-        for k, v in zip(model.metrics_names,
-                        [backend.mean(loss)] + loss_metrics + metrics_results):
-          batch_logs[k] = tensor_util.constant_value(v)
-        callbacks.on_batch_end(batch_index, batch_logs)
-        if callback_model.stop_training:
-          break
-
-        if batch_index == len(batches) - 1:  # Last batch.
-          if do_validation:
-            val_outs = test_loop(
-                model, val_inputs, val_targets,
-                sample_weights=val_sample_weights,
-                batch_size=batch_size,
-                verbose=0)
-            if not isinstance(val_outs, list):
-              val_outs = [val_outs]
-            # Same labels assumed.
-            for l, o in zip(out_labels, val_outs):
-              epoch_logs['val_' + l] = o
-      callbacks.on_epoch_end(epoch, epoch_logs)
-      if callback_model.stop_training:
-        break
-    callbacks.on_train_end()
-    return model.history
-
-
-def test_loop(model, inputs, targets,
-              sample_weights=None,
-              batch_size=None,
-              verbose=0,
-              steps=None):
-  """Abstract method to loop over some data in batches.
-
-  Arguments:
-      model: Model instance that is being evaluated in Eager mode.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      batch_size: integer batch size or `None`.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring predictions finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the scalar outputs.
-  """
-  with backend.learning_phase_scope(0):
-    feed_data = inputs + targets
-    if sample_weights:
-      feed_data += sample_weights
-    num_samples = training_utils.check_num_samples(
-        feed_data, batch_size=batch_size, steps=steps, steps_name='steps')
-    outs = []
-    if verbose == 1:
-      progbar = generic_utils.Progbar(target=num_samples)
-    batches = generic_utils.make_batches(num_samples, batch_size)
-    index_array = np.arange(num_samples)
-    for batch_index, (batch_start, batch_end) in enumerate(batches):
-      batch_ids = index_array[batch_start:batch_end]
-      inputs_batch = slice_arrays(inputs, batch_ids)
-      targets_batch = slice_arrays(targets, batch_ids)
-      if sample_weights:
-        sample_weights_batch = slice_arrays(sample_weights, batch_ids)
-      else:
-        sample_weights_batch = None
-
-      inputs_batch = [
-          ops.convert_to_tensor(val, dtype=backend.floatx())
-          for val in inputs_batch]
-      targets_batch = [
-          ops.convert_to_tensor(val, dtype=backend.floatx())
-          for val in targets_batch]
-      if sample_weights:
-        sample_weights_batch = [
-            ops.convert_to_tensor(val, dtype=backend.floatx())
-            if val is not None else None
-            for val in sample_weights_batch]
-
-      loss_outs, loss, loss_metrics = _model_loss(
-          model,
-          inputs_batch,
-          targets_batch,
-          sample_weights=sample_weights_batch,
-          training=False)
-      metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch)
-      batch_outs = []
-      for _, v in zip(model.metrics_names,
-                      [backend.mean(loss)] + loss_metrics + metrics_results):
-        batch_outs.append(tensor_util.constant_value(v))
-
-      if isinstance(batch_outs, list):
-        if batch_index == 0:
-          for batch_out in enumerate(batch_outs):
-            outs.append(0.)
-        for i, batch_out in enumerate(batch_outs):
-          outs[i] += batch_out * len(batch_ids)
-      else:
-        if batch_index == 0:
-          outs.append(0.)
-        outs[0] += batch_outs * len(batch_ids)
-
-      if verbose == 1:
-        progbar.update(batch_end)
-    for i in range(len(outs)):
-      outs[i] /= num_samples
-    if len(outs) == 1:
-      return outs[0]
-    return outs
-
-
-def predict_loop(model, inputs,
-                 batch_size=32,
-                 verbose=0,
-                 steps=None):
-  """Abstract method to loop over some data in batches.
-
-  Arguments:
-      model:
-      inputs: List of input arrays.
-      batch_size: integer batch size.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions
-      (if the model has multiple outputs).
-  """
-  with backend.learning_phase_scope(0):
-    num_samples = training_utils.check_num_samples(
-        inputs, batch_size, steps, 'steps')
-    if verbose == 1:
-      if steps is not None:
-        progbar = generic_utils.Progbar(target=steps)
-      else:
-        progbar = generic_utils.Progbar(target=num_samples)
-
-    outs = []
-    batches = generic_utils.make_batches(num_samples, batch_size)
-    index_array = np.arange(num_samples)
-    for batch_index, (batch_start, batch_end) in enumerate(batches):
-      batch_ids = index_array[batch_start:batch_end]
-      inputs_batch = slice_arrays(inputs, batch_ids)
-
-      inputs_batch = [
-          ops.convert_to_tensor(val, dtype=backend.floatx())
-          for val in inputs_batch]
-
-      if len(inputs_batch) == 1:
-        if model._expects_training_arg:
-          batch_outs = model.call(inputs_batch[0], training=False)
-        else:
-          batch_outs = model.call(inputs_batch[0])
-      else:
-        if model._expects_training_arg:
-          batch_outs = model.call(inputs_batch, training=False)
-        else:
-          batch_outs = model.call(inputs_batch)
-
-      if not isinstance(batch_outs, list):
-        batch_outs = [batch_outs]
-      if batch_index == 0:
-        # Pre-allocate the results arrays.
-        for batch_out in batch_outs:
-          dims = batch_out.shape[1:].dims
-          dims_list = [d.value for d in dims]
-          shape = (num_samples,) + tuple(dims_list)
-          outs.append(np.zeros(shape, dtype=batch_out.dtype.as_numpy_dtype))
-      for i, batch_out in enumerate(batch_outs):
-        outs[i][batch_start:batch_end] = batch_out
-      if verbose == 1:
-        progbar.update(batch_end)
-    if len(outs) == 1:
-      return outs[0]
-    return outs
diff --git a/tensorflow/python/keras/_impl/keras/layers/__init__.py b/tensorflow/python/keras/_impl/keras/layers/__init__.py
deleted file mode 100644
index d7bc859280eeedfb41d2c78d4042a181484f3d20..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/_impl/keras/layers/__init__.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras layers module.
-"""
-# pylint: disable=wildcard-import
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.engine import Input
-from tensorflow.python.keras._impl.keras.engine import InputLayer
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import *
-from tensorflow.python.keras._impl.keras.layers.convolutional import *
-from tensorflow.python.keras._impl.keras.layers.convolutional_recurrent import *
-from tensorflow.python.keras._impl.keras.layers.core import *
-from tensorflow.python.keras._impl.keras.layers.cudnn_recurrent import *
-from tensorflow.python.keras._impl.keras.layers.embeddings import *
-from tensorflow.python.keras._impl.keras.layers.local import *
-from tensorflow.python.keras._impl.keras.layers.merge import *
-from tensorflow.python.keras._impl.keras.layers.noise import *
-from tensorflow.python.keras._impl.keras.layers.normalization import *
-from tensorflow.python.keras._impl.keras.layers.pooling import *
-from tensorflow.python.keras._impl.keras.layers.recurrent import *
-from tensorflow.python.keras._impl.keras.layers.serialization import deserialize
-from tensorflow.python.keras._impl.keras.layers.serialization import serialize
-from tensorflow.python.keras._impl.keras.layers.wrappers import *
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/__init__.py b/tensorflow/python/keras/_impl/keras/preprocessing/__init__.py
deleted file mode 100644
index 2ca48cdbf9c066194f4f4ed448fd621167db7ba9..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/_impl/keras/preprocessing/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Data preprocessing module.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.preprocessing import image
-from tensorflow.python.keras._impl.keras.preprocessing import sequence
-from tensorflow.python.keras._impl.keras.preprocessing import text
-
diff --git a/tensorflow/python/keras/_impl/keras/utils/__init__.py b/tensorflow/python/keras/_impl/keras/utils/__init__.py
deleted file mode 100644
index 0c9f19a0c8dcf3bf929e102b31679a03b27728f7..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/_impl/keras/utils/__init__.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras utilities.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.keras._impl.keras.utils.data_utils import OrderedEnqueuer
-from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
-from tensorflow.python.keras._impl.keras.utils.generic_utils import custom_object_scope
-from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import get_custom_objects
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
-from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.io_utils import HDF5Matrix
-from tensorflow.python.keras._impl.keras.utils.layer_utils import convert_all_kernels_in_model
-from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary
-from tensorflow.python.keras._impl.keras.utils.multi_gpu_utils import multi_gpu_model
-from tensorflow.python.keras._impl.keras.utils.np_utils import normalize
-from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
-from tensorflow.python.keras._impl.keras.utils.vis_utils import plot_model
-
diff --git a/tensorflow/python/keras/_impl/keras/wrappers/__init__.py b/tensorflow/python/keras/_impl/keras/wrappers/__init__.py
deleted file mode 100644
index 20c95929e3d2e1f66e66efe43b9685c5d6ed1c10..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/_impl/keras/wrappers/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras API wrappers.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.wrappers import scikit_learn
-
diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/activations.py
similarity index 95%
rename from tensorflow/python/keras/_impl/keras/activations.py
rename to tensorflow/python/keras/activations.py
index 8def7ec49375c7ce23e8f2a24a4c3615d05ca9bb..a62dadb830cd1cce51fca6efd8cc071f22499cdd 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import six
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
diff --git a/tensorflow/python/keras/activations/__init__.py b/tensorflow/python/keras/activations/__init__.py
deleted file mode 100644
index d04838c218d6643a703723a1d163c88547c14da7..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/activations/__init__.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras built-in activation functions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# Activation functions.
-from tensorflow.python.keras._impl.keras.activations import elu
-from tensorflow.python.keras._impl.keras.activations import hard_sigmoid
-from tensorflow.python.keras._impl.keras.activations import linear
-from tensorflow.python.keras._impl.keras.activations import relu
-from tensorflow.python.keras._impl.keras.activations import selu
-from tensorflow.python.keras._impl.keras.activations import sigmoid
-from tensorflow.python.keras._impl.keras.activations import softmax
-from tensorflow.python.keras._impl.keras.activations import softplus
-from tensorflow.python.keras._impl.keras.activations import softsign
-from tensorflow.python.keras._impl.keras.activations import tanh
-
-# Auxiliary utils.
-# pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.activations import deserialize
-from tensorflow.python.keras._impl.keras.activations import serialize
-from tensorflow.python.keras._impl.keras.activations import get
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/activations_test.py b/tensorflow/python/keras/activations_test.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/activations_test.py
rename to tensorflow/python/keras/activations_test.py
index fb0bb5f1269d112e3f268ce211a2ddeb24b417bf..5cff1f8f9cb06569029150e44a4c2adfb370229d 100644
--- a/tensorflow/python/keras/_impl/keras/activations_test.py
+++ b/tensorflow/python/keras/activations_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/applications/__init__.py b/tensorflow/python/keras/applications/__init__.py
index fccedf919a7b261bb30f332172b1388db9da1939..062135266dd8b11c489b7dff83b46ae29a0d21e6 100644
--- a/tensorflow/python/keras/applications/__init__.py
+++ b/tensorflow/python/keras/applications/__init__.py
@@ -18,15 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras.applications import densenet
-from tensorflow.python.keras.applications import inception_resnet_v2
-from tensorflow.python.keras.applications import inception_v3
-from tensorflow.python.keras.applications import mobilenet
-from tensorflow.python.keras.applications import nasnet
-from tensorflow.python.keras.applications import resnet50
-from tensorflow.python.keras.applications import vgg16
-from tensorflow.python.keras.applications import vgg19
-from tensorflow.python.keras.applications import xception
 from tensorflow.python.keras.applications.densenet import DenseNet121
 from tensorflow.python.keras.applications.densenet import DenseNet169
 from tensorflow.python.keras.applications.densenet import DenseNet201
diff --git a/tensorflow/python/keras/_impl/keras/applications/densenet.py b/tensorflow/python/keras/applications/densenet.py
similarity index 90%
rename from tensorflow/python/keras/_impl/keras/applications/densenet.py
rename to tensorflow/python/keras/applications/densenet.py
index ca83e8691237216e799f2ca738dcb6822506e2cb..f81f10719a31e2e79589d3b389049353c992091c 100644
--- a/tensorflow/python/keras/_impl/keras/applications/densenet.py
+++ b/tensorflow/python/keras/applications/densenet.py
@@ -27,24 +27,24 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.applications import imagenet_utils
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
-from tensorflow.python.keras._impl.keras.layers import Activation
-from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import BatchNormalization
-from tensorflow.python.keras._impl.keras.layers import Concatenate
-from tensorflow.python.keras._impl.keras.layers import Conv2D
-from tensorflow.python.keras._impl.keras.layers import Dense
-from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import Input
-from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.applications import imagenet_utils
+from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras.engine.network import get_source_inputs
+from tensorflow.python.keras.layers import Activation
+from tensorflow.python.keras.layers import AveragePooling2D
+from tensorflow.python.keras.layers import BatchNormalization
+from tensorflow.python.keras.layers import Concatenate
+from tensorflow.python.keras.layers import Conv2D
+from tensorflow.python.keras.layers import Dense
+from tensorflow.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras.layers import Input
+from tensorflow.python.keras.layers import MaxPooling2D
+from tensorflow.python.keras.layers import ZeroPadding2D
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.util.tf_export import tf_export
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/densenet_test.py b/tensorflow/python/keras/applications/densenet_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/applications/densenet_test.py
rename to tensorflow/python/keras/applications/densenet_test.py
index 3b92287a1e77a944c069a6c234e11e4a79ad7d32..8b6aa281ad0e2d0798952b7489c89892709cda29 100644
--- a/tensorflow/python/keras/_impl/keras/applications/densenet_test.py
+++ b/tensorflow/python/keras/applications/densenet_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py b/tensorflow/python/keras/applications/imagenet_utils.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
rename to tensorflow/python/keras/applications/imagenet_utils.py
index d928a7afdc639485d443be382420cac09ba9abd6..0d8ccca1b5c2a6c05f0d933a8f0fe176ea62c2a3 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/applications/imagenet_utils.py
@@ -23,8 +23,8 @@ import json
 import numpy as np
 
 from tensorflow.python.framework import constant_op
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils_test.py b/tensorflow/python/keras/applications/imagenet_utils_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/applications/imagenet_utils_test.py
rename to tensorflow/python/keras/applications/imagenet_utils_test.py
index d843dace59f1c88744217fbaee605d2ac859ec55..349339309017f3e9e3a9922d95188f1954ed8634 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils_test.py
+++ b/tensorflow/python/keras/applications/imagenet_utils_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
+from tensorflow.python import keras
+from tensorflow.python.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.platform import test
 
 
@@ -197,4 +197,3 @@ class ImageNetUtilsTest(test.TestCase):
 
 if __name__ == '__main__':
   test.main()
-
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/applications/inception_resnet_v2.py
similarity index 91%
rename from tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
rename to tensorflow/python/keras/applications/inception_resnet_v2.py
index 17e407dd58460e6d6802a3e137a96faf38a6f576..fe1d0f2d4fb47f7ebab38f94afc8ace2f7b73cbc 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
+++ b/tensorflow/python/keras/applications/inception_resnet_v2.py
@@ -27,24 +27,24 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.applications import imagenet_utils
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
-from tensorflow.python.keras._impl.keras.layers import Activation
-from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import BatchNormalization
-from tensorflow.python.keras._impl.keras.layers import Concatenate
-from tensorflow.python.keras._impl.keras.layers import Conv2D
-from tensorflow.python.keras._impl.keras.layers import Dense
-from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import Input
-from tensorflow.python.keras._impl.keras.layers import Lambda
-from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.applications import imagenet_utils
+from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras.engine.network import get_source_inputs
+from tensorflow.python.keras.layers import Activation
+from tensorflow.python.keras.layers import AveragePooling2D
+from tensorflow.python.keras.layers import BatchNormalization
+from tensorflow.python.keras.layers import Concatenate
+from tensorflow.python.keras.layers import Conv2D
+from tensorflow.python.keras.layers import Dense
+from tensorflow.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras.layers import Input
+from tensorflow.python.keras.layers import Lambda
+from tensorflow.python.keras.layers import MaxPooling2D
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/applications/inception_resnet_v2/__init__.py b/tensorflow/python/keras/applications/inception_resnet_v2/__init__.py
deleted file mode 100644
index 223660e9bef33896bc83f43ed26c1792e48105b9..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/applications/inception_resnet_v2/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""InceptionResNetV2 Keras application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.applications.inception_resnet_v2 import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.inception_resnet_v2 import InceptionResNetV2
-from tensorflow.python.keras._impl.keras.applications.inception_resnet_v2 import preprocess_input
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2_test.py b/tensorflow/python/keras/applications/inception_resnet_v2_test.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2_test.py
rename to tensorflow/python/keras/applications/inception_resnet_v2_test.py
index de71e9615a09ecdf07a51fff0b3ee3b1d8ca50ca..0a12f885052ae9530e82190f7580c8288860c9a8 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2_test.py
+++ b/tensorflow/python/keras/applications/inception_resnet_v2_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py b/tensorflow/python/keras/applications/inception_v3.py
similarity index 92%
rename from tensorflow/python/keras/_impl/keras/applications/inception_v3.py
rename to tensorflow/python/keras/applications/inception_v3.py
index 2897c6058eb445ceacc34084b53dc89f556e3e9c..857ad49dae9ef234fe7d8251601ee122de39c947 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
+++ b/tensorflow/python/keras/applications/inception_v3.py
@@ -32,23 +32,23 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import layers
-from tensorflow.python.keras._impl.keras.applications import imagenet_utils
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
-from tensorflow.python.keras._impl.keras.layers import Activation
-from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import BatchNormalization
-from tensorflow.python.keras._impl.keras.layers import Conv2D
-from tensorflow.python.keras._impl.keras.layers import Dense
-from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import Input
-from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import layers
+from tensorflow.python.keras.applications import imagenet_utils
+from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras.engine.network import get_source_inputs
+from tensorflow.python.keras.layers import Activation
+from tensorflow.python.keras.layers import AveragePooling2D
+from tensorflow.python.keras.layers import BatchNormalization
+from tensorflow.python.keras.layers import Conv2D
+from tensorflow.python.keras.layers import Dense
+from tensorflow.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras.layers import Input
+from tensorflow.python.keras.layers import MaxPooling2D
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/applications/inception_v3/__init__.py b/tensorflow/python/keras/applications/inception_v3/__init__.py
deleted file mode 100644
index abf8393ae45d71dc0cb746706abb72f77b82d199..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/applications/inception_v3/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Inception V3 Keras application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.applications.inception_v3 import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.inception_v3 import InceptionV3
-from tensorflow.python.keras._impl.keras.applications.inception_v3 import preprocess_input
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_v3_test.py b/tensorflow/python/keras/applications/inception_v3_test.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/applications/inception_v3_test.py
rename to tensorflow/python/keras/applications/inception_v3_test.py
index 20e11fa019134423cc7c0499e7507680e13cb86d..a3fcdd55644af5a2211b58169d87ab4fba996b19 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_v3_test.py
+++ b/tensorflow/python/keras/applications/inception_v3_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/applications/mobilenet.py
similarity index 91%
rename from tensorflow/python/keras/_impl/keras/applications/mobilenet.py
rename to tensorflow/python/keras/applications/mobilenet.py
index 7b7288793de954d05d8405515037d4b197ed4df2..9d845be0d5b1ab06dd8a41bc04f75ae7b5f00789 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/applications/mobilenet.py
@@ -71,28 +71,28 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.applications import imagenet_utils
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
-from tensorflow.python.keras._impl.keras.layers import Activation
-from tensorflow.python.keras._impl.keras.layers import BatchNormalization
-from tensorflow.python.keras._impl.keras.layers import Conv2D
-from tensorflow.python.keras._impl.keras.layers import DepthwiseConv2D
-from tensorflow.python.keras._impl.keras.layers import Dropout
-from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import Input
-from tensorflow.python.keras._impl.keras.layers import Reshape
-from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.applications import imagenet_utils
+from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine.network import get_source_inputs
+from tensorflow.python.keras.layers import Activation
+from tensorflow.python.keras.layers import BatchNormalization
+from tensorflow.python.keras.layers import Conv2D
+from tensorflow.python.keras.layers import DepthwiseConv2D
+from tensorflow.python.keras.layers import Dropout
+from tensorflow.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras.layers import Input
+from tensorflow.python.keras.layers import Reshape
+from tensorflow.python.keras.layers import ZeroPadding2D
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.utils import conv_utils
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -240,10 +240,15 @@ def MobileNet(input_shape=None,
                        '`0.25`, `0.50`, `0.75` or `1.0` only.')
 
     if rows != cols or rows not in [128, 160, 192, 224]:
-      raise ValueError('If imagenet weights are being loaded, '
-                       'input must have a static square shape (one of '
-                       '(128,128), (160,160), (192,192), or (224, 224)).'
-                       ' Input shape provided = %s' % (input_shape,))
+      if rows is None:
+        rows = 224
+        logging.warning('MobileNet shape is undefined.'
+                        ' Weights for input shape (224, 224) will be loaded.')
+      else:
+        raise ValueError('If imagenet weights are being loaded, '
+                         'input must have a static square shape (one of '
+                         '(128, 128), (160, 160), (192, 192), or (224, 224)).'
+                         ' Input shape provided = %s' % (input_shape,))
 
   if K.image_data_format() != 'channels_last':
     logging.warning('The MobileNet family of models is only available '
diff --git a/tensorflow/python/keras/applications/mobilenet/__init__.py b/tensorflow/python/keras/applications/mobilenet/__init__.py
deleted file mode 100644
index b809e91193b459a46906443796344c092e1d2a6b..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/applications/mobilenet/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""MobileNet Keras application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.applications.mobilenet import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.mobilenet import MobileNet
-from tensorflow.python.keras._impl.keras.applications.mobilenet import preprocess_input
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet_test.py b/tensorflow/python/keras/applications/mobilenet_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/applications/mobilenet_test.py
rename to tensorflow/python/keras/applications/mobilenet_test.py
index 601d417e496b8230a2ad846eab204763ff5564b8..5661ed7856ad6e307cf3e388ea3db98c69db983f 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet_test.py
+++ b/tensorflow/python/keras/applications/mobilenet_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/nasnet.py b/tensorflow/python/keras/applications/nasnet.py
similarity index 93%
rename from tensorflow/python/keras/_impl/keras/applications/nasnet.py
rename to tensorflow/python/keras/applications/nasnet.py
index dd33230a7eb9272f8fc60daee63e1f92574cf5e3..b521bc673139403dcdecbba8e35b5bafec2d42bf 100644
--- a/tensorflow/python/keras/_impl/keras/applications/nasnet.py
+++ b/tensorflow/python/keras/applications/nasnet.py
@@ -45,27 +45,27 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.inception_v3 import preprocess_input
-from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
-from tensorflow.python.keras._impl.keras.layers import Activation
-from tensorflow.python.keras._impl.keras.layers import add
-from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import BatchNormalization
-from tensorflow.python.keras._impl.keras.layers import concatenate
-from tensorflow.python.keras._impl.keras.layers import Conv2D
-from tensorflow.python.keras._impl.keras.layers import Cropping2D
-from tensorflow.python.keras._impl.keras.layers import Dense
-from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import Input
-from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import SeparableConv2D
-from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras.applications.inception_v3 import preprocess_input
+from tensorflow.python.keras.engine.network import get_source_inputs
+from tensorflow.python.keras.layers import Activation
+from tensorflow.python.keras.layers import add
+from tensorflow.python.keras.layers import AveragePooling2D
+from tensorflow.python.keras.layers import BatchNormalization
+from tensorflow.python.keras.layers import concatenate
+from tensorflow.python.keras.layers import Conv2D
+from tensorflow.python.keras.layers import Cropping2D
+from tensorflow.python.keras.layers import Dense
+from tensorflow.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras.layers import Input
+from tensorflow.python.keras.layers import MaxPooling2D
+from tensorflow.python.keras.layers import SeparableConv2D
+from tensorflow.python.keras.layers import ZeroPadding2D
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -96,10 +96,9 @@ def NASNet(input_shape=None,
   at `~/.keras/keras.json`.
 
   Arguments:
-      input_shape: Optional shape tuple, only to be specified
-          if `include_top` is False (otherwise the input shape
-          has to be `(331, 331, 3)` for NASNetLarge or
-          `(224, 224, 3)` for NASNetMobile
+      input_shape: Optional shape tuple, the input shape
+          is by default `(331, 331, 3)` for NASNetLarge and
+          `(224, 224, 3)` for NASNetMobile.
           It should have exactly 3 inputs channels,
           and width and height should be no smaller than 32.
           E.g. `(224, 224, 3)` would be one valid value.
@@ -169,6 +168,14 @@ def NASNet(input_shape=None,
     raise ValueError('If using `weights` as ImageNet with `include_top` '
                      'as true, `classes` should be 1000')
 
+  if (isinstance(input_shape, tuple) and None in input_shape and
+      weights == 'imagenet'):
+    raise ValueError('When specifying the input shape of a NASNet'
+                     ' and loading `ImageNet` weights, '
+                     'the input_shape argument must be static '
+                     '(no None entries). Got: `input_shape=' +
+                     str(input_shape) + '`.')
+
   if default_size is None:
     default_size = 331
 
@@ -178,7 +185,7 @@ def NASNet(input_shape=None,
       default_size=default_size,
       min_size=32,
       data_format=K.image_data_format(),
-      require_flatten=include_top or weights,
+      require_flatten=False,
       weights=weights)
 
   if K.image_data_format() != 'channels_last':
diff --git a/tensorflow/python/keras/_impl/keras/applications/nasnet_test.py b/tensorflow/python/keras/applications/nasnet_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/applications/nasnet_test.py
rename to tensorflow/python/keras/applications/nasnet_test.py
index aa1dec670cb995e47bdcf88bd69594c532781b18..f96c3aa51c17ff3a123ad1a22ceff6c23f69d311 100644
--- a/tensorflow/python/keras/_impl/keras/applications/nasnet_test.py
+++ b/tensorflow/python/keras/applications/nasnet_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/applications/resnet50.py
similarity index 88%
rename from tensorflow/python/keras/_impl/keras/applications/resnet50.py
rename to tensorflow/python/keras/applications/resnet50.py
index c3a92bea8920cad3297fee3efc50158813e72361..508550f445e39dcf2a249bc91aaee289abfe3d1f 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/applications/resnet50.py
@@ -29,26 +29,26 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import layers
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
-from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
-from tensorflow.python.keras._impl.keras.layers import Activation
-from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import BatchNormalization
-from tensorflow.python.keras._impl.keras.layers import Conv2D
-from tensorflow.python.keras._impl.keras.layers import Dense
-from tensorflow.python.keras._impl.keras.layers import Flatten
-from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import Input
-from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.utils import layer_utils
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import layers
+from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras.applications.imagenet_utils import preprocess_input
+from tensorflow.python.keras.engine.network import get_source_inputs
+from tensorflow.python.keras.layers import Activation
+from tensorflow.python.keras.layers import AveragePooling2D
+from tensorflow.python.keras.layers import BatchNormalization
+from tensorflow.python.keras.layers import Conv2D
+from tensorflow.python.keras.layers import Dense
+from tensorflow.python.keras.layers import Flatten
+from tensorflow.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras.layers import Input
+from tensorflow.python.keras.layers import MaxPooling2D
+from tensorflow.python.keras.layers import ZeroPadding2D
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.utils import layer_utils
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/applications/resnet50/__init__.py b/tensorflow/python/keras/applications/resnet50/__init__.py
deleted file mode 100644
index 530805d150bfe32c5b81d7d7d3f92e203b83b602..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/applications/resnet50/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""ResNet50 Keras application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.applications.resnet50 import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.resnet50 import preprocess_input
-from tensorflow.python.keras._impl.keras.applications.resnet50 import ResNet50
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50_test.py b/tensorflow/python/keras/applications/resnet50_test.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/applications/resnet50_test.py
rename to tensorflow/python/keras/applications/resnet50_test.py
index 07f9ffd73f55ee39351af71223e7919b08ca66e1..22a3f055805f48bb27ad75db664b142d7916b654 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50_test.py
+++ b/tensorflow/python/keras/applications/resnet50_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg16.py b/tensorflow/python/keras/applications/vgg16.py
similarity index 83%
rename from tensorflow/python/keras/_impl/keras/applications/vgg16.py
rename to tensorflow/python/keras/applications/vgg16.py
index cefb25063e30505c9c34b49fd2df6eb7210d7ca8..659a6533e6772402663aee891ed90df792b12f09 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/applications/vgg16.py
@@ -28,21 +28,21 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
-from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
-from tensorflow.python.keras._impl.keras.layers import Conv2D
-from tensorflow.python.keras._impl.keras.layers import Dense
-from tensorflow.python.keras._impl.keras.layers import Flatten
-from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import Input
-from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.utils import layer_utils
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras.applications.imagenet_utils import preprocess_input
+from tensorflow.python.keras.engine.network import get_source_inputs
+from tensorflow.python.keras.layers import Conv2D
+from tensorflow.python.keras.layers import Dense
+from tensorflow.python.keras.layers import Flatten
+from tensorflow.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras.layers import Input
+from tensorflow.python.keras.layers import MaxPooling2D
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.utils import layer_utils
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -223,16 +223,6 @@ def VGG16(include_top=True,
           cache_subdir='models',
           file_hash='6d6bbae143d832006294945121d1f1fc')
     model.load_weights(weights_path)
-    if K.backend() == 'theano':
-      layer_utils.convert_all_kernels_in_model(model)
-
-    if K.image_data_format() == 'channels_first':
-      if include_top:
-        maxpool = model.get_layer(name='block5_pool')
-        shape = maxpool.output_shape[1:]
-        dense = model.get_layer(name='fc1')
-        layer_utils.convert_dense_weights_data_format(dense, shape,
-                                                      'channels_first')
 
   elif weights is not None:
     model.load_weights(weights)
diff --git a/tensorflow/python/keras/applications/vgg16/__init__.py b/tensorflow/python/keras/applications/vgg16/__init__.py
deleted file mode 100644
index 118361604bbc7e0a88ed34243c0d5ea98856a301..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/applications/vgg16/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""VGG16 Keras application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.applications.vgg16 import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.vgg16 import preprocess_input
-from tensorflow.python.keras._impl.keras.applications.vgg16 import VGG16
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg16_test.py b/tensorflow/python/keras/applications/vgg16_test.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/applications/vgg16_test.py
rename to tensorflow/python/keras/applications/vgg16_test.py
index e6eba83678def582c1a9fb477399790dbded8a15..cad65765f3d18c5a458c802a6b1aed688468d444 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg16_test.py
+++ b/tensorflow/python/keras/applications/vgg16_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg19.py b/tensorflow/python/keras/applications/vgg19.py
similarity index 84%
rename from tensorflow/python/keras/_impl/keras/applications/vgg19.py
rename to tensorflow/python/keras/applications/vgg19.py
index dadaf4fdf0cc5922752c6867720c5d8cdbcab19a..5e27ab8fb1fb99c65566cc4519798e3b8e0e1b0b 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/applications/vgg19.py
@@ -28,21 +28,21 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
-from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
-from tensorflow.python.keras._impl.keras.layers import Conv2D
-from tensorflow.python.keras._impl.keras.layers import Dense
-from tensorflow.python.keras._impl.keras.layers import Flatten
-from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import Input
-from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.utils import layer_utils
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras.applications.imagenet_utils import preprocess_input
+from tensorflow.python.keras.engine.network import get_source_inputs
+from tensorflow.python.keras.layers import Conv2D
+from tensorflow.python.keras.layers import Dense
+from tensorflow.python.keras.layers import Flatten
+from tensorflow.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras.layers import Input
+from tensorflow.python.keras.layers import MaxPooling2D
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.utils import layer_utils
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -232,16 +232,6 @@ def VGG19(include_top=True,
           cache_subdir='models',
           file_hash='253f8cb515780f3b799900260a226db6')
     model.load_weights(weights_path)
-    if K.backend() == 'theano':
-      layer_utils.convert_all_kernels_in_model(model)
-
-    if K.image_data_format() == 'channels_first':
-      if include_top:
-        maxpool = model.get_layer(name='block5_pool')
-        shape = maxpool.output_shape[1:]
-        dense = model.get_layer(name='fc1')
-        layer_utils.convert_dense_weights_data_format(dense, shape,
-                                                      'channels_first')
 
   elif weights is not None:
     model.load_weights(weights)
diff --git a/tensorflow/python/keras/applications/vgg19/__init__.py b/tensorflow/python/keras/applications/vgg19/__init__.py
deleted file mode 100644
index cda52628f3c10d65fdbe70b2f86cc12c771870a9..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/applications/vgg19/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""VGG19 Keras application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.applications.vgg19 import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.vgg19 import preprocess_input
-from tensorflow.python.keras._impl.keras.applications.vgg19 import VGG19
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg19_test.py b/tensorflow/python/keras/applications/vgg19_test.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/applications/vgg19_test.py
rename to tensorflow/python/keras/applications/vgg19_test.py
index 25100a2993f8a650b9ec441bf0c2c528f13364a4..61dccc0c5cc315cc0e5c0284cf829ac2034c69d2 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg19_test.py
+++ b/tensorflow/python/keras/applications/vgg19_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/xception.py b/tensorflow/python/keras/applications/xception.py
similarity index 90%
rename from tensorflow/python/keras/_impl/keras/applications/xception.py
rename to tensorflow/python/keras/applications/xception.py
index 971063a16d1f5ba0e25189f1ef2f6c24eb5f8d61..e1be8a3c46e6eafa43405f1472a2f0292b73aa0c 100644
--- a/tensorflow/python/keras/_impl/keras/applications/xception.py
+++ b/tensorflow/python/keras/applications/xception.py
@@ -39,23 +39,23 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import layers
-from tensorflow.python.keras._impl.keras.applications import imagenet_utils
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
-from tensorflow.python.keras._impl.keras.layers import Activation
-from tensorflow.python.keras._impl.keras.layers import BatchNormalization
-from tensorflow.python.keras._impl.keras.layers import Conv2D
-from tensorflow.python.keras._impl.keras.layers import Dense
-from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import Input
-from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
-from tensorflow.python.keras._impl.keras.layers import SeparableConv2D
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import layers
+from tensorflow.python.keras.applications import imagenet_utils
+from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras.engine.network import get_source_inputs
+from tensorflow.python.keras.layers import Activation
+from tensorflow.python.keras.layers import BatchNormalization
+from tensorflow.python.keras.layers import Conv2D
+from tensorflow.python.keras.layers import Dense
+from tensorflow.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras.layers import Input
+from tensorflow.python.keras.layers import MaxPooling2D
+from tensorflow.python.keras.layers import SeparableConv2D
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/applications/xception/__init__.py b/tensorflow/python/keras/applications/xception/__init__.py
deleted file mode 100644
index ae9cd9cd18c5ccc5ec37c8cd1bf36f8aabd9929c..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/applications/xception/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Xception Keras application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.applications.xception import decode_predictions
-from tensorflow.python.keras._impl.keras.applications.xception import preprocess_input
-from tensorflow.python.keras._impl.keras.applications.xception import Xception
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/applications/xception_test.py b/tensorflow/python/keras/applications/xception_test.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/applications/xception_test.py
rename to tensorflow/python/keras/applications/xception_test.py
index 7ebdc30010aa48362046b3c0c281fe1f2be64a84..7e2efd0017836ae671d88b561385b6e61be9fa0b 100644
--- a/tensorflow/python/keras/_impl/keras/applications/xception_test.py
+++ b/tensorflow/python/keras/applications/xception_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/backend.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/backend.py
rename to tensorflow/python/keras/backend.py
index b1f1270623ddccc2030bd5a37df4ce43158c387a..af3d1fa33d3431e7b13d1910a8581393e7b912c6 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -74,12 +74,6 @@ _SESSION = None
 # either train mode (learning_phase == 1) or test mode (learning_phase == 0).
 _GRAPH_LEARNING_PHASES = {}
 
-# This dictionary holds a mapping {graph: UID_DICT}.
-# each UID_DICT is a dictionary mapping name prefixes to a current index,
-# used for generating graph-specific string UIDs
-# for various names (e.g. layer names).
-_GRAPH_UID_DICTS = {}
-
 # This boolean flag can be set to True to leave variable initialization
 # up to the user.
 # Change its value via `manual_variable_initialization(value)`.
@@ -298,6 +292,8 @@ def get_uid(prefix=''):
 
 @tf_export('keras.backend.reset_uids')
 def reset_uids():
+  """Resets graph identifiers.
+  """
   per_graph_layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS
   keys = list(per_graph_layer_name_uids.keys())
   for key in keys:
@@ -1421,6 +1417,9 @@ def batch_dot(x, y, axes=None):
     axes = (axes, axes)
   x_ndim = ndim(x)
   y_ndim = ndim(y)
+  if axes is None:
+    # behaves like tf.batch_matmul as default
+    axes = [x_ndim - 1, y_ndim - 2]
   if x_ndim > y_ndim:
     diff = x_ndim - y_ndim
     y = array_ops.reshape(y,
@@ -2927,7 +2926,7 @@ def function(inputs, outputs, updates=None, **kwargs):
 
 @tf_export('keras.backend.gradients')
 def gradients(loss, variables):
-  """Returns the gradients of `variables` w.r.t. `loss`.
+  """Returns the gradients of `loss` w.r.t. `variables`.
 
   Arguments:
       loss: Scalar tensor to minimize.
@@ -3395,16 +3394,18 @@ def elu(x, alpha=1.):
 
 
 @tf_export('keras.backend.softmax')
-def softmax(x):
+def softmax(x, axis=-1):
   """Softmax of a tensor.
 
   Arguments:
       x: A tensor or variable.
+      axis: The dimension softmax would be performed on.
+          The default is -1 which indicates the last dimension.
 
   Returns:
       A tensor.
   """
-  return nn.softmax(x)
+  return nn.softmax(x, axis=axis)
 
 
 @tf_export('keras.backend.softplus')
@@ -4588,8 +4589,8 @@ def ctc_batch_cost(y_true, y_pred, input_length, label_length):
       Tensor with shape (samples,1) containing the
           CTC loss of each element.
   """
-  label_length = math_ops.to_int32(array_ops.squeeze(label_length))
-  input_length = math_ops.to_int32(array_ops.squeeze(input_length))
+  label_length = math_ops.to_int32(array_ops.squeeze(label_length, axis=-1))
+  input_length = math_ops.to_int32(array_ops.squeeze(input_length, axis=-1))
   sparse_labels = math_ops.to_int32(
       ctc_label_dense_to_sparse(y_true, label_length))
 
diff --git a/tensorflow/python/keras/backend/__init__.py b/tensorflow/python/keras/backend/__init__.py
deleted file mode 100644
index 10ef5a75852deb6595bced2703d7c5f29b0efac3..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/backend/__init__.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras backend API."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=redefined-builtin
-from tensorflow.python.keras._impl.keras.backend import abs
-from tensorflow.python.keras._impl.keras.backend import all
-from tensorflow.python.keras._impl.keras.backend import any
-from tensorflow.python.keras._impl.keras.backend import arange
-from tensorflow.python.keras._impl.keras.backend import argmax
-from tensorflow.python.keras._impl.keras.backend import argmin
-from tensorflow.python.keras._impl.keras.backend import backend
-from tensorflow.python.keras._impl.keras.backend import batch_dot
-from tensorflow.python.keras._impl.keras.backend import batch_flatten
-from tensorflow.python.keras._impl.keras.backend import batch_get_value
-from tensorflow.python.keras._impl.keras.backend import batch_normalization
-from tensorflow.python.keras._impl.keras.backend import batch_set_value
-from tensorflow.python.keras._impl.keras.backend import bias_add
-from tensorflow.python.keras._impl.keras.backend import binary_crossentropy
-from tensorflow.python.keras._impl.keras.backend import cast
-from tensorflow.python.keras._impl.keras.backend import cast_to_floatx
-from tensorflow.python.keras._impl.keras.backend import categorical_crossentropy
-from tensorflow.python.keras._impl.keras.backend import clear_session
-from tensorflow.python.keras._impl.keras.backend import clip
-from tensorflow.python.keras._impl.keras.backend import concatenate
-from tensorflow.python.keras._impl.keras.backend import constant
-from tensorflow.python.keras._impl.keras.backend import conv1d
-from tensorflow.python.keras._impl.keras.backend import conv2d
-from tensorflow.python.keras._impl.keras.backend import conv2d_transpose
-from tensorflow.python.keras._impl.keras.backend import conv3d
-from tensorflow.python.keras._impl.keras.backend import cos
-from tensorflow.python.keras._impl.keras.backend import count_params
-from tensorflow.python.keras._impl.keras.backend import ctc_batch_cost
-from tensorflow.python.keras._impl.keras.backend import ctc_decode
-from tensorflow.python.keras._impl.keras.backend import ctc_label_dense_to_sparse
-from tensorflow.python.keras._impl.keras.backend import dot
-from tensorflow.python.keras._impl.keras.backend import dropout
-from tensorflow.python.keras._impl.keras.backend import dtype
-from tensorflow.python.keras._impl.keras.backend import elu
-from tensorflow.python.keras._impl.keras.backend import epsilon
-from tensorflow.python.keras._impl.keras.backend import equal
-from tensorflow.python.keras._impl.keras.backend import eval
-from tensorflow.python.keras._impl.keras.backend import exp
-from tensorflow.python.keras._impl.keras.backend import expand_dims
-from tensorflow.python.keras._impl.keras.backend import eye
-from tensorflow.python.keras._impl.keras.backend import flatten
-from tensorflow.python.keras._impl.keras.backend import floatx
-from tensorflow.python.keras._impl.keras.backend import foldl
-from tensorflow.python.keras._impl.keras.backend import foldr
-from tensorflow.python.keras._impl.keras.backend import function
-from tensorflow.python.keras._impl.keras.backend import gather
-from tensorflow.python.keras._impl.keras.backend import get_session
-from tensorflow.python.keras._impl.keras.backend import get_uid
-from tensorflow.python.keras._impl.keras.backend import get_value
-from tensorflow.python.keras._impl.keras.backend import gradients
-from tensorflow.python.keras._impl.keras.backend import greater
-from tensorflow.python.keras._impl.keras.backend import greater_equal
-from tensorflow.python.keras._impl.keras.backend import hard_sigmoid
-from tensorflow.python.keras._impl.keras.backend import image_data_format
-from tensorflow.python.keras._impl.keras.backend import in_test_phase
-from tensorflow.python.keras._impl.keras.backend import in_top_k
-from tensorflow.python.keras._impl.keras.backend import in_train_phase
-from tensorflow.python.keras._impl.keras.backend import int_shape
-from tensorflow.python.keras._impl.keras.backend import is_sparse
-from tensorflow.python.keras._impl.keras.backend import l2_normalize
-from tensorflow.python.keras._impl.keras.backend import learning_phase
-from tensorflow.python.keras._impl.keras.backend import less
-from tensorflow.python.keras._impl.keras.backend import less_equal
-from tensorflow.python.keras._impl.keras.backend import log
-from tensorflow.python.keras._impl.keras.backend import manual_variable_initialization
-from tensorflow.python.keras._impl.keras.backend import map_fn
-from tensorflow.python.keras._impl.keras.backend import max
-from tensorflow.python.keras._impl.keras.backend import maximum
-from tensorflow.python.keras._impl.keras.backend import mean
-from tensorflow.python.keras._impl.keras.backend import min
-from tensorflow.python.keras._impl.keras.backend import minimum
-from tensorflow.python.keras._impl.keras.backend import moving_average_update
-from tensorflow.python.keras._impl.keras.backend import name_scope
-from tensorflow.python.keras._impl.keras.backend import ndim
-from tensorflow.python.keras._impl.keras.backend import normalize_batch_in_training
-from tensorflow.python.keras._impl.keras.backend import not_equal
-from tensorflow.python.keras._impl.keras.backend import one_hot
-from tensorflow.python.keras._impl.keras.backend import ones
-from tensorflow.python.keras._impl.keras.backend import ones_like
-from tensorflow.python.keras._impl.keras.backend import permute_dimensions
-from tensorflow.python.keras._impl.keras.backend import placeholder
-from tensorflow.python.keras._impl.keras.backend import pool2d
-from tensorflow.python.keras._impl.keras.backend import pool3d
-from tensorflow.python.keras._impl.keras.backend import pow
-from tensorflow.python.keras._impl.keras.backend import print_tensor
-from tensorflow.python.keras._impl.keras.backend import prod
-from tensorflow.python.keras._impl.keras.backend import random_binomial
-from tensorflow.python.keras._impl.keras.backend import random_normal
-from tensorflow.python.keras._impl.keras.backend import random_normal_variable
-from tensorflow.python.keras._impl.keras.backend import random_uniform
-from tensorflow.python.keras._impl.keras.backend import random_uniform_variable
-from tensorflow.python.keras._impl.keras.backend import relu
-from tensorflow.python.keras._impl.keras.backend import repeat
-from tensorflow.python.keras._impl.keras.backend import repeat_elements
-from tensorflow.python.keras._impl.keras.backend import reset_uids
-from tensorflow.python.keras._impl.keras.backend import reshape
-from tensorflow.python.keras._impl.keras.backend import resize_images
-from tensorflow.python.keras._impl.keras.backend import resize_volumes
-from tensorflow.python.keras._impl.keras.backend import reverse
-from tensorflow.python.keras._impl.keras.backend import rnn
-from tensorflow.python.keras._impl.keras.backend import round
-from tensorflow.python.keras._impl.keras.backend import separable_conv2d
-from tensorflow.python.keras._impl.keras.backend import set_epsilon
-from tensorflow.python.keras._impl.keras.backend import set_floatx
-from tensorflow.python.keras._impl.keras.backend import set_image_data_format
-from tensorflow.python.keras._impl.keras.backend import set_learning_phase
-from tensorflow.python.keras._impl.keras.backend import set_session
-from tensorflow.python.keras._impl.keras.backend import set_value
-from tensorflow.python.keras._impl.keras.backend import shape
-from tensorflow.python.keras._impl.keras.backend import sigmoid
-from tensorflow.python.keras._impl.keras.backend import sign
-from tensorflow.python.keras._impl.keras.backend import sin
-from tensorflow.python.keras._impl.keras.backend import softmax
-from tensorflow.python.keras._impl.keras.backend import softplus
-from tensorflow.python.keras._impl.keras.backend import softsign
-from tensorflow.python.keras._impl.keras.backend import sparse_categorical_crossentropy
-from tensorflow.python.keras._impl.keras.backend import spatial_2d_padding
-from tensorflow.python.keras._impl.keras.backend import spatial_3d_padding
-from tensorflow.python.keras._impl.keras.backend import sqrt
-from tensorflow.python.keras._impl.keras.backend import square
-from tensorflow.python.keras._impl.keras.backend import squeeze
-from tensorflow.python.keras._impl.keras.backend import stack
-from tensorflow.python.keras._impl.keras.backend import std
-from tensorflow.python.keras._impl.keras.backend import stop_gradient
-from tensorflow.python.keras._impl.keras.backend import sum
-from tensorflow.python.keras._impl.keras.backend import switch
-from tensorflow.python.keras._impl.keras.backend import tanh
-from tensorflow.python.keras._impl.keras.backend import temporal_padding
-from tensorflow.python.keras._impl.keras.backend import to_dense
-from tensorflow.python.keras._impl.keras.backend import transpose
-from tensorflow.python.keras._impl.keras.backend import truncated_normal
-from tensorflow.python.keras._impl.keras.backend import update
-from tensorflow.python.keras._impl.keras.backend import update_add
-from tensorflow.python.keras._impl.keras.backend import update_sub
-from tensorflow.python.keras._impl.keras.backend import var
-from tensorflow.python.keras._impl.keras.backend import variable
-from tensorflow.python.keras._impl.keras.backend import zeros
-from tensorflow.python.keras._impl.keras.backend import zeros_like
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/backend_test.py
rename to tensorflow/python/keras/backend_test.py
index de1ed467a2764a2b08269181bb8bcd615868d3b1..58df263a4f24278f8b61bd9e89f5d8af5e589c6d 100644
--- a/tensorflow/python/keras/_impl/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 import numpy as np
 import scipy.sparse
 
+from tensorflow.python import keras
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.keras._impl import keras
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_inspect
@@ -1122,6 +1122,30 @@ class TestCTC(test.TestCase):
           keras.backend.ctc_batch_cost(labels, inputs, input_lens, label_lens))
       self.assertAllClose(res[:, 0], loss_log_probs, atol=1e-05)
 
+      # test when batch_size = 1, that is, one sample only
+      ref = [3.34211]
+      input_lens = np.expand_dims(np.asarray([5]), 1)
+      label_lens = np.expand_dims(np.asarray([5]), 1)
+
+      labels = np.asarray([[0, 1, 2, 1, 0]])
+      inputs = np.asarray(
+          [[[0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], [
+              0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436
+          ], [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688],
+            [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533],
+            [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107]]
+          ],
+          dtype=np.float32)
+
+      k_labels = keras.backend.variable(labels, dtype='int32')
+      k_inputs = keras.backend.variable(inputs, dtype='float32')
+      k_input_lens = keras.backend.variable(input_lens, dtype='int32')
+      k_label_lens = keras.backend.variable(label_lens, dtype='int32')
+      res = keras.backend.eval(
+          keras.backend.ctc_batch_cost(k_labels, k_inputs, k_input_lens,
+                                       k_label_lens))
+      self.assertAllClose(res[:, 0], ref, atol=1e-05)
+
 
 class TestRandomOps(test.TestCase):
 
diff --git a/tensorflow/python/keras/_impl/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
similarity index 95%
rename from tensorflow/python/keras/_impl/keras/callbacks.py
rename to tensorflow/python/keras/callbacks.py
index deb1e8867dba3d52816ebda02bd9a3bf2ec7bc09..a6dbe2ba71ae1aab025ab61820c5309d166414f8 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -30,8 +30,8 @@ import time
 import numpy as np
 import six
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary as tf_summary
@@ -268,9 +268,6 @@ class TerminateOnNaN(Callback):
   """Callback that terminates training when a NaN loss is encountered.
   """
 
-  def __init__(self):
-    super(TerminateOnNaN, self).__init__()
-
   def on_batch_end(self, batch, logs=None):
     logs = logs or {}
     loss = logs.get('loss')
@@ -468,8 +465,8 @@ class ModelCheckpoint(Callback):
               self.model.save(filepath, overwrite=True)
           else:
             if self.verbose > 0:
-              print('\nEpoch %05d: %s did not improve' % (epoch + 1,
-                                                          self.monitor))
+              print('\nEpoch %05d: %s did not improve from %0.5f' %
+                    (epoch + 1, self.monitor, self.best))
       else:
         if self.verbose > 0:
           print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
@@ -571,38 +568,49 @@ class RemoteMonitor(Callback):
   Events are sent to `root + '/publish/epoch/end/'` by default. Calls are
   HTTP POST, with a `data` argument which is a
   JSON-encoded dictionary of event data.
+  If send_as_json is set to True, the content type of the request will be
+  application/json. Otherwise the serialized JSON will be sent within a form.
 
   Arguments:
       root: String; root url of the target server.
       path: String; path relative to `root` to which the events will be sent.
       field: String; JSON field under which the data will be stored.
+          The field is used only if the payload is sent within a form
+          (i.e. send_as_json is set to False).
       headers: Dictionary; optional custom HTTP headers.
+      send_as_json: Boolean; whether the request should be
+          sent as application/json.
   """
 
   def __init__(self,
                root='http://localhost:9000',
                path='/publish/epoch/end/',
                field='data',
-               headers=None):
+               headers=None,
+               send_as_json=False):
     super(RemoteMonitor, self).__init__()
 
     self.root = root
     self.path = path
     self.field = field
     self.headers = headers
+    self.send_as_json = send_as_json
 
   def on_epoch_end(self, epoch, logs=None):
     if requests is None:
-      raise ImportError('RemoteMonitor requires ' 'the `requests` library.')
+      raise ImportError('RemoteMonitor requires the `requests` library.')
     logs = logs or {}
     send = {}
     send['epoch'] = epoch
     for k, v in logs.items():
       send[k] = v
     try:
-      requests.post(
-          self.root + self.path, {self.field: json.dumps(send)},
-          headers=self.headers)
+      if self.send_as_json:
+        requests.post(self.root + self.path, json=send, headers=self.headers)
+      else:
+        requests.post(
+            self.root + self.path, {self.field: json.dumps(send)},
+            headers=self.headers)
     except requests.exceptions.RequestException:
       logging.warning('Warning: could not reach RemoteMonitor '
                       'root server at ' + str(self.root))
@@ -712,15 +720,6 @@ class TensorBoard(Callback):
         for weight in layer.weights:
           mapped_weight_name = weight.name.replace(':', '_')
           tf_summary.histogram(mapped_weight_name, weight)
-          if self.write_grads:
-            grads = model.optimizer.get_gradients(model.total_loss, weight)
-
-            def is_indexed_slices(grad):
-              return type(grad).__name__ == 'IndexedSlices'
-
-            grads = [grad.values if is_indexed_slices(grad) else grad
-                     for grad in grads]
-            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
           if self.write_images:
             w_img = array_ops.squeeze(weight)
             shape = K.int_shape(w_img)
@@ -747,6 +746,18 @@ class TensorBoard(Callback):
             assert len(shape) == 4 and shape[-1] in [1, 3, 4]
             tf_summary.image(mapped_weight_name, w_img)
 
+        if self.write_grads:
+          for weight in layer.trainable_weights:
+            mapped_weight_name = weight.name.replace(':', '_')
+            grads = model.optimizer.get_gradients(model.total_loss, weight)
+
+            def is_indexed_slices(grad):
+              return type(grad).__name__ == 'IndexedSlices'
+
+            grads = [grad.values if is_indexed_slices(grad) else grad
+                     for grad in grads]
+            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
+
         if hasattr(layer, 'output'):
           tf_summary.histogram('{}_out'.format(layer.name), layer.output)
     self.merged = tf_summary.merge_all()
@@ -846,7 +857,7 @@ class ReduceLROnPlateau(Callback):
           monitored has stopped increasing; in `auto`
           mode, the direction is automatically inferred
           from the name of the monitored quantity.
-      epsilon: threshold for measuring the new optimum,
+      min_delta: threshold for measuring the new optimum,
           to only focus on significant changes.
       cooldown: number of epochs to wait before resuming
           normal operation after lr has been reduced.
@@ -859,17 +870,22 @@ class ReduceLROnPlateau(Callback):
                patience=10,
                verbose=0,
                mode='auto',
-               epsilon=1e-4,
+               min_delta=1e-4,
                cooldown=0,
-               min_lr=0):
+               min_lr=0,
+               **kwargs):
     super(ReduceLROnPlateau, self).__init__()
 
     self.monitor = monitor
     if factor >= 1.0:
       raise ValueError('ReduceLROnPlateau ' 'does not support a factor >= 1.0.')
+    if 'epsilon' in kwargs:
+      min_delta = kwargs.pop('epsilon')
+      logging.warning('`epsilon` argument is deprecated and '
+                      'will be removed, use `min_delta` instead.')
     self.factor = factor
     self.min_lr = min_lr
-    self.epsilon = epsilon
+    self.min_delta = min_delta
     self.patience = patience
     self.verbose = verbose
     self.cooldown = cooldown
@@ -889,10 +905,10 @@ class ReduceLROnPlateau(Callback):
       self.mode = 'auto'
     if (self.mode == 'min' or
         (self.mode == 'auto' and 'acc' not in self.monitor)):
-      self.monitor_op = lambda a, b: np.less(a, b - self.epsilon)
+      self.monitor_op = lambda a, b: np.less(a, b - self.min_delta)
       self.best = np.Inf
     else:
-      self.monitor_op = lambda a, b: np.greater(a, b + self.epsilon)
+      self.monitor_op = lambda a, b: np.greater(a, b + self.min_delta)
       self.best = -np.Inf
     self.cooldown_counter = 0
     self.wait = 0
@@ -918,6 +934,7 @@ class ReduceLROnPlateau(Callback):
         self.best = current
         self.wait = 0
       elif not self.in_cooldown():
+        self.wait += 1
         if self.wait >= self.patience:
           old_lr = float(K.get_value(self.model.optimizer.lr))
           if old_lr > self.min_lr:
@@ -929,7 +946,6 @@ class ReduceLROnPlateau(Callback):
                     'rate to %s.' % (epoch + 1, new_lr))
             self.cooldown_counter = self.cooldown
             self.wait = 0
-        self.wait += 1
 
   def in_cooldown(self):
     return self.cooldown_counter > 0
diff --git a/tensorflow/python/keras/callbacks/__init__.py b/tensorflow/python/keras/callbacks/__init__.py
deleted file mode 100644
index 2d884790ddb9ccf49649c6af4cfd40cddbc38cb3..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/callbacks/__init__.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras callback classes."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.callbacks import BaseLogger
-from tensorflow.python.keras._impl.keras.callbacks import Callback
-from tensorflow.python.keras._impl.keras.callbacks import CSVLogger
-from tensorflow.python.keras._impl.keras.callbacks import EarlyStopping
-from tensorflow.python.keras._impl.keras.callbacks import History
-from tensorflow.python.keras._impl.keras.callbacks import LambdaCallback
-from tensorflow.python.keras._impl.keras.callbacks import LearningRateScheduler
-from tensorflow.python.keras._impl.keras.callbacks import ModelCheckpoint
-from tensorflow.python.keras._impl.keras.callbacks import ProgbarLogger
-from tensorflow.python.keras._impl.keras.callbacks import ReduceLROnPlateau
-from tensorflow.python.keras._impl.keras.callbacks import RemoteMonitor
-from tensorflow.python.keras._impl.keras.callbacks import TensorBoard
-from tensorflow.python.keras._impl.keras.callbacks import TerminateOnNaN
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
similarity index 86%
rename from tensorflow/python/keras/_impl/keras/callbacks_test.py
rename to tensorflow/python/keras/callbacks_test.py
index 79dfcd1bb669db09de0cbaa103914efaaf19c6fb..eb40fb4acc11d278fd456b95af0f24058b0df7c1 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -27,9 +27,10 @@ import unittest
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python import keras
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary.writer import writer_cache
 
 try:
@@ -354,7 +355,7 @@ class KerasCallbacksTest(test.TestCase):
           keras.callbacks.ReduceLROnPlateau(
               monitor='val_loss',
               factor=0.1,
-              epsilon=10,
+              min_delta=10,
               patience=1,
               cooldown=5)
       ]
@@ -371,6 +372,63 @@ class KerasCallbacksTest(test.TestCase):
           0.01,
           atol=1e-4)
 
+      model = make_model()
+      cbks = [
+          keras.callbacks.ReduceLROnPlateau(
+              monitor='val_loss',
+              factor=0.1,
+              min_delta=0,
+              patience=1,
+              cooldown=5)
+      ]
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=5,
+          verbose=2)
+      self.assertAllClose(
+          float(keras.backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4)
+
+  def test_ReduceLROnPlateau_patience(self):
+
+    class DummyOptimizer(object):
+
+      def __init__(self):
+        self.lr = keras.backend.variable(1.0)
+
+    class DummyModel(object):
+
+      def __init__(self):
+        self.optimizer = DummyOptimizer()
+
+    reduce_on_plateau = keras.callbacks.ReduceLROnPlateau(
+        monitor='val_loss', patience=2)
+    reduce_on_plateau.model = DummyModel()
+
+    losses = [0.0860, 0.1096, 0.1040]
+    lrs = []
+
+    for epoch in range(len(losses)):
+      reduce_on_plateau.on_epoch_end(epoch, logs={'val_loss': losses[epoch]})
+      lrs.append(keras.backend.get_value(reduce_on_plateau.model.optimizer.lr))
+
+    # The learning rates should be 1.0 except the last one
+    for lr in lrs[:-1]:
+      self.assertEqual(lr, 1.0)
+    self.assertLess(lrs[-1], 1.0)
+
+  def test_ReduceLROnPlateau_backwards_compatibility(self):
+    with test.mock.patch.object(logging, 'warning') as mock_log:
+      reduce_on_plateau = keras.callbacks.ReduceLROnPlateau(epsilon=1e-13)
+      self.assertRegexpMatches(
+          str(mock_log.call_args), '`epsilon` argument is deprecated')
+    self.assertFalse(hasattr(reduce_on_plateau, 'epsilon'))
+    self.assertTrue(hasattr(reduce_on_plateau, 'min_delta'))
+    self.assertEqual(reduce_on_plateau.min_delta, 1e-13)
+
   def test_CSVLogger(self):
     with self.test_session():
       np.random.seed(1337)
@@ -507,33 +565,39 @@ class KerasCallbacksTest(test.TestCase):
       assert 'nan' in values[-1], 'The last epoch was not logged.'
 
   def test_TerminateOnNaN(self):
-    np.random.seed(1337)
-    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-        train_samples=TRAIN_SAMPLES,
-        test_samples=TEST_SAMPLES,
-        input_shape=(INPUT_DIM,),
-        num_classes=NUM_CLASSES)
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
 
-    y_test = keras.utils.to_categorical(y_test)
-    y_train = keras.utils.to_categorical(y_train)
-    cbks = [keras.callbacks.TerminateOnNaN()]
-    model = keras.models.Sequential()
-    initializer = keras.initializers.Constant(value=1e5)
-    for _ in range(5):
-      model.add(keras.layers.Dense(2,
-                                   input_dim=INPUT_DIM,
-                                   activation='relu',
-                                   kernel_initializer=initializer))
-    model.add(keras.layers.Dense(NUM_CLASSES))
-    model.compile(loss='mean_squared_error',
-                  optimizer='rmsprop')
-
-    history = model.fit(x_train, y_train, batch_size=BATCH_SIZE,
-                        validation_data=(x_test, y_test),
-                        callbacks=cbks, epochs=20)
-    loss = history.history['loss']
-    assert len(loss) == 1
-    assert loss[0] == np.inf
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+      cbks = [keras.callbacks.TerminateOnNaN()]
+      model = keras.models.Sequential()
+      initializer = keras.initializers.Constant(value=1e5)
+      for _ in range(5):
+        model.add(
+            keras.layers.Dense(
+                2,
+                input_dim=INPUT_DIM,
+                activation='relu',
+                kernel_initializer=initializer))
+      model.add(keras.layers.Dense(NUM_CLASSES))
+      model.compile(loss='mean_squared_error', optimizer='rmsprop')
+
+      history = model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=20)
+      loss = history.history['loss']
+      assert len(loss) == 1
+      assert loss[0] == np.inf
 
   def test_TensorBoard(self):
     np.random.seed(1337)
@@ -571,6 +635,8 @@ class KerasCallbacksTest(test.TestCase):
       model.add(
           keras.layers.Dense(
               NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      # non_trainable_weights: moving_variance, moving_mean
+      model.add(keras.layers.BatchNormalization())
       model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
       model.compile(
           loss='categorical_crossentropy',
@@ -875,6 +941,37 @@ class KerasCallbacksTest(test.TestCase):
 
       assert os.path.exists(temp_dir)
 
+  def test_RemoteMonitorWithJsonPayload(self):
+    if requests is None:
+      self.skipTest('`requests` required to run this test')
+    with self.test_session():
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.np_utils.to_categorical(y_test)
+      y_train = keras.utils.np_utils.to_categorical(y_train)
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='rmsprop',
+          metrics=['accuracy'])
+      cbks = [keras.callbacks.RemoteMonitor(send_as_json=True)]
+
+      with test.mock.patch.object(requests, 'post'):
+        model.fit(
+            x_train,
+            y_train,
+            batch_size=BATCH_SIZE,
+            validation_data=(x_test, y_test),
+            callbacks=cbks,
+            epochs=1)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/constraints.py b/tensorflow/python/keras/constraints.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/constraints.py
rename to tensorflow/python/keras/constraints.py
index abe95d8e0ca68b2e62f9574fba9ae912a9179fff..bf3a3a728aafc8071d8ddb7e3acf4f7282ed4c16 100644
--- a/tensorflow/python/keras/_impl/keras/constraints.py
+++ b/tensorflow/python/keras/constraints.py
@@ -21,9 +21,9 @@ from __future__ import print_function
 
 import six
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/constraints/__init__.py b/tensorflow/python/keras/constraints/__init__.py
deleted file mode 100644
index 152606d8ebbcadf57d971d508e15283da65e4aa3..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/constraints/__init__.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras built-in constraints functions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# Constraints functions / callable classes.
-from tensorflow.python.keras._impl.keras.constraints import Constraint
-from tensorflow.python.keras._impl.keras.constraints import max_norm
-from tensorflow.python.keras._impl.keras.constraints import MaxNorm
-from tensorflow.python.keras._impl.keras.constraints import min_max_norm
-from tensorflow.python.keras._impl.keras.constraints import MinMaxNorm
-from tensorflow.python.keras._impl.keras.constraints import non_neg
-from tensorflow.python.keras._impl.keras.constraints import NonNeg
-from tensorflow.python.keras._impl.keras.constraints import unit_norm
-from tensorflow.python.keras._impl.keras.constraints import UnitNorm
-
-# Auxiliary utils.
-# pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.constraints import deserialize
-from tensorflow.python.keras._impl.keras.constraints import serialize
-from tensorflow.python.keras._impl.keras.constraints import get
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/constraints_test.py b/tensorflow/python/keras/constraints_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/constraints_test.py
rename to tensorflow/python/keras/constraints_test.py
index 87905693caa900a2cc565cef4bcea3fa30a4bc6c..84e2db10332c82f566a35d5ebba0c340e502fcd5 100644
--- a/tensorflow/python/keras/_impl/keras/constraints_test.py
+++ b/tensorflow/python/keras/constraints_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py b/tensorflow/python/keras/datasets/boston_housing.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
rename to tensorflow/python/keras/datasets/boston_housing.py
index 13fa9aed2b8da124af4e9f68c779e08d3094cb5d..8c043638c0d1167948d19b44bcc3272bcef4f830 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
+++ b/tensorflow/python/keras/datasets/boston_housing.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.util.tf_export import tf_export
 
 
diff --git a/tensorflow/python/keras/datasets/boston_housing/__init__.py b/tensorflow/python/keras/datasets/boston_housing/__init__.py
deleted file mode 100644
index b5371a03fd5f5755ba8844415276113c565f52db..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/datasets/boston_housing/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Boston housing price regression dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.datasets.boston_housing import load_data
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar.py b/tensorflow/python/keras/datasets/cifar.py
similarity index 100%
rename from tensorflow/python/keras/_impl/keras/datasets/cifar.py
rename to tensorflow/python/keras/datasets/cifar.py
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py b/tensorflow/python/keras/datasets/cifar10.py
similarity index 90%
rename from tensorflow/python/keras/_impl/keras/datasets/cifar10.py
rename to tensorflow/python/keras/datasets/cifar10.py
index 6b772433822474c06efcce1701226a4a67abe361..d627160875c007971c695891d1dab34b8bf1ba39 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
+++ b/tensorflow/python/keras/datasets/cifar10.py
@@ -22,9 +22,9 @@ import os
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.datasets.cifar import load_batch
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.datasets.cifar import load_batch
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.util.tf_export import tf_export
 
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py b/tensorflow/python/keras/datasets/cifar100.py
similarity index 90%
rename from tensorflow/python/keras/_impl/keras/datasets/cifar100.py
rename to tensorflow/python/keras/datasets/cifar100.py
index 28d74116a50979abab207dbec88e384210dfc070..e9a6d634a5308ab8c749e8861e0e4a33ac56d464 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
+++ b/tensorflow/python/keras/datasets/cifar100.py
@@ -22,9 +22,9 @@ import os
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.datasets.cifar import load_batch
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.datasets.cifar import load_batch
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.util.tf_export import tf_export
 
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py b/tensorflow/python/keras/datasets/fashion_mnist.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
rename to tensorflow/python/keras/datasets/fashion_mnist.py
index 508e95f719a02977960b80c283495ced642293c5..45e27aad34f6584721b11d306a3f24e78f2ed48b 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
+++ b/tensorflow/python/keras/datasets/fashion_mnist.py
@@ -23,7 +23,7 @@ import os
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.util.tf_export import tf_export
 
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/imdb.py b/tensorflow/python/keras/datasets/imdb.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/datasets/imdb.py
rename to tensorflow/python/keras/datasets/imdb.py
index 7467bb24646227705972262381aa5cf1de809f1c..411b3e8635f5b95b19375fb5f4686b20c643190f 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/imdb.py
+++ b/tensorflow/python/keras/datasets/imdb.py
@@ -22,8 +22,8 @@ import json
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras.preprocessing.sequence import _remove_long_seq
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras.preprocessing.sequence import _remove_long_seq
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/datasets/imdb/__init__.py b/tensorflow/python/keras/datasets/imdb/__init__.py
deleted file mode 100644
index 1c6396d2d32b88eaa900a5af4e62c7484fceab63..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/datasets/imdb/__init__.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""IMDB movie review sentiment classification dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.datasets.imdb import get_word_index
-from tensorflow.python.keras._impl.keras.datasets.imdb import load_data
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/datasets/mnist.py b/tensorflow/python/keras/datasets/mnist.py
similarity index 95%
rename from tensorflow/python/keras/_impl/keras/datasets/mnist.py
rename to tensorflow/python/keras/datasets/mnist.py
index e30691373e9aafad61b101476e21d6860527ce98..631189731a91bf352ed347b1887f80f17860c807 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/mnist.py
+++ b/tensorflow/python/keras/datasets/mnist.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.util.tf_export import tf_export
 
 
diff --git a/tensorflow/python/keras/datasets/mnist/__init__.py b/tensorflow/python/keras/datasets/mnist/__init__.py
deleted file mode 100644
index 364255f3387b59a419c010db9b93cdfbcba36186..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/datasets/mnist/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""MNIST handwritten digits classification dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.datasets.mnist import load_data
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/datasets/reuters.py b/tensorflow/python/keras/datasets/reuters.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/datasets/reuters.py
rename to tensorflow/python/keras/datasets/reuters.py
index b711696b5eecf9ba07a66cef25c1811c182b3b60..b070ba8d125614e4c555915e4acdbf8a8803863d 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/reuters.py
+++ b/tensorflow/python/keras/datasets/reuters.py
@@ -22,8 +22,8 @@ import json
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras.preprocessing.sequence import _remove_long_seq
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.keras.preprocessing.sequence import _remove_long_seq
+from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/datasets/reuters/__init__.py b/tensorflow/python/keras/datasets/reuters/__init__.py
deleted file mode 100644
index bb6791a344ad0c372ac60cd4a332f5632841dd46..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/datasets/reuters/__init__.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Reuters newswire topic classification dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.datasets.reuters import get_word_index
-from tensorflow.python.keras._impl.keras.datasets.reuters import load_data
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/engine/__init__.py b/tensorflow/python/keras/engine/__init__.py
similarity index 62%
rename from tensorflow/python/keras/_impl/keras/engine/__init__.py
rename to tensorflow/python/keras/engine/__init__.py
index 1bc533ab8f7ba37948d82bc69fe1c9bfe00d6834..ec7c0831992b2691c442bbd30445dbff8dba662f 100644
--- a/tensorflow/python/keras/_impl/keras/engine/__init__.py
+++ b/tensorflow/python/keras/engine/__init__.py
@@ -18,10 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.engine.base_layer import InputSpec
-from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
-from tensorflow.python.keras._impl.keras.engine.input_layer import Input
-from tensorflow.python.keras._impl.keras.engine.input_layer import InputLayer
-from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
-from tensorflow.python.keras._impl.keras.engine.network import Network
-from tensorflow.python.keras._impl.keras.engine.training import Model
+from tensorflow.python.keras.engine.base_layer import InputSpec
+from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_layer import Input
+from tensorflow.python.keras.engine.input_layer import InputLayer
+from tensorflow.python.keras.engine.network import get_source_inputs
+from tensorflow.python.keras.engine.network import Network
+from tensorflow.python.keras.engine.training import Model
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/python/keras/_impl/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/engine/base_layer.py
rename to tensorflow/python/keras/engine/base_layer.py
index 3af4eaabe90217ba08c4855bb3c88be3b0c963eb..24716cfbe4978c6fd5a7884a581524f804817d60 100644
--- a/tensorflow/python/keras/_impl/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -25,25 +25,25 @@ import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
-from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras._impl.keras import backend
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils import tf_utils
 # A module that only depends on `keras.layers` import these from here.
-from tensorflow.python.keras._impl.keras.utils.generic_utils import to_snake_case  # pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras.utils.tf_utils import is_tensor_or_tensor_list  # pylint: disable=unused-import
+from tensorflow.python.keras.utils.generic_utils import to_snake_case  # pylint: disable=unused-import
+from tensorflow.python.keras.utils.tf_utils import is_tensor_or_tensor_list  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.training import checkpointable
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
@@ -146,7 +146,7 @@ class Layer(checkpointable.CheckpointableBase):
     # return tensors. When using graph execution, _losses is a list of ops.
     self._losses = []
     self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
-    self._call_fn_args = estimator_util.fn_args(self.call)
+    self._call_fn_args = function_utils.fn_args(self.call)
     self._compute_previous_mask = ('mask' in self._call_fn_args or
                                    hasattr(self, 'compute_mask'))
     self._uses_inputs_arg = True
@@ -436,7 +436,7 @@ class Layer(checkpointable.CheckpointableBase):
   def _name_scope(self):
     return self.name
 
-  def build(self, _):
+  def build(self, input_shape):
     """Creates the variables of the layer."""
     self.built = True
 
@@ -484,8 +484,7 @@ class Layer(checkpointable.CheckpointableBase):
     """
     if dtype is None:
       dtype = self.dtype or backend.floatx()
-    else:
-      dtype = dtypes.as_dtype(dtype)
+    dtype = dtypes.as_dtype(dtype)
     initializer = initializers.get(initializer)
     regularizer = regularizers.get(regularizer)
     constraint = constraints.get(constraint)
@@ -513,7 +512,7 @@ class Layer(checkpointable.CheckpointableBase):
         # Manage errors in Layer rather than Checkpointable.
         overwrite=True,
         initializer=initializer,
-        dtype=dtypes.as_dtype(dtype),
+        dtype=dtype,
         constraint=constraint,
         trainable=trainable and self.trainable,
         partitioner=partitioner,
@@ -644,7 +643,7 @@ class Layer(checkpointable.CheckpointableBase):
         self._compute_previous_mask):
       previous_mask = collect_previous_mask(inputs)
       if not hasattr(self, '_call_fn_args'):
-        self._call_fn_args = estimator_util.fn_args(self.call)
+        self._call_fn_args = function_utils.fn_args(self.call)
       if ('mask' in self._call_fn_args and 'mask' not in kwargs and
           not generic_utils.is_all_none(previous_mask)):
         # The previous layer generated a mask, and mask was not explicitly pass
@@ -1658,7 +1657,7 @@ class DeferredTensor(object):
   """Tensor-like object used to build graphs of layers in Eager mode.
 
   When calling a layer on a DeferredTensor, the layer will not perform any
-  computation and will simply perfom shape inference to return new
+  computation and will simply perform shape inference to return new
   DeferredTensors with appropriate shape information. Thus DeferredTensor
   behaves like a graph-mode Tensor when manipulated by layers.
   """
diff --git a/tensorflow/python/keras/_impl/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/engine/input_layer.py
rename to tensorflow/python/keras/engine/input_layer.py
index bd9dcbe3c576851123dfcabe3e36379019627ac5..b04dc3c60be2a9e13cb1bc56ef12f0de36ed105c 100644
--- a/tensorflow/python/keras/_impl/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/engine/input_layer.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.engine import base_layer
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/engine/network.py
rename to tensorflow/python/keras/engine/network.py
index 3197d49fcee4da5d166b4dd56ce033df854d8268..552e7b7d735137af7edaa2f475a3af00bb4ba5d3 100644
--- a/tensorflow/python/keras/_impl/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -32,16 +32,16 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend
-from tensorflow.python.keras._impl.keras.engine import base_layer
-from tensorflow.python.keras._impl.keras.engine import saving
-from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.keras._impl.keras.utils import tf_utils
-from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
-from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary
+from tensorflow.python.keras import backend
+from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import saving
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
+from tensorflow.python.keras.utils.layer_utils import print_summary as print_layer_summary
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import checkpointable
-from tensorflow.python.training import checkpointable_utils
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
@@ -146,14 +146,14 @@ class Network(base_layer.Layer):
           raise TypeError('When eager execution is enabled, '
                           'inputs must come from a call to '
                           '`tf.keras.Input` (called after '
-                          'tfe.enable_eager_execution()). '
+                          'tf.enable_eager_execution()). '
                           'Received invalid input: ' + str(tensor))
       for tensor in self.outputs:
         if not isinstance(tensor, base_layer.DeferredTensor):  # pylint: disable=protected-access
           raise TypeError('When eager execution is enabled, '
                           'outputs must come from a call to '
                           'a layer (called after '
-                          'tfe.enable_eager_execution()). '
+                          'tf.enable_eager_execution()). '
                           'Received invalid output: ' + str(tensor))
     # Check for redundancy in inputs.
     if len(set(self.inputs)) != len(self.inputs):
@@ -318,6 +318,9 @@ class Network(base_layer.Layer):
           layer, name='layer-%d' % layer_index, overwrite=True)
 
   def __setattr__(self, name, value):
+    no_dependency = isinstance(value, checkpointable.NoDependency)
+    if no_dependency:
+      value = value.value
     if isinstance(value, (base_layer.Layer, Network)):
       try:
         is_graph_network = self._is_graph_network
@@ -332,7 +335,8 @@ class Network(base_layer.Layer):
             # In subclassed models, legacy layers (tf.layers) must always use
             # resource variables.
             value._use_resource_variables = True
-    if isinstance(value, checkpointable.CheckpointableBase):
+    if (not no_dependency
+        and isinstance(value, checkpointable.CheckpointableBase)):
       # Layer (and therefore Network/Model) inherit from CheckpointableBase
       # rather than Checkpointable, which means there is no Checkpointable
       # __setattr__ override (it would be a performance issue for functional
@@ -835,10 +839,14 @@ class Network(base_layer.Layer):
               output_tensors = nest.flatten(
                   layer.call(computed_tensor, **kwargs))
               if hasattr(layer, 'compute_mask'):
-                output_masks = nest.flatten(
-                    layer.compute_mask(computed_tensor, computed_mask))
+                output_masks = layer.compute_mask(computed_tensor,
+                                                  computed_mask)
+                if output_masks is None:
+                  output_masks = [None for _ in output_tensors]
+                else:
+                  output_masks = nest.flatten(output_masks)
               else:
-                output_masks = [None for _ in range(len(output_tensors))]
+                output_masks = [None for _ in output_tensors]
               computed_tensors = [computed_tensor]
               computed_masks = [computed_mask]
             else:
@@ -851,11 +859,16 @@ class Network(base_layer.Layer):
 
               output_tensors = nest.flatten(
                   layer.call(computed_tensors, **kwargs))
+
               if hasattr(layer, 'compute_mask'):
-                output_masks = nest.flatten(
-                    layer.compute_mask(computed_tensors, computed_masks))
+                output_masks = layer.compute_mask(computed_tensors,
+                                                  computed_masks)
+                if output_masks is None:
+                  output_masks = [None for _ in output_tensors]
+                else:
+                  output_masks = nest.flatten(output_masks)
               else:
-                output_masks = [None for _ in range(len(output_tensors))]
+                output_masks = [None for _ in output_tensors]
 
             if not context.executing_eagerly():
               if layer.activity_regularizer is not None:
@@ -1069,7 +1082,7 @@ class Network(base_layer.Layer):
       layer_name = layer_data['name']
 
       # Instantiate layer.
-      from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+      from tensorflow.python.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
 
       layer = deserialize_layer(layer_data, custom_objects=custom_objects)
       created_layers[layer_name] = layer
@@ -1153,7 +1166,7 @@ class Network(base_layer.Layer):
     if not self._is_graph_network:
       raise NotImplementedError
 
-    from tensorflow.python.keras._impl.keras.models import save_model  # pylint: disable=g-import-not-at-top
+    from tensorflow.python.keras.models import save_model  # pylint: disable=g-import-not-at-top
     save_model(self, filepath, overwrite, include_optimizer)
 
   def save_weights(self, filepath, overwrite=True, save_format=None):
@@ -1198,7 +1211,7 @@ class Network(base_layer.Layer):
             format.
         ValueError: For invalid/unknown format arguments.
     """
-    filepath_is_h5 = filepath.endswith('.h5') or filepath.endswith('.keras')
+    filepath_is_h5 = _is_hdf5_filepath(filepath)
     if save_format is None:
       if filepath_is_h5:
         save_format = 'h5'
@@ -1280,12 +1293,15 @@ class Network(base_layer.Layer):
         ImportError: If h5py is not available and the weight file is in HDF5
             format.
     """
-    try:
-      pywrap_tensorflow.NewCheckpointReader(filepath)
-      save_format = 'tf'
-    except errors_impl.DataLossError:
-      # The checkpoint is not readable in TensorFlow format. Try HDF5.
+    if _is_hdf5_filepath(filepath):
       save_format = 'h5'
+    else:
+      try:
+        pywrap_tensorflow.NewCheckpointReader(filepath)
+        save_format = 'tf'
+      except errors_impl.DataLossError:
+        # The checkpoint is not readable in TensorFlow format. Try HDF5.
+        save_format = 'h5'
     if save_format == 'tf':
       status = self._checkpointable_saver.restore(filepath)
       if by_name:
@@ -1332,7 +1348,7 @@ class Network(base_layer.Layer):
     Returns:
         Model config with Keras version information added.
     """
-    from tensorflow.python.keras._impl.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+    from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
 
     config = self.get_config()
     model_config = {
@@ -1456,6 +1472,10 @@ def get_source_inputs(tensor, layer=None, node_index=None):
       return source_tensors
 
 
+def _is_hdf5_filepath(filepath):
+  return filepath.endswith('.h5') or filepath.endswith('.keras')
+
+
 def _make_node_key(layer_name, node_index):
   return layer_name + '_ib-' + str(node_index)
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/saving.py b/tensorflow/python/keras/engine/saving.py
similarity index 86%
rename from tensorflow/python/keras/_impl/keras/engine/saving.py
rename to tensorflow/python/keras/engine/saving.py
index a0b709a1a58436b2d8cb33398c013f26f023dd70..99ce64a469db97f27372f4821fdc5c4f7565c763 100644
--- a/tensorflow/python/keras/_impl/keras/engine/saving.py
+++ b/tensorflow/python/keras/engine/saving.py
@@ -25,11 +25,12 @@ import os
 import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import optimizers
-from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.utils import conv_utils
+from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import serialization
 from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=g-import-not-at-top
@@ -61,7 +62,9 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
 
   Arguments:
       model: Keras model instance to be saved.
-      filepath: String, path where to save the model.
+      filepath: One of the following:
+          - String, path where to save the model
+          - `h5py.File` object where to save the model
       overwrite: Whether we should overwrite any existing
           model at the target location, or instead
           ask the user with a manual prompt.
@@ -74,49 +77,22 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
   if h5py is None:
     raise ImportError('`save_model` requires h5py.')
 
-  def get_json_type(obj):
-    """Serializes any object to a JSON-serializable structure.
+  from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
 
-    Arguments:
-        obj: the object to serialize
-
-    Returns:
-        JSON-serializable structure representing `obj`.
-
-    Raises:
-        TypeError: if `obj` cannot be serialized.
-    """
-    # if obj is a serializable Keras class instance
-    # e.g. optimizer, layer
-    if hasattr(obj, 'get_config'):
-      return {'class_name': obj.__class__.__name__, 'config': obj.get_config()}
-
-    # if obj is any numpy type
-    if type(obj).__module__ == np.__name__:
-      if isinstance(obj, np.ndarray):
-        return {'type': type(obj), 'value': obj.tolist()}
-      else:
-        return obj.item()
-
-    # misc functions (e.g. loss function)
-    if callable(obj):
-      return obj.__name__
-
-    # if obj is a python 'type'
-    if type(obj).__name__ == type.__name__:
-      return obj.__name__
+  if not isinstance(filepath, h5py.File):
+    # If file exists and should not be overwritten.
+    if not overwrite and os.path.isfile(filepath):
+      proceed = ask_to_proceed_with_overwrite(filepath)
+      if not proceed:
+        return
 
-    raise TypeError('Not JSON Serializable:', obj)
-
-  from tensorflow.python.keras._impl.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
-
-  # If file exists and should not be overwritten.
-  if not overwrite and os.path.isfile(filepath):
-    proceed = ask_to_proceed_with_overwrite(filepath)
-    if not proceed:
-      return
+    f = h5py.File(filepath, mode='w')
+    opened_new_file = True
+  else:
+    f = filepath
+    opened_new_file = False
 
-  with h5py.File(filepath, mode='w') as f:
+  try:
     f.attrs['keras_version'] = str(keras_version).encode('utf8')
     f.attrs['backend'] = K.backend().encode('utf8')
     f.attrs['model_config'] = json.dumps(
@@ -124,7 +100,7 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
             'class_name': model.__class__.__name__,
             'config': model.get_config()
         },
-        default=get_json_type).encode('utf8')
+        default=serialization.get_json_type).encode('utf8')
 
     model_weights_group = f.create_group('model_weights')
     model_layers = model.layers
@@ -154,7 +130,7 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
                 'sample_weight_mode': model.sample_weight_mode,
                 'loss_weights': model.loss_weights,
             },
-            default=get_json_type).encode('utf8')
+            default=serialization.get_json_type).encode('utf8')
 
         # Save optimizer weights.
         symbolic_weights = getattr(model.optimizer, 'weights')
@@ -175,6 +151,9 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
             else:
               param_dset[:] = val
     f.flush()
+  finally:
+    if opened_new_file:
+      f.close()
 
 
 @tf_export('keras.models.load_model')
@@ -182,7 +161,9 @@ def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=
   """Loads a model saved via `save_model`.
 
   Arguments:
-      filepath: String, path to the saved model.
+      filepath: One of the following:
+          - String, path to the saved model
+          - `h5py.File` object from which to load the model
       custom_objects: Optional dictionary mapping names
           (strings) to custom classes or functions to be
           considered during deserialization.
@@ -232,7 +213,14 @@ def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=
       return custom_objects[obj]
     return obj
 
-  with h5py.File(filepath, mode='r') as f:
+  opened_new_file = not isinstance(filepath, h5py.File)
+  if opened_new_file:
+    f = h5py.File(filepath, mode='r')
+  else:
+    f = filepath
+
+  model = None
+  try:
     # instantiate model
     model_config = f.attrs.get('model_config')
     if model_config is None:
@@ -243,54 +231,54 @@ def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=
     # set weights
     load_weights_from_hdf5_group(f['model_weights'], model.layers)
 
-    # Early return if compilation is not required.
-    if not compile:
-      return model
-
-    # instantiate optimizer
-    training_config = f.attrs.get('training_config')
-    if training_config is None:
-      logging.warning('No training configuration found in save file: '
-                      'the model was *not* compiled. Compile it manually.')
-      return model
-    training_config = json.loads(training_config.decode('utf-8'))
-    optimizer_config = training_config['optimizer_config']
-    optimizer = optimizers.deserialize(
-        optimizer_config, custom_objects=custom_objects)
-
-    # Recover loss functions and metrics.
-    loss = convert_custom_objects(training_config['loss'])
-    metrics = convert_custom_objects(training_config['metrics'])
-    sample_weight_mode = training_config['sample_weight_mode']
-    loss_weights = training_config['loss_weights']
-
-    # Compile model.
-    model.compile(
-        optimizer=optimizer,
-        loss=loss,
-        metrics=metrics,
-        loss_weights=loss_weights,
-        sample_weight_mode=sample_weight_mode)
-
-    # Set optimizer weights.
-    if 'optimizer_weights' in f:
-      # Build train function (to get weight updates).
-      model._make_train_function()
-      optimizer_weights_group = f['optimizer_weights']
-      optimizer_weight_names = [
-          n.decode('utf8')
-          for n in optimizer_weights_group.attrs['weight_names']
-      ]
-      optimizer_weight_values = [
-          optimizer_weights_group[n] for n in optimizer_weight_names
-      ]
-      try:
-        model.optimizer.set_weights(optimizer_weight_values)
-      except ValueError:
-        logging.warning('Error in loading the saved optimizer '
-                        'state. As a result, your model is '
-                        'starting with a freshly initialized '
-                        'optimizer.')
+    if compile:
+      # instantiate optimizer
+      training_config = f.attrs.get('training_config')
+      if training_config is None:
+        logging.warning('No training configuration found in save file: '
+                        'the model was *not* compiled. Compile it manually.')
+        return model
+      training_config = json.loads(training_config.decode('utf-8'))
+      optimizer_config = training_config['optimizer_config']
+      optimizer = optimizers.deserialize(
+          optimizer_config, custom_objects=custom_objects)
+
+      # Recover loss functions and metrics.
+      loss = convert_custom_objects(training_config['loss'])
+      metrics = convert_custom_objects(training_config['metrics'])
+      sample_weight_mode = training_config['sample_weight_mode']
+      loss_weights = training_config['loss_weights']
+
+      # Compile model.
+      model.compile(
+          optimizer=optimizer,
+          loss=loss,
+          metrics=metrics,
+          loss_weights=loss_weights,
+          sample_weight_mode=sample_weight_mode)
+
+      # Set optimizer weights.
+      if 'optimizer_weights' in f:
+        # Build train function (to get weight updates).
+        model._make_train_function()
+        optimizer_weights_group = f['optimizer_weights']
+        optimizer_weight_names = [
+            n.decode('utf8')
+            for n in optimizer_weights_group.attrs['weight_names']
+        ]
+        optimizer_weight_values = [
+            optimizer_weights_group[n] for n in optimizer_weight_names
+        ]
+        try:
+          model.optimizer.set_weights(optimizer_weight_values)
+        except ValueError:
+          logging.warning('Error in loading the saved optimizer '
+                          'state. As a result, your model is '
+                          'starting with a freshly initialized '
+                          'optimizer.')
+  finally:
+    if opened_new_file:
+      f.close()
   return model
 
 
@@ -314,7 +302,7 @@ def model_from_config(config, custom_objects=None):
     raise TypeError('`model_from_config` expects a dictionary, not a list. '
                     'Maybe you meant to use '
                     '`Sequential.from_config(config)`?')
-  from tensorflow.python.keras._impl.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
   return deserialize(config, custom_objects=custom_objects)
 
 
@@ -337,7 +325,7 @@ def model_from_yaml(yaml_string, custom_objects=None):
   if yaml is None:
     raise ImportError('Requires yaml module installed.')
   config = yaml.load(yaml_string)
-  from tensorflow.python.keras._impl.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
   return deserialize(config, custom_objects=custom_objects)
 
 
@@ -355,7 +343,7 @@ def model_from_json(json_string, custom_objects=None):
       A Keras model instance (uncompiled).
   """
   config = json.loads(json_string)
-  from tensorflow.python.keras._impl.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
   return deserialize(config, custom_objects=custom_objects)
 
 
@@ -669,7 +657,13 @@ def _convert_rnn_weights(layer, weights):
 
 
 def save_weights_to_hdf5_group(f, layers):
-  from tensorflow.python.keras._impl.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+  """Saves the weights of a list of layers to a HDF5 group.
+
+  Arguments:
+      f: HDF5 group.
+      layers: List of layer instances.
+  """
+  from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
 
   save_attributes_to_hdf5_group(
       f, 'layer_names', [layer.name.encode('utf8') for layer in layers])
@@ -743,7 +737,7 @@ def load_weights_from_hdf5_group(f, layers):
   for k, name in enumerate(layer_names):
     g = f[name]
     weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
-    weight_values = [g[weight_name] for weight_name in weight_names]
+    weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
     layer = filtered_layers[k]
     symbolic_weights = layer.weights
     weight_values = preprocess_weights_for_loading(
@@ -799,7 +793,7 @@ def load_weights_from_hdf5_group_by_name(f, layers):
   for k, name in enumerate(layer_names):
     g = f[name]
     weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
-    weight_values = [g[weight_name] for weight_name in weight_names]
+    weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
 
     for layer in index.get(name, []):
       symbolic_weights = layer.weights
diff --git a/tensorflow/python/keras/_impl/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
similarity index 92%
rename from tensorflow/python/keras/_impl/keras/engine/saving_test.py
rename to tensorflow/python/keras/engine/saving_test.py
index c0b16b6bf5a050b16fe6faa6583db53f264d86ba..30bcd3d1855d28012ce0ae747b900468c57465e0 100644
--- a/tensorflow/python/keras/_impl/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -25,13 +25,13 @@ import tempfile
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras.engine import training
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -253,7 +253,7 @@ class TestWholeModelSaving(test.TestCase):
 
   def test_sequential_model_saving(self):
     if h5py is None:
-      return  # Skip test if models cannot be saved.
+      self.skipTest('h5py required to run this test')
 
     with self.test_session():
       model = keras.models.Sequential()
@@ -290,7 +290,7 @@ class TestWholeModelSaving(test.TestCase):
 
   def test_sequential_model_saving_2(self):
     if h5py is None:
-      return  # Skip test if models cannot be saved.
+      self.skipTest('h5py required to run this test')
 
     with self.test_session():
       # test with custom optimizer, loss
@@ -326,7 +326,7 @@ class TestWholeModelSaving(test.TestCase):
 
   def test_functional_model_saving(self):
     if h5py is None:
-      return  # Skip test if models cannot be saved.
+      self.skipTest('h5py required to run this test')
 
     with self.test_session():
       inputs = keras.layers.Input(shape=(3,))
@@ -354,7 +354,7 @@ class TestWholeModelSaving(test.TestCase):
 
   def test_saving_without_compilation(self):
     if h5py is None:
-      return  # Skip test if models cannot be saved.
+      self.skipTest('h5py required to run this test')
 
     with self.test_session():
       model = keras.models.Sequential()
@@ -370,7 +370,7 @@ class TestWholeModelSaving(test.TestCase):
 
   def test_saving_with_tf_optimizer(self):
     if h5py is None:
-      return  # Skip test if models cannot be saved.
+      self.skipTest('h5py required to run this test')
 
     with self.test_session():
       model = keras.models.Sequential()
@@ -388,7 +388,7 @@ class TestWholeModelSaving(test.TestCase):
 
   def test_saving_right_after_compilation(self):
     if h5py is None:
-      return  # Skip test if models cannot be saved.
+      self.skipTest('h5py required to run this test')
 
     with self.test_session():
       model = keras.models.Sequential()
@@ -405,7 +405,7 @@ class TestWholeModelSaving(test.TestCase):
 
   def test_saving_lambda_numpy_array_arguments(self):
     if h5py is None:
-      return  # Skip test if models cannot be saved.
+      self.skipTest('h5py required to run this test')
 
     mean = np.random.random((4, 2, 3))
     std = np.abs(np.random.random((4, 2, 3))) + 1e-5
@@ -427,7 +427,7 @@ class TestWholeModelSaving(test.TestCase):
 
   def test_saving_model_with_long_layer_names(self):
     if h5py is None:
-      return  # Skip test if models cannot be saved.
+      self.skipTest('h5py required to run this test')
 
     with self.test_session():
       # This layer name will make the `layers_name` HDF5 attribute blow
@@ -468,7 +468,7 @@ class TestWholeModelSaving(test.TestCase):
 
   def test_saving_model_with_long_weights_names(self):
     if h5py is None:
-      return  # Skip test if models cannot be saved.
+      self.skipTest('h5py required to run this test')
 
     with self.test_session():
       x = keras.Input(shape=(2,), name='nested_model_input')
@@ -511,6 +511,43 @@ class TestWholeModelSaving(test.TestCase):
       os.close(fd)
       os.remove(fname)
 
+  def test_model_saving_to_pre_created_h5py_file(self):
+    if h5py is None:
+      self.skipTest('h5py required to run this test')
+
+    with self.test_session():
+      inputs = keras.Input(shape=(3,))
+      x = keras.layers.Dense(2)(inputs)
+      outputs = keras.layers.Dense(3)(x)
+
+      model = keras.Model(inputs, outputs)
+      model.compile(loss=keras.losses.MSE,
+                    optimizer=keras.optimizers.Adam(),
+                    metrics=[keras.metrics.categorical_accuracy])
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+      model.train_on_batch(x, y)
+
+      out = model.predict(x)
+      fd, fname = tempfile.mkstemp('.h5')
+      with h5py.File(fname, mode='r+') as h5file:
+        keras.models.save_model(model, h5file)
+        loaded_model = keras.models.load_model(h5file)
+        out2 = loaded_model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+      # Test non-default options in h5
+      with h5py.File('_', driver='core',
+                     backing_store=False) as h5file:
+        keras.models.save_model(model, h5file)
+        loaded_model = keras.models.load_model(h5file)
+        out2 = loaded_model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
+      # Cleanup
+      os.close(fd)
+      os.remove(fname)
+
 
 class SubclassedModel(training.Model):
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
similarity index 95%
rename from tensorflow/python/keras/_impl/keras/engine/sequential.py
rename to tensorflow/python/keras/engine/sequential.py
index 8626626ca1a232de175af355e317f7df704fe148..52e29b0ffad7d26d2a2fbe8f287146daaffa3059 100644
--- a/tensorflow/python/keras/_impl/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -21,13 +21,13 @@ from __future__ import print_function
 
 import copy
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import layers as layer_module
-from tensorflow.python.keras._impl.keras.engine import base_layer
-from tensorflow.python.keras._impl.keras.engine import network
-from tensorflow.python.keras._impl.keras.engine.input_layer import Input
-from tensorflow.python.keras._impl.keras.engine.input_layer import InputLayer
-from tensorflow.python.keras._impl.keras.engine.training import Model
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import layers as layer_module
+from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import network
+from tensorflow.python.keras.engine.input_layer import Input
+from tensorflow.python.keras.engine.input_layer import InputLayer
+from tensorflow.python.keras.engine.training import Model
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
similarity index 79%
rename from tensorflow/python/keras/_impl/keras/engine/sequential_test.py
rename to tensorflow/python/keras/engine/sequential_test.py
index 8aba16aef3e187e9e33bdb65c7d44b0e622730ef..69a288e69b60b03383b2cb54f8a2fde641516628 100644
--- a/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -20,8 +20,11 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import rmsprop
 
@@ -75,7 +78,7 @@ class TestSequential(test.TestCase):
       model.pop()
 
   @tf_test_util.run_in_graph_and_eager_modes()
-  def test_sequential_deferred_build(self):
+  def test_sequential_deferred_build_with_np_arrays(self):
     num_hidden = 5
     input_dim = 3
     batch_size = 5
@@ -99,6 +102,40 @@ class TestSequential(test.TestCase):
                      [None, num_classes])
     self.assertEqual(len(model.weights), 2 * 2)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_sequential_deferred_build_with_dataset_iterators(self):
+    if not context.executing_eagerly():
+      # TODO(psv/fchollet): Add support for this use case in graph mode.
+      return
+    num_hidden = 5
+    input_dim = 3
+    num_classes = 2
+    num_samples = 50
+    steps_per_epoch = 10
+
+    model = keras.models.Sequential()
+    # We don't specify the input shape.
+    model.add(keras.layers.Dense(num_hidden))
+    model.add(keras.layers.Dense(num_classes))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    self.assertEqual(len(model.layers), 2)
+    self.assertEqual(len(model.weights), 0)
+    self.assertFalse(model.built)
+
+    x = array_ops.ones((num_samples, input_dim))
+    y = array_ops.zeros((num_samples, num_classes))
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+
+    model.fit(iterator, epochs=1, steps_per_epoch=steps_per_epoch)
+    self.assertTrue(model.built)
+    self.assertEqual(model.inputs[0].get_shape().as_list(), [None, input_dim])
+    self.assertEqual(model.outputs[0].get_shape().as_list(),
+                     [None, num_classes])
+    self.assertEqual(len(model.weights), 2 * 2)
+
   @tf_test_util.run_in_graph_and_eager_modes()
   def test_invalid_use_cases(self):
     # Added objects must be layer instances
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/engine/topology_test.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/engine/topology_test.py
rename to tensorflow/python/keras/engine/topology_test.py
index 6993a042890088b21a3ab56bca75f5a0b3b2242a..183e26e8bf813ec0a8c84920a93dcb79a291ca9d 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/topology_test.py
@@ -20,12 +20,12 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras.engine import base_layer
+from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -883,6 +883,33 @@ class TopologyConstructionTest(test.TestCase):
       preds = model.predict(x)
       self.assertEqual(np.min(preds), 0.)  # At least one unit was dropped.
 
+  def test_multi_output_model_with_none_masking(self):
+
+    with self.test_session():
+      def func(x):
+        return [x * 0.2, x * 0.3]
+
+      def output_shape(input_shape):
+        return [input_shape, input_shape]
+
+      i = keras.layers.Input(shape=(3, 2, 1))
+      o = keras.layers.Lambda(function=func, output_shape=output_shape)(i)
+
+      self.assertEqual(keras.backend.int_shape(o[0]), (None, 3, 2, 1))
+      self.assertEqual(keras.backend.int_shape(o[1]), (None, 3, 2, 1))
+
+      o = keras.layers.add(o)
+      model = keras.Model(i, o)
+
+      i2 = keras.layers.Input(shape=(3, 2, 1))
+      o2 = model(i2)
+      model2 = keras.Model(i2, o2)
+
+      x = np.random.random((4, 3, 2, 1))
+      out = model2.predict(x)
+      assert out.shape == (4, 3, 2, 1)
+      self.assertAllClose(out, x * 0.2 + x * 0.3, atol=1e-4)
+
 
 class DeferredModeTest(test.TestCase):
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
similarity index 89%
rename from tensorflow/python/keras/_impl/keras/engine/training.py
rename to tensorflow/python/keras/engine/training.py
index 5f9b3e8c7d7a93a1fb0b7bc83d6378417391e393..ff50d0b6e27f15ef13449b4eef7b815f4684a033 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -18,25 +18,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import weakref
 import numpy as np
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import losses
-from tensorflow.python.keras._impl.keras import metrics as metrics_module
-from tensorflow.python.keras._impl.keras import optimizers
-from tensorflow.python.keras._impl.keras.engine import training_arrays
-from tensorflow.python.keras._impl.keras.engine import training_eager
-from tensorflow.python.keras._impl.keras.engine import training_generator
-from tensorflow.python.keras._impl.keras.engine import training_utils
-from tensorflow.python.keras._impl.keras.engine.base_layer import DeferredTensor
-from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
-from tensorflow.python.keras._impl.keras.engine.network import Network
-from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import losses
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.engine import training_arrays
+from tensorflow.python.keras.engine import training_eager
+from tensorflow.python.keras.engine import training_generator
+from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.keras.engine.base_layer import DeferredTensor
+from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.network import Network
+from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
@@ -106,6 +108,11 @@ class Model(Network):
   ```
   """
 
+  def __init__(self, *args, **kwargs):
+    super(Model, self).__init__(*args, **kwargs)
+    # Create a cache for iterator get_next op.
+    self._iterator_get_next = weakref.WeakKeyDictionary()
+
   def compile(self,
               optimizer,
               loss=None,
@@ -278,6 +285,10 @@ class Model(Network):
           self.metrics_names.append(self.output_names[i] + '_loss')
       self.nested_metrics = training_utils.collect_metrics(metrics,
                                                            self.output_names)
+      # TODO(fchollet): support stateful metrics in eager execution.
+      self.stateful_metric_functions = []
+      self.stateful_metric_names = []
+
       with K.name_scope('metrics'):
         training_utils.populate_metric_names(self)
       self._feed_sample_weight_modes = []
@@ -454,6 +465,7 @@ class Model(Network):
                                                              self.output_names)
     self.metrics_updates = []
     self.stateful_metric_names = []
+    self.stateful_metric_functions = []
     with K.name_scope('metrics'):
       for i in range(len(self.outputs)):
         if i in skip_target_indices:
@@ -509,8 +521,9 @@ class Model(Network):
 
             # Keep track of state updates created by
             # stateful metrics (i.e. metrics layers).
-            if isinstance(metric_fn, Layer):
+            if isinstance(metric_fn, Layer) and metric_fn.stateful:
               self.stateful_metric_names.append(metric_name)
+              self.stateful_metric_functions.append(metric_fn)
               self.metrics_updates += metric_fn.updates
 
         handle_metrics(output_metrics)
@@ -623,12 +636,23 @@ class Model(Network):
           **kwargs)
     self._post_build_cleanup()
 
+  def _get_iterator_get_next_tensors(self, iterator):
+    get_next_op = self._iterator_get_next.get(iterator, None)
+    if get_next_op is None:
+      get_next_op = iterator.get_next()
+      self._iterator_get_next[iterator] = get_next_op
+    return get_next_op
+
   def _standardize_user_data(self,
                              x,
                              y=None,
                              sample_weight=None,
                              class_weight=None,
-                             batch_size=None):
+                             batch_size=None,
+                             check_steps=False,
+                             steps_name='steps',
+                             steps=None,
+                             validation_split=0):
     """Runs validation checks on input and target data passed by the user.
 
     Also standardizes the data to lists of arrays, in order.
@@ -660,6 +684,16 @@ class Model(Network):
         to, as conveyed by `y`.
       batch_size: Integer batch size. If provided, it is used to run additional
         validation checks on stateful models.
+      check_steps: boolean, True if we want to check for validity of `steps` and
+        False, otherwise. For example, when we are standardizing one batch of
+        data for train_on_batch/predict_on_batch/test_on_batch APIs, `steps`
+        value is not required and we should not check for its validity in these
+        cases.
+      steps_name: The public API's parameter name for `steps`.
+      steps: Integer or `None`. Total number of steps (batches of samples) to
+        execute.
+      validation_split: Float between 0 and 1.
+        Fraction of the training data to be used as validation data.
 
     Returns:
       A tuple of 3 lists: input arrays, target arrays, sample-weight arrays.
@@ -671,33 +705,54 @@ class Model(Network):
       ValueError: In case of invalid user-provided data.
       RuntimeError: If the model was never compiled.
     """
-    # First, we build/compile the model on the fly if necessary.
     if isinstance(x, dataset_ops.Dataset):
       raise ValueError('You passed a `Dataset` instance to your model (%s), '
                        'which is not supported. Instead, pass an `Iterator`, '
                        'which you can obtain e.g. via '
                        '`dataset.make_one_shot_iterator()` (the exact method '
                        'to use will depend on your specific dataset).' % x)
-    if isinstance(x, iterator_ops.Iterator):
-      if y is not None:
-        raise ValueError('You passed a dataset iterator (%s) as input `x` to '
-                         'your model. In that case, you should not specify '
-                         'a target (`y`) argument, since the dataset iterator '
-                         'generates both input data and target data. '
-                         'Received: %s' % (x, y))
-      if not context.executing_eagerly():
-        x, y = x.get_next()
-        # TODO(fchollet): handle case of `get_next` not returning 2 tensors?
-      else:
-        # TODO(psv): implement this. The way to support it will be to typecheck
-        # for `iterator` before `_standardize_user_data` is called and redirect
-        # to new training/eval functions in `training_eager.py`. The model
-        # may need to get built using the specs of the data from the first batch
-        # drawn from the iterator.
-        raise ValueError('Dataset iterators are not supported '
-                         'with eager execution yet.')
 
+    # Validates `steps` argument based on x's type.
+    if check_steps:
+      training_utils.check_steps_argument(x, steps, steps_name)
+
+    is_x_eager_iterator = isinstance(x, iterator_ops.EagerIterator)
+    is_x_iterator = isinstance(x, iterator_ops.Iterator)
+
+    # Validate user inputs when data is given as a dataset iterator.
+    if is_x_iterator or is_x_eager_iterator:
+      training_utils.validate_iterator_input(x, y, sample_weight,
+                                             validation_split)
+
+    # For eager iterators, when we have to process multiple batches of samples,
+    # we will standardize the data when we actually loop over iterator and get
+    # the batches. For now, we just return the iterator as is.
+    if is_x_eager_iterator and steps is not None:
+      return x, y, sample_weight
+
+    # If input data is a dataset iterator in graph mode or if it is an eager
+    # iterator and only one batch of samples is required, we fetch the data
+    # tensors from the iterator and then standardize them.
+    if is_x_iterator or is_x_eager_iterator:
+      try:
+        if is_x_iterator:
+          next_element = self._get_iterator_get_next_tensors(x)
+        else:
+          next_element = x.get_next()
+      except errors.OutOfRangeError:
+        raise RuntimeError('Your dataset iterator ran out of data; '
+                           'Make sure that your dataset can generate '
+                           'required number of samples.')
+
+      if not isinstance(next_element, (list, tuple)) or len(next_element) != 2:
+        raise ValueError('Please provide data as a list or tuple of 2 elements '
+                         ' - input and target pair. Received %s' % next_element)
+      x, y = next_element
+
+    # First, we build/compile the model on the fly if necessary.
     all_inputs = []
+    is_build_called = False
+    is_compile_called = False
     if not self.built:
       # We need to use `x` to set the model inputs.
       # We type-check that `x` and `y` are either single arrays
@@ -720,6 +775,7 @@ class Model(Network):
       # If values, then in symbolic-mode placeholders will be created
       # to match the value shapes.
       if not self.inputs:
+        is_build_called = True
         self._set_inputs(x)
 
     if y is not None:
@@ -736,6 +792,7 @@ class Model(Network):
             raise ValueError('Please provide as model targets either a single '
                              'array or a list of arrays. '
                              'You passed: y=' + str(y))
+          all_inputs += list(y)
         elif isinstance(y, dict):
           raise ValueError('Please do not pass a dictionary as model targets.')
         else:
@@ -743,14 +800,10 @@ class Model(Network):
             raise ValueError('Please provide as model targets either a single '
                              'array or a list of arrays. '
                              'You passed: y=' + str(y))
+          all_inputs.append(y)
 
         # Typecheck that all inputs are *either* value *or* symbolic.
         # TODO(fchollet): this check could be removed in Eager mode?
-        if y is not None:
-          if isinstance(y, (list, tuple)):
-            all_inputs += list(y)
-          else:
-            all_inputs.append(y)
         if any(tensor_util.is_tensor(v) for v in all_inputs):
           if not all(tensor_util.is_tensor(v) for v in all_inputs):
             raise ValueError('Do not pass inputs that mix Numpy arrays and '
@@ -764,17 +817,22 @@ class Model(Network):
           if not isinstance(y, (list, tuple)):
             y = [y]
           target_tensors = [v for v in y if tensor_util.is_tensor(v)]
+        is_compile_called = True
         self.compile(optimizer=self.optimizer,
                      loss=self.loss,
                      metrics=self.metrics,
                      loss_weights=self.loss_weights,
                      target_tensors=target_tensors)
 
-    # If `x` and `y` were all symbolic, then no model should not be fed any
-    # inputs and targets.
+    # In graph mode, if we had just set inputs and targets as symbolic tensors
+    # by invoking build and compile on the model respectively, we do not have to
+    # feed anything to the model. Model already has input and target data as
+    # part of the graph.
     # Note: in this case, `any` and `all` are equivalent since we disallow
     # mixed symbolic/value inputs.
-    if any(tensor_util.is_tensor(v) for v in all_inputs):
+    if (not context.executing_eagerly() and is_build_called and
+        is_compile_called and
+        any(tensor_util.is_tensor(v) for v in all_inputs)):
       return [], [], []
 
     # What follows is input validation and standardization to list format,
@@ -904,7 +962,12 @@ class Model(Network):
       if isinstance(inputs, list):
         assert len(inputs) == 1
         inputs = inputs[0]
-      self.build(input_shape=(None,) + inputs.shape[1:])
+
+      if tensor_util.is_tensor(inputs):
+        input_shape = (None,) + tuple(inputs.get_shape().as_list()[1:])
+      else:
+        input_shape = (None,) + inputs.shape[1:]
+      self.build(input_shape=input_shape)
     elif context.executing_eagerly():
       self._eager_set_inputs(inputs)
     else:
@@ -931,12 +994,18 @@ class Model(Network):
     # On-the-fly setting of model inputs/outputs as DeferredTensors,
     # to keep track of number of inputs and outputs and their ndim.
     if isinstance(inputs, (list, tuple)):
-      dummy_output_values = self.call(
-          [ops.convert_to_tensor(v, dtype=K.floatx()) for v in inputs])
+      if tensor_util.is_tensor(inputs[0]):
+        dummy_output_values = self.call(inputs)
+      else:
+        dummy_output_values = self.call(
+            [ops.convert_to_tensor(v, dtype=K.floatx()) for v in inputs])
       dummy_input_values = list(inputs)
     else:
-      dummy_output_values = self.call(
-          ops.convert_to_tensor(inputs, dtype=K.floatx()))
+      if tensor_util.is_tensor(inputs):
+        dummy_output_values = self.call(inputs)
+      else:
+        dummy_output_values = self.call(
+            ops.convert_to_tensor(inputs, dtype=K.floatx()))
       dummy_input_values = [inputs]
     if isinstance(dummy_output_values, (list, tuple)):
       dummy_output_values = list(dummy_output_values)
@@ -1071,7 +1140,7 @@ class Model(Network):
         batch_size: Integer or `None`.
             Number of samples per gradient update.
             If unspecified, `batch_size` will default to 32.
-            Do not specify the `batch_size` is your data is in the
+            Do not specify the `batch_size` if your data is in the
             form of symbolic tensors or dataset iterators (since they generate
             batches).
         epochs: Integer. Number of epochs to train the model.
@@ -1094,7 +1163,8 @@ class Model(Network):
             the loss and any model metrics
             on this data at the end of each epoch.
             The validation data is selected from the last samples
-            in the `x` and `y` data provided, before shuffling.
+            in the `x` and `y` data provided, before shuffling. This argument is
+            not supported when `x` is a dataset iterator.
         validation_data: Data on which to evaluate
             the loss and any model metrics at the end of each epoch.
             The model will not be trained on this data.
@@ -1124,7 +1194,8 @@ class Model(Network):
             `(samples, sequence_length)`,
             to apply a different weight to every timestep of every sample.
             In this case you should make sure to specify
-            `sample_weight_mode="temporal"` in `compile()`.
+            `sample_weight_mode="temporal"` in `compile()`. This argument is not
+            supported when `x` is a dataset iterator.
         initial_epoch: Integer.
             Epoch at which to start training
             (useful for resuming a previous training run).
@@ -1165,21 +1236,23 @@ class Model(Network):
       epochs = kwargs.pop('nb_epoch')
     if kwargs:
       raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
-    if x is None and y is None and steps_per_epoch is None:
-      raise ValueError('If fitting from data tensors, '
-                       'you should specify the `steps_per_epoch` '
-                       'argument.')
 
-    # Validate user data.
+    # Validate and standardize user data.
     x, y, sample_weights = self._standardize_user_data(
         x,
         y,
         sample_weight=sample_weight,
         class_weight=class_weight,
-        batch_size=batch_size)
+        batch_size=batch_size,
+        check_steps=True,
+        steps_name='steps_per_epoch',
+        steps=steps_per_epoch,
+        validation_split=validation_split)
+
     # Prepare validation data.
     if validation_data:
-      if isinstance(validation_data, iterator_ops.Iterator):
+      if (isinstance(validation_data, iterator_ops.Iterator) or
+          isinstance(validation_data, iterator_ops.EagerIterator)):
         val_x = validation_data
         val_y = None
         val_sample_weight = None
@@ -1196,11 +1269,13 @@ class Model(Network):
             'or alternatively it could be a dataset iterator. However we '
             'received `validation_data=%s`' % validation_data)
 
+      # Validate and standardize validation data.
       val_x, val_y, val_sample_weights = self._standardize_user_data(
           val_x,
           val_y,
           sample_weight=val_sample_weight,
-          batch_size=batch_size)
+          batch_size=batch_size,
+          steps=validation_steps)
 
     elif validation_split and 0. < validation_split < 1.:
       if training_utils.has_symbolic_tensors(x):
@@ -1229,6 +1304,7 @@ class Model(Network):
           inputs=x,
           targets=y,
           sample_weights=sample_weights,
+          class_weight=class_weight,
           batch_size=batch_size,
           epochs=epochs,
           verbose=verbose,
@@ -1300,7 +1376,8 @@ class Model(Network):
             `(samples, sequence_length)`,
             to apply a different weight to every timestep of every sample.
             In this case you should make sure to specify
-            `sample_weight_mode="temporal"` in `compile()`.
+            `sample_weight_mode="temporal"` in `compile()`. This argument is not
+            supported when `x` is a dataset iterator.
         steps: Integer or `None`.
             Total number of steps (batches of samples)
             before declaring the evaluation round finished.
@@ -1318,17 +1395,16 @@ class Model(Network):
     # Backwards compatibility.
     if batch_size is None and steps is None:
       batch_size = 32
-    if x is None and y is None and steps is None:
-      raise ValueError('If evaluating from data tensors, '
-                       'you should specify the `steps` '
-                       'argument.')
 
-    # Validate user data.
+    # Validate and standardize user data.
     x, y, sample_weights = self._standardize_user_data(
         x,
         y,
         sample_weight=sample_weight,
-        batch_size=batch_size)
+        batch_size=batch_size,
+        check_steps=True,
+        steps_name='steps',
+        steps=steps)
 
     if context.executing_eagerly():
       return training_eager.test_loop(
@@ -1345,7 +1421,12 @@ class Model(Network):
     Computation is done in batches.
 
     Arguments:
-        x: Input samples, as Numpy array(s) or tensor(s).
+         x: Input samples. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A `tf.data` dataset iterator.
         batch_size: Integer or `None`.
             Number of samples per gradient update.
             If unspecified, `batch_size` will default to 32.
@@ -1369,11 +1450,10 @@ class Model(Network):
     # Backwards compatibility.
     if batch_size is None and steps is None:
       batch_size = 32
-    if x is None and steps is None:
-      raise ValueError('If predicting from data tensors, '
-                       'you should specify the `steps` '
-                       'argument.')
-    x, _, _ = self._standardize_user_data(x)
+
+    # Validate and standardize user data.
+    x, _, _ = self._standardize_user_data(
+        x, check_steps=True, steps_name='steps', steps=steps)
 
     if context.executing_eagerly():
       return training_eager.predict_loop(
@@ -1406,7 +1486,9 @@ class Model(Network):
             with shape (samples, sequence_length),
             to apply a different weight to every timestep of every sample.
             In this case you should make sure to specify
-            sample_weight_mode="temporal" in compile().
+            sample_weight_mode="temporal" in compile(). This argument is not
+            supported when `x` is a dataset iterator.
+
         class_weight: Optional dictionary mapping
             class indices (integers) to
             a weight (float) to apply to the model's loss for the samples
@@ -1424,11 +1506,9 @@ class Model(Network):
     Raises:
       ValueError: In case of invalid user-provided arguments.
     """
+    # Validate and standardize user data.
     x, y, sample_weights = self._standardize_user_data(
-        x,
-        y,
-        sample_weight=sample_weight,
-        class_weight=class_weight)
+        x, y, sample_weight=sample_weight, class_weight=class_weight)
 
     if context.executing_eagerly():
       outputs = training_eager.train_on_batch(
@@ -1470,7 +1550,8 @@ class Model(Network):
             with shape (samples, sequence_length),
             to apply a different weight to every timestep of every sample.
             In this case you should make sure to specify
-            sample_weight_mode="temporal" in compile().
+            sample_weight_mode="temporal" in compile(). This argument is not
+            supported when `x` is a dataset iterator.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -1481,6 +1562,7 @@ class Model(Network):
     Raises:
         ValueError: In case of invalid user-provided arguments.
     """
+    # Validate and standardize user data.
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight)
 
@@ -1503,23 +1585,34 @@ class Model(Network):
     """Returns predictions for a single batch of samples.
 
     Arguments:
-        x: Input samples, as Numpy array(s) or tensor(s).
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A `tf.data` dataset iterator.
 
     Returns:
         Numpy array(s) of predictions.
 
+    Raises:
+        ValueError: In case of mismatch between given number of inputs and
+          expectations of the model.
     """
-    x, _, _ = self._standardize_user_data(x)
-
+    # Validate and standardize user data.
+    inputs, _, _ = self._standardize_user_data(x)
     if context.executing_eagerly():
-      inputs = [ops.convert_to_tensor(val, dtype=K.floatx()) for val in x]
+      if not isinstance(inputs, iterator_ops.EagerIterator):
+        inputs = [
+            ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs
+        ]
       return self(inputs)  # pylint: disable=not-callable
 
     if not context.executing_eagerly():
       if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        ins = x + [0]
+        ins = inputs + [0]
       else:
-        ins = x
+        ins = inputs
 
       self._make_predict_function()
       outputs = self.predict_function(ins)
@@ -1631,8 +1724,7 @@ class Model(Network):
                             steps_per_epoch=10000, epochs=10)
     ```
     Raises:
-        ValueError: In case the generator yields
-            data in an invalid format.
+        ValueError: In case the generator yields data in an invalid format.
     """
     if not self.built and not self._is_graph_network:
       raise NotImplementedError(
@@ -1659,7 +1751,8 @@ class Model(Network):
                          steps=None,
                          max_queue_size=10,
                          workers=1,
-                         use_multiprocessing=False):
+                         use_multiprocessing=False,
+                         verbose=0):
     """Evaluates the model on a data generator.
 
     The generator should return the same kind of data
@@ -1686,6 +1779,7 @@ class Model(Network):
             Note that because this implementation relies on multiprocessing,
             you should not pass non-picklable arguments to the generator
             as they can't be passed easily to children processes.
+        verbose: Verbosity mode, 0 or 1.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -1697,8 +1791,7 @@ class Model(Network):
         ValueError: in case of invalid arguments.
 
     Raises:
-        ValueError: In case the generator yields
-            data in an invalid format.
+        ValueError: In case the generator yields data in an invalid format.
     """
     if not self.built and not self._is_graph_network:
       raise NotImplementedError(
@@ -1711,7 +1804,8 @@ class Model(Network):
         steps=steps,
         max_queue_size=max_queue_size,
         workers=workers,
-        use_multiprocessing=use_multiprocessing)
+        use_multiprocessing=use_multiprocessing,
+        verbose=verbose)
 
   def predict_generator(self,
                         generator,
@@ -1751,8 +1845,7 @@ class Model(Network):
         Numpy array(s) of predictions.
 
     Raises:
-        ValueError: In case the generator yields
-            data in an invalid format.
+        ValueError: In case the generator yields data in an invalid format.
     """
     if not self.built and not self._is_graph_network:
       raise NotImplementedError(
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
similarity index 95%
rename from tensorflow/python/keras/_impl/keras/engine/training_arrays.py
rename to tensorflow/python/keras/engine/training_arrays.py
index 4164cae864c7f8aa8f59bb66d585aaa682fc8ff8..93f4f1bd1dde848d9d3afbfd1dcbd26741b9c745 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -24,13 +24,12 @@ import copy
 import numpy as np
 
 from tensorflow.python.framework import errors
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import callbacks as cbks
-from tensorflow.python.keras._impl.keras.engine import training_utils
-from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
-from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
-from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import callbacks as cbks
+from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.keras.utils.generic_utils import make_batches
+from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.platform import tf_logging as logging
 
 try:
@@ -108,8 +107,8 @@ def fit_loop(model,
   do_validation = False
   if val_inputs:
     do_validation = True
-    if verbose and inputs and hasattr(inputs[0], 'shape') and hasattr(
-        val_inputs[0], 'shape'):
+    if (steps_per_epoch is None and verbose and inputs and
+        hasattr(inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')):
       print('Train on %d samples, validate on %d samples' %
             (inputs[0].shape[0], val_inputs[0].shape[0]))
   if validation_steps:
@@ -180,9 +179,8 @@ def fit_loop(model,
 
   for epoch in range(initial_epoch, epochs):
     # Reset stateful metrics
-    for m in model.metrics:
-      if isinstance(m, Layer):
-        m.reset_states()
+    for m in model.stateful_metric_functions:
+      m.reset_states()
     # Update callbacks
     callbacks.on_epoch_begin(epoch)
     epoch_logs = {}
@@ -413,9 +411,8 @@ def test_loop(model, inputs, targets,
     ins = inputs + targets + sample_weights
 
   if hasattr(model, 'metrics'):
-    for m in model.metrics:
-      if isinstance(m, Layer):
-        m.reset_states()
+    for m in model.stateful_metric_functions:
+      m.reset_states()
     stateful_metric_indices = [
         i for i, name in enumerate(model.metrics_names)
         if str(name) in model.stateful_metric_names
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
new file mode 100644
index 0000000000000000000000000000000000000000..081e46aa66176d9d19c48c41b950f8df887e7a84
--- /dev/null
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -0,0 +1,1111 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras training and evaluation routines for eager execution.
+"""
+# pylint: disable=protected-access
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+import numpy as np
+
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.eager.backprop import GradientTape
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import callbacks as cbks
+from tensorflow.python.keras import losses
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging as logging
+
+
+def _get_metrics_info(metric, internal_output_shapes=None, loss_func=None):
+  if metric == 'accuracy' or metric == 'acc':
+    # custom handling of accuracy
+    # (because of class mode duality)
+    output_shape = internal_output_shapes
+    if output_shape[-1] == 1 or loss_func == losses.binary_crossentropy:
+      # case: binary accuracy
+      acc_fn = metrics_module.binary_accuracy
+    elif loss_func == losses.sparse_categorical_crossentropy:
+      # case: categorical accuracy with sparse targets
+      acc_fn = metrics_module.sparse_categorical_accuracy
+    else:
+      acc_fn = metrics_module.categorical_accuracy
+
+    metric_name = 'acc'
+    return metric_name, acc_fn
+  else:
+    metric_fn = metrics_module.get(metric)
+    metric_name = metric_fn.__name__
+    return metric_name, metric_fn
+
+
+def _eager_loss_fn(outputs, targets, loss_fn, output_name):
+  with backend.name_scope(output_name + '_loss'):
+    loss = loss_fn(targets, outputs)
+  return loss
+
+
+def _eager_metrics_fn(model, outputs, targets):
+  """Calculates the metrics for each output of the given model.
+
+  Arguments:
+      model: The model on which metrics are being calculated.
+      outputs: The outputs of the given model.
+      targets: The predictions or targets of the given model.
+
+  Returns:
+      Returns the metric names and metric results for each output of the model.
+  """
+  metric_names = []
+  metric_results = []
+  if not isinstance(outputs, list):
+    outputs = [outputs]
+
+  if not isinstance(targets, list):
+    targets = [targets]
+
+  for i in range(len(model.outputs)):
+    output_metrics = model.nested_metrics[i]
+    for nested_output_metric in output_metrics:
+      metric_name, metric_fn = _get_metrics_info(
+          nested_output_metric, backend.int_shape(model.outputs[i]),
+          model.loss_functions[i])
+
+      if len(model.output_names) > 1:
+        metric_name = model.output_names[i] + '_' + metric_name
+        if metric_name not in model.metrics_names:
+          model.metrics_names.append(metric_name)
+
+      with backend.name_scope(metric_name):
+        metric_result = metric_fn(targets[i], outputs[i])
+        metric_names.append(metric_name)
+        metric_results.append(backend.mean(metric_result))
+
+  return metric_results
+
+
+def _model_loss(model, inputs, targets, sample_weights=None, training=False):
+  """Calculates the loss for a given model.
+
+  Arguments:
+      model: The model on which metrics are being calculated.
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      sample_weights: Optional list of sample weight arrays.
+      training: Whether the model should be run in inference or training mode.
+
+  Returns:
+     Returns the model output, total loss and loss value calculated using the
+     specified loss function. The total loss includes regularization losses and
+     applies masking and sample weighting to the loss value.
+  """
+  total_loss = 0
+  if len(inputs) == 1:
+    if model._expects_training_arg:
+      outs = model.call(inputs[0], training=training)
+    else:
+      outs = model.call(inputs[0])
+  else:
+    if model._expects_training_arg:
+      outs = model.call(inputs, training=training)
+    else:
+      outs = model.call(inputs)
+  if not isinstance(outs, list):
+    outs = [outs]
+
+  if not isinstance(targets, list):
+    targets = [targets]
+
+  loss_metrics = []
+  with backend.name_scope('loss'):
+    for i, loss_fn in enumerate(model.loss_functions):
+      if sample_weights:
+        weights = sample_weights[i]
+      else:
+        weights = None
+
+      # TODO(fchollet): support masking; in practice `_keras_mask` is never
+      # set in this context currently.
+      mask = outs[i]._keras_mask
+
+      weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
+      with backend.name_scope(model.output_names[i] + '_loss'):
+        output_loss = weighted_masked_fn(
+            targets[i], outs[i], weights, mask=mask)
+      # If the number of outputs is 1 then we don't append the loss metric
+      # associated with each model output. When there are multiple outputs
+      # associated with a model, each output's loss is calculated and returned
+      # as part of the loss_metrics.
+      if len(model.outputs) > 1:
+        loss_metrics.append(backend.mean(output_loss))
+
+      loss_weight = model.loss_weights_list[i]
+      if total_loss is None:
+        total_loss = loss_weight * output_loss
+      else:
+        total_loss += loss_weight * output_loss
+
+    total_loss = backend.mean(total_loss)
+    # Add regularization losses
+    custom_losses = []
+    for layer in model.layers:
+      if layer.losses:
+        custom_losses += layer.losses
+
+    if custom_losses:
+      total_loss += sum(custom_losses)
+
+  return outs, total_loss, loss_metrics
+
+
+def iterator_fit_loop(model,
+                      inputs,
+                      class_weight,
+                      steps_per_epoch,
+                      callback_model,
+                      out_labels,
+                      epoch_logs,
+                      val_inputs=None,
+                      val_targets=None,
+                      val_sample_weights=None,
+                      epochs=1,
+                      verbose=1,
+                      callbacks=None,
+                      callback_metrics=None,
+                      validation_steps=None,
+                      do_validation=False):
+  """Fit function for eager execution when input is given as dataset iterator.
+
+  Updates the given epoch logs.
+
+  Arguments:
+      model: Instance of the `Model`.
+      inputs: Input dataset iterator.
+      class_weight: Optional class-weight array to weight the importance of
+          samples in `inputs` based on the class they belong to, as conveyed by
+          the targets from the `inputs` iterator.
+      steps_per_epoch: Total number of steps (batches of samples)
+          before declaring one epoch finished and starting the
+          next epoch.
+      callback_model: Instance of `Model` to callback.
+      out_labels: Output labels generated from model metric names.
+      epoch_logs: Dictionary of logs from every epoch.
+      val_inputs: Input data for validation.
+      val_targets: Target data for validation.
+      val_sample_weights: Sample weight data for validation.
+      epochs: Number of times to iterate over the data
+      verbose: Verbosity mode, 0, 1 or 2
+      callbacks: List of callbacks to be called during training
+      callback_metrics: List of strings, the display names of the metrics
+          passed to the callbacks. They should be the
+          concatenation of list the display names of the outputs of
+           `f` and the list of display names of the outputs of `f_val`.
+      validation_steps: Number of steps to run validation for (only if doing
+        validation from data tensors). Ignored with default value of `None`.
+      do_validation: Boolean value indicating whether we should do validation.
+
+  Raises:
+      ValueError: In case of mismatch between given number of inputs and
+        expectations of the model.
+  """
+  assert isinstance(inputs, iterator_ops.EagerIterator)
+  for step_index in range(steps_per_epoch):
+    batch_logs = {}
+    batch_logs['batch'] = step_index
+    batch_logs['size'] = 1
+    callbacks.on_batch_begin(step_index, batch_logs)
+
+    # Get data from the iterator.
+    try:
+      next_element = inputs.get_next()
+    except errors.OutOfRangeError:
+      logging.warning(
+          'Your dataset iterator ran out of data; '
+          'interrupting training. Make sure that your dataset'
+          ' can generate at least `steps_per_epoch * epochs` '
+          'batches (in this case, %d batches).' % steps_per_epoch * epochs)
+      break
+
+    if not isinstance(next_element, (list, tuple)) or len(next_element) != 2:
+      raise ValueError('Please provide data as a list or tuple of 2 elements '
+                       ' - input and target pair. Received %s' % next_element)
+    x, y = next_element
+
+    # Validate and standardize data.
+    x, y, sample_weights = model._standardize_user_data(
+        x, y, class_weight=class_weight)
+    if sample_weights:
+      sample_weights = [
+          ops.convert_to_tensor(val, dtype=backend.floatx())
+          if val is not None else None for val in sample_weights
+      ]
+
+    if step_index == 0 and not callback_metrics:
+      out_labels = model.metrics_names
+      if do_validation:
+        callback_metrics = copy.copy(out_labels) + [
+            'val_' + n for n in out_labels
+        ]
+      else:
+        callback_metrics = copy.copy(out_labels)
+      callbacks.set_params({
+          'epochs': epochs,
+          'steps': steps_per_epoch,
+          'verbose': verbose,
+          'do_validation': do_validation,
+          'metrics': callback_metrics or [],
+      })
+
+    # Train model.
+    outs, loss, loss_metrics = _process_single_batch(
+        model, x, y, sample_weights=sample_weights, training=True)
+    if not isinstance(outs, list):
+      outs = [outs]
+
+    # Calculate metrics.
+    for l, o in zip(out_labels, outs):
+      batch_logs[l] = o
+    # Required for eager execution
+    metrics_results = _eager_metrics_fn(model, outs, y)
+    batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
+
+    for k, v in zip(model.metrics_names,
+                    [backend.mean(loss)] + loss_metrics + metrics_results):
+      batch_logs[k] = tensor_util.constant_value(v)
+    callbacks.on_batch_end(step_index, batch_logs)
+    if callback_model.stop_training:
+      break
+
+    if step_index == steps_per_epoch - 1:
+      if do_validation:
+        val_outs = test_loop(
+            model,
+            val_inputs,
+            val_targets,
+            sample_weights=val_sample_weights,
+            steps=validation_steps,
+            verbose=0)
+        if not isinstance(val_outs, list):
+          val_outs = [val_outs]
+        # Same labels assumed.
+        for l, o in zip(out_labels, val_outs):
+          epoch_logs['val_' + l] = o
+
+
+def batch_fit_loop(model,
+                   inputs,
+                   targets,
+                   epoch_logs,
+                   index_array,
+                   out_labels,
+                   callback_model,
+                   batch_size,
+                   sample_weights=None,
+                   val_inputs=None,
+                   val_targets=None,
+                   val_sample_weights=None,
+                   callbacks=None,
+                   shuffle=True,
+                   num_train_samples=None,
+                   do_validation=False):
+  """Fit function for eager execution when input is given as arrays or tensors.
+
+  Updates the given epoch logs.
+
+  Arguments:
+      model: Instance of the `Model`.
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      epoch_logs: Dictionary of logs from every epoch.
+      index_array: Index array generated from number of training samples.
+      out_labels: Output labels generated from model metric names.
+      callback_model: Instance of `Model` to callback.
+      batch_size: Integer batch size or None if unknown.
+      sample_weights: Optional list of sample weight arrays.
+      val_inputs: Input data for validation.
+      val_targets: Target data for validation.
+      val_sample_weights: Sample weight data for validation.
+      callbacks: List of callbacks to be called during training.
+      shuffle: Whether to shuffle the data at the beginning of each epoch.
+      num_train_samples: Integer number of training samples.
+      do_validation: Boolean value indicating whether we should do validation.
+  """
+  # TODO(psv): Create a dataset iterator instead of manually creating batches
+  # here and in batch_test_loop, batch_predict_loop.
+  if shuffle == 'batch':
+    index_array = model._batch_shuffle(index_array, batch_size)
+  elif shuffle:
+    np.random.shuffle(index_array)
+
+  batches = generic_utils.make_batches(num_train_samples, batch_size)
+
+  for batch_index, (batch_start, batch_end) in enumerate(batches):
+    batch_ids = index_array[batch_start:batch_end]
+    inputs_batch = slice_arrays(inputs, batch_ids, contiguous=not shuffle)
+    targets_batch = slice_arrays(targets, batch_ids, contiguous=not shuffle)
+    if sample_weights:
+      sample_weights_batch = slice_arrays(
+          sample_weights, batch_ids, contiguous=not shuffle)
+    else:
+      sample_weights_batch = None
+    batch_logs = {}
+    batch_logs['batch'] = batch_index
+    batch_logs['size'] = len(batch_ids)
+
+    callbacks.on_batch_begin(batch_index, batch_logs)
+
+    inputs_batch = [
+        ops.convert_to_tensor(val, dtype=backend.floatx())
+        for val in inputs_batch
+    ]
+    targets_batch = [
+        ops.convert_to_tensor(val, dtype=backend.floatx())
+        for val in targets_batch
+    ]
+    if sample_weights:
+      sample_weights_batch = [
+          ops.convert_to_tensor(val, dtype=backend.floatx())
+          if val is not None else None for val in sample_weights_batch
+      ]
+
+    outs, loss, loss_metrics = _process_single_batch(
+        model,
+        inputs_batch,
+        targets_batch,
+        sample_weights=sample_weights_batch,
+        training=True)
+
+    if not isinstance(outs, list):
+      outs = [outs]
+
+    for l, o in zip(out_labels, outs):
+      batch_logs[l] = o
+    # Required for eager execution
+    metrics_results = _eager_metrics_fn(model, outs, targets_batch)
+    batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
+
+    for k, v in zip(model.metrics_names,
+                    [backend.mean(loss)] + loss_metrics + metrics_results):
+      batch_logs[k] = tensor_util.constant_value(v)
+    callbacks.on_batch_end(batch_index, batch_logs)
+    if callback_model.stop_training:
+      break
+
+    if batch_index == len(batches) - 1:  # Last batch.
+      if do_validation:
+        val_outs = test_loop(
+            model,
+            val_inputs,
+            val_targets,
+            sample_weights=val_sample_weights,
+            batch_size=batch_size,
+            verbose=0)
+        if not isinstance(val_outs, list):
+          val_outs = [val_outs]
+        # Same labels assumed.
+        for l, o in zip(out_labels, val_outs):
+          epoch_logs['val_' + l] = o
+
+
+def iterator_test_loop(model, inputs, steps, verbose=0):
+  """Test function for eager execution when input is given as dataset iterator.
+
+  Arguments:
+      model: Model instance that is being evaluated in Eager mode.
+      inputs: Input dataset iterator.
+      steps: Total number of steps (batches of samples) before declaring
+      predictions finished.
+      verbose: Verbosity mode.
+
+  Returns:
+      Scalar loss (if the model has a single output and no metrics)
+      or list of scalars (if the model has multiple outputs
+      and/or metrics). The attribute `model.metrics_names` will give you
+      the display labels for the scalar outputs.
+
+  Raises:
+      ValueError: In case of mismatch between given number of inputs and
+        expectations of the model.
+  """
+  assert isinstance(inputs, iterator_ops.EagerIterator)
+  outs = []
+  num_samples = 0
+  if verbose == 1:
+    progbar = generic_utils.Progbar(target=steps)
+  for step_index in range(steps):
+    # Get data from the iterator.
+    try:
+      next_element = inputs.get_next()
+    except errors.OutOfRangeError:
+      logging.warning(
+          'Your dataset iterator ran out of data interrupting testing. '
+          'Make sure that your dataset can generate at least `steps` batches '
+          '(in this case, %d batches).', steps)
+      break
+
+    if not isinstance(next_element, (list, tuple)) or len(next_element) != 2:
+      raise ValueError('Please provide data as a list or tuple of 2 elements '
+                       ' - input and target pair. Received %s' % next_element)
+    x, y = next_element
+
+    # Validate and standardize data.
+    x, y, sample_weights = model._standardize_user_data(x, y)
+
+    # Calculate model output, loss values.
+    loss_outs, loss, loss_metrics = _model_loss(
+        model, x, y, sample_weights=sample_weights, training=False)
+    metrics_results = _eager_metrics_fn(model, loss_outs, y)
+    batch_outs = []
+    for _, v in zip(model.metrics_names,
+                    [backend.mean(loss)] + loss_metrics + metrics_results):
+      batch_outs.append(tensor_util.constant_value(v))
+
+    # Get current step size.
+    if isinstance(x, list):
+      step_size = x[0].get_shape().as_list()[0]
+    else:
+      step_size = x.get_shape().as_list()[0]
+
+    # Accumulate results in output array.
+    if not isinstance(batch_outs, list):
+      batch_outs = [batch_outs]
+    if step_index == 0:
+      for _ in enumerate(batch_outs):
+        outs.append(0.)
+    for i, batch_out in enumerate(batch_outs):
+      outs[i] += batch_out * step_size
+
+    # Calculate sample size.
+    num_samples += step_size
+    if verbose == 1:
+      progbar.update(step_index + 1)
+
+    for i in range(len(outs)):
+      outs[i] /= num_samples
+    if len(outs) == 1:
+      return outs[0]
+    return outs
+
+
+def batch_test_loop(model,
+                    inputs,
+                    targets,
+                    batch_size,
+                    sample_weights=None,
+                    verbose=0):
+  """Test function for eager execution when input is given as arrays or tensors.
+
+  Arguments:
+      model: Model instance that is being evaluated in Eager mode.
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      batch_size: Integer batch size.
+      sample_weights: Optional list of sample weight arrays.
+      verbose: Verbosity mode.
+
+  Returns:
+      Scalar loss (if the model has a single output and no metrics)
+      or list of scalars (if the model has multiple outputs
+      and/or metrics). The attribute `model.metrics_names` will give you
+      the display labels for the scalar outputs.
+  """
+  outs = []
+  feed_data = inputs + targets
+  if sample_weights:
+    feed_data += sample_weights
+  num_samples = training_utils.check_num_samples(
+      feed_data, batch_size=batch_size)
+  if verbose == 1:
+    progbar = generic_utils.Progbar(target=num_samples)
+  batches = generic_utils.make_batches(num_samples, batch_size)
+  index_array = np.arange(num_samples)
+  for batch_index, (batch_start, batch_end) in enumerate(batches):
+    batch_ids = index_array[batch_start:batch_end]
+    inputs_batch = slice_arrays(inputs, batch_ids)
+    targets_batch = slice_arrays(targets, batch_ids)
+    if sample_weights:
+      sample_weights_batch = slice_arrays(sample_weights, batch_ids)
+    else:
+      sample_weights_batch = None
+
+    inputs_batch = [
+        ops.convert_to_tensor(val, dtype=backend.floatx())
+        for val in inputs_batch
+    ]
+    targets_batch = [
+        ops.convert_to_tensor(val, dtype=backend.floatx())
+        for val in targets_batch
+    ]
+    if sample_weights:
+      sample_weights_batch = [
+          ops.convert_to_tensor(val, dtype=backend.floatx())
+          if val is not None else None for val in sample_weights_batch
+      ]
+
+    loss_outs, loss, loss_metrics = _model_loss(
+        model,
+        inputs_batch,
+        targets_batch,
+        sample_weights=sample_weights_batch,
+        training=False)
+    metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch)
+    batch_outs = []
+    for _, v in zip(model.metrics_names,
+                    [backend.mean(loss)] + loss_metrics + metrics_results):
+      batch_outs.append(tensor_util.constant_value(v))
+
+    if isinstance(batch_outs, list):
+      if batch_index == 0:
+        for _ in enumerate(batch_outs):
+          outs.append(0.)
+      for i, batch_out in enumerate(batch_outs):
+        outs[i] += batch_out * len(batch_ids)
+    else:
+      if batch_index == 0:
+        outs.append(0.)
+      outs[0] += batch_outs * len(batch_ids)
+
+    if verbose == 1:
+      progbar.update(batch_end)
+
+  for i in range(len(outs)):
+    outs[i] /= num_samples
+  if len(outs) == 1:
+    return outs[0]
+  return outs
+
+
+def iterator_predict_loop(model, inputs, steps, verbose=0):
+  """Predict function for eager execution when input is dataset iterator.
+
+  Arguments:
+      model: Instance of `Model`.
+      inputs: Input dataset iterator.
+      steps: Total number of steps (batches of samples) before declaring
+          `_predict_loop` finished.
+      verbose: Verbosity mode.
+
+  Returns:
+      Array of predictions (if the model has a single output)
+      or list of arrays of predictions (if the model has multiple outputs).
+
+  Raises:
+      ValueError: In case of mismatch between given number of inputs and
+        expectations of the model.
+  """
+  assert isinstance(inputs, iterator_ops.EagerIterator)
+  outs = []
+  if verbose == 1:
+    progbar = generic_utils.Progbar(target=steps)
+  for step_index in range(steps):
+    # Get data from the iterator.
+    try:
+      next_element = inputs.get_next()
+    except errors.OutOfRangeError:
+      logging.warning(
+          'Your dataset iterator ran out of data; '
+          'interrupting prediction. Make sure that your '
+          'dataset can generate at least `steps` '
+          'batches (in this case, %d batches).', steps)
+      break
+
+    if not isinstance(next_element, (list, tuple)) or len(next_element) != 2:
+      raise ValueError(
+          'Please provide data as a list or tuple of 2 elements '
+          ' - input and target pair. Received %s. We do not use the '
+          '`target` value here.' % next_element)
+    x, _ = next_element
+
+    # Validate and standardize data.
+    x, _, _ = model._standardize_user_data(x)
+
+    if model._expects_training_arg:
+      batch_outs = model.call(x[0] if len(x) == 1 else x, training=False)
+    else:
+      batch_outs = model.call(x[0] if len(x) == 1 else x)
+    if not isinstance(batch_outs, list):
+      batch_outs = [batch_outs]
+
+    # We collect the results from every step and then concatenate them once
+    # in the end. This is an expensive process. We are doing this because we
+    # do not know the number of samples beforehand.
+    if step_index == 0:
+      for _ in batch_outs:
+        outs.append([])
+    for i, batch_out in enumerate(batch_outs):
+      outs[i].append(backend.get_value(batch_out))
+
+    if verbose == 1:
+      progbar.update(step_index + 1)
+  for i, out in enumerate(outs):
+    outs[i] = np.concatenate(tuple(out), axis=0)
+  if len(outs) == 1:
+    return outs[0]
+  return outs
+
+
+def batch_predict_loop(model, inputs, batch_size, verbose=0):
+  """Predict function for eager execution when input is arrays or tensors.
+
+  Arguments:
+      model: Instance of `Model`.
+      inputs: List of input arrays.
+      batch_size: Integer batch size.
+      verbose: Verbosity mode.
+
+  Returns:
+      Array of predictions (if the model has a single output)
+      or list of arrays of predictions (if the model has multiple outputs).
+  """
+  outs = []
+  num_samples = training_utils.check_num_samples(inputs, batch_size)
+  if verbose == 1:
+    progbar = generic_utils.Progbar(target=num_samples)
+  batches = generic_utils.make_batches(num_samples, batch_size)
+  index_array = np.arange(num_samples)
+  for batch_index, (batch_start, batch_end) in enumerate(batches):
+    batch_ids = index_array[batch_start:batch_end]
+    inputs_batch = slice_arrays(inputs, batch_ids)
+
+    inputs_batch = [
+        ops.convert_to_tensor(val, dtype=backend.floatx())
+        for val in inputs_batch
+    ]
+
+    if len(inputs_batch) == 1:
+      if model._expects_training_arg:
+        batch_outs = model.call(inputs_batch[0], training=False)
+      else:
+        batch_outs = model.call(inputs_batch[0])
+    else:
+      if model._expects_training_arg:
+        batch_outs = model.call(inputs_batch, training=False)
+      else:
+        batch_outs = model.call(inputs_batch)
+
+    if not isinstance(batch_outs, list):
+      batch_outs = [batch_outs]
+    if batch_index == 0:
+      # Pre-allocate the results arrays.
+      for batch_out in batch_outs:
+        dims = batch_out.shape[1:].dims
+        dims_list = [d.value for d in dims]
+        shape = (num_samples,) + tuple(dims_list)
+        outs.append(np.zeros(shape, dtype=batch_out.dtype.as_numpy_dtype))
+    for i, batch_out in enumerate(batch_outs):
+      outs[i][batch_start:batch_end] = batch_out
+    if verbose == 1:
+      progbar.update(batch_end)
+
+  if len(outs) == 1:
+    return outs[0]
+  return outs
+
+
+def slice_arrays(arrays, indices, contiguous=True):
+  """Slices batches out of provided arrays (workaround for eager tensors).
+
+  Unfortunately eager tensors don't have the same slicing behavior as
+  Numpy arrays (they follow the same slicing behavior as symbolic TF tensors),
+  hence we cannot use `generic_utils.slice_arrays` directly
+  and we have to implement this workaround based on `concat`. This has a
+  performance cost.
+
+  Arguments:
+    arrays: Single array or list of arrays.
+    indices: List of indices in the array that should be included in the output
+      batch.
+    contiguous: Boolean flag indicating whether the indices are contiguous.
+
+  Returns:
+    Slice of data (either single array or list of arrays).
+  """
+  if any(tensor_util.is_tensor(x) for x in arrays):
+    converted_to_list = False
+    if not isinstance(arrays, list):
+      converted_to_list = True
+      arrays = [arrays]
+    if not contiguous:
+      entries = [[x[i:i + 1] for i in indices] for x in arrays]
+      slices = [array_ops.concat(x, axis=0) for x in entries]
+    else:
+      slices = [x[indices[0]:indices[-1] + 1] for x in arrays]
+    if converted_to_list:
+      slices = slices[0]
+    return slices
+  else:
+    return generic_utils.slice_arrays(arrays, indices)
+
+
+def _process_single_batch(model,
+                          inputs,
+                          targets,
+                          sample_weights=None,
+                          training=False):
+  """Calculate the loss and gradient for one input batch.
+
+     The model weights are updated if training is set to True.
+
+  Arguments:
+      model: Model whose loss has to be calculated.
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      sample_weights: Optional list of sample weight arrays.
+      training: The boolean represents if the weights of the model are updated.
+              'fit' methods will set this to True while 'evaluate' methods will
+              set this to False.
+
+  Returns:
+      output of the model, total loss and the loss associated with each output.
+
+  Raises:
+      ValueError: If the model has no loss to optimize.
+  """
+  with backend.learning_phase_scope(1 if training else 0):
+    with GradientTape() as tape:
+      outs, loss, loss_metrics = _model_loss(model, inputs, targets,
+                                             sample_weights=sample_weights,
+                                             training=training)
+      if loss is None:
+        raise ValueError('The model cannot be run '
+                         'because it has no loss to optimize.')
+    if training:
+      if not model._collected_trainable_weights:
+        logging.warning('The list of trainable weights is empty. Make sure that'
+                        ' you are not setting model.trainable to False before '
+                        'compiling the model.')
+      else:
+        grads = tape.gradient(loss, model._collected_trainable_weights)
+        model.optimizer.apply_gradients(zip(grads,
+                                            model._collected_trainable_weights))
+    return outs, loss, loss_metrics
+
+
+def train_on_batch(model, inputs, targets, sample_weights=None):
+  """Calculates the loss and gradient updates for one input batch.
+
+  Arguments:
+      model: Model whose loss has to be calculated.
+      inputs: Input batch data.
+      targets: Target batch data.
+      sample_weights: Sample weight batch data.
+
+  Returns:
+      total loss and the loss associated with each output.
+  """
+  if len(inputs) and not tensor_util.is_tensor(inputs[0]):
+    inputs = [
+        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
+    ]
+    targets = [
+        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
+    ]
+  if sample_weights:
+    sample_weights = [
+        ops.convert_to_tensor(val, dtype=backend.floatx())
+        if val is not None else None for val in sample_weights
+    ]
+
+  outs, loss, _ = _process_single_batch(
+      model, inputs, targets, sample_weights=sample_weights, training=True)
+  if not isinstance(outs, list):
+    outs = [outs]
+  metrics_results = _eager_metrics_fn(model, outs, targets)
+  if not isinstance(loss, list):
+    loss = [loss]
+  return loss + metrics_results
+
+
+def test_on_batch(model, inputs, targets, sample_weights=None):
+  """Calculates the loss for one input batch.
+
+  Arguments:
+      model: Model whose loss has to be calculated.
+      inputs: Input batch data.
+      targets: Target batch data.
+      sample_weights: Sample weight batch data.
+
+  Returns:
+      total loss, loss and metrics associated with each output.
+  """
+  if len(inputs) and not tensor_util.is_tensor(inputs[0]):
+    inputs = [
+        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
+    ]
+    targets = [
+        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
+    ]
+  if sample_weights:
+    sample_weights = [
+        ops.convert_to_tensor(val, dtype=backend.floatx())
+        if val is not None else None for val in sample_weights
+    ]
+  outs, loss, loss_metrics = _model_loss(
+      model, inputs, targets, sample_weights=sample_weights, training=False)
+  if not isinstance(outs, list):
+    outs = [outs]
+  metrics_results = _eager_metrics_fn(model, outs, targets)
+  if not isinstance(loss, list):
+    loss = [loss]
+  return loss + loss_metrics + metrics_results
+
+
+def fit_loop(model,
+             inputs,
+             targets,
+             sample_weights=None,
+             class_weight=None,
+             val_inputs=None,
+             val_targets=None,
+             val_sample_weights=None,
+             batch_size=None,
+             epochs=1,
+             verbose=1,
+             callbacks=None,
+             shuffle=True,
+             callback_metrics=None,
+             initial_epoch=0,
+             steps_per_epoch=None,
+             validation_steps=None):
+  """Fit function for eager execution.
+
+  Arguments:
+      model: Instance of the model that is being executed in Eager mode.
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      sample_weights: Optional list of sample weight arrays.
+      class_weight: Optional class-weight array to weight the importance of
+          samples in `inputs` based on the class they belong to, as conveyed by
+          `targets`.
+      val_inputs: Input data for validation.
+      val_targets: Target data for validation.
+      val_sample_weights: Sample weight data for validation.
+      batch_size: Integer batch size or None if unknown.
+      epochs: Number of times to iterate over the data
+      verbose: Verbosity mode, 0, 1 or 2
+      callbacks: List of callbacks to be called during training
+      shuffle: Whether to shuffle the data at the beginning of each epoch
+      callback_metrics: List of strings, the display names of the metrics
+          passed to the callbacks. They should be the
+          concatenation of list the display names of the outputs of
+           `f` and the list of display names of the outputs of `f_val`.
+      initial_epoch: Epoch at which to start training
+          (useful for resuming a previous training run)
+      steps_per_epoch: Total number of steps (batches of samples)
+          before declaring one epoch finished and starting the
+          next epoch. Ignored with the default value of `None`.
+      validation_steps: Number of steps to run validation for (only if doing
+        validation from data tensors). Ignored with default value of `None`.
+
+  Returns:
+      `History` object.
+
+  Raises:
+    ValueError: In case of invalid argument values.
+  """
+  # Required for eager execution
+  with backend.learning_phase_scope(1):
+    do_validation = False
+    if val_inputs:
+      do_validation = True
+      if (steps_per_epoch is None and verbose and inputs and
+          hasattr(inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')):
+        print('Train on %d samples, validate on %d samples' %
+              (inputs[0].shape[0], val_inputs[0].shape[0]))
+
+    num_train_samples = None
+    out_labels = None
+    if steps_per_epoch is None or model._is_compiled:
+      out_labels = model.metrics_names
+      if do_validation:
+        callback_metrics = copy.copy(out_labels) + [
+            'val_' + n for n in out_labels
+        ]
+      else:
+        callback_metrics = copy.copy(out_labels)
+
+    if steps_per_epoch is None:
+      if sample_weights:
+        feed_data = inputs + targets + sample_weights
+      else:
+        feed_data = inputs + targets
+      num_train_samples = training_utils.check_num_samples(
+          feed_data,
+          batch_size=batch_size,
+          steps=steps_per_epoch,
+          steps_name='steps_per_epoch')
+
+      if num_train_samples is not None:
+        index_array = np.arange(num_train_samples)
+
+    model.history = cbks.History()
+    callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history]
+    if verbose:
+      if steps_per_epoch is not None:
+        count_mode = 'steps'
+      else:
+        count_mode = 'samples'
+      callbacks += [cbks.ProgbarLogger(count_mode)]
+    callbacks = cbks.CallbackList(callbacks)
+
+    # it's possible to callback a different model than self
+    # (used by Sequential models)
+    if hasattr(model, 'callback_model') and model.callback_model:
+      callback_model = model.callback_model
+    else:
+      callback_model = model
+
+    callbacks.set_model(callback_model)
+
+    callbacks.set_params({
+        'batch_size': batch_size,
+        'epochs': epochs,
+        'steps': steps_per_epoch,
+        'samples': num_train_samples,
+        'verbose': verbose,
+        'do_validation': do_validation,
+        'metrics': callback_metrics or [],
+    })
+    callbacks.on_train_begin()
+    callback_model.stop_training = False
+    for cbk in callbacks:
+      if not val_inputs:
+        cbk.validation_data = []
+      elif isinstance(val_inputs, iterator_ops.EagerIterator):
+        cbk.validation_data = val_inputs
+      elif val_sample_weights:
+        cbk.validation_data = val_inputs + val_targets + val_sample_weights
+      else:
+        cbk.validation_data = val_inputs + val_targets
+
+    for epoch in range(initial_epoch, epochs):
+      callbacks.on_epoch_begin(epoch)
+      epoch_logs = {}
+
+      if steps_per_epoch is not None:
+        iterator_fit_loop(
+            model,
+            inputs,
+            class_weight,
+            steps_per_epoch=steps_per_epoch,
+            callback_model=callback_model,
+            out_labels=out_labels,
+            epoch_logs=epoch_logs,
+            val_inputs=val_inputs,
+            val_targets=val_targets,
+            val_sample_weights=val_sample_weights,
+            epochs=epochs,
+            verbose=verbose,
+            callbacks=callbacks,
+            callback_metrics=callback_metrics,
+            validation_steps=validation_steps,
+            do_validation=do_validation)
+      else:
+        batch_fit_loop(
+            model,
+            inputs,
+            targets,
+            epoch_logs=epoch_logs,
+            index_array=index_array,
+            out_labels=out_labels,
+            callback_model=callback_model,
+            batch_size=batch_size,
+            sample_weights=sample_weights,
+            val_inputs=val_inputs,
+            val_targets=val_targets,
+            val_sample_weights=val_sample_weights,
+            callbacks=callbacks,
+            shuffle=shuffle,
+            num_train_samples=num_train_samples,
+            do_validation=do_validation)
+      callbacks.on_epoch_end(epoch, epoch_logs)
+      if callback_model.stop_training:
+        break
+  callbacks.on_train_end()
+  return model.history
+
+
+def test_loop(model, inputs, targets,
+              sample_weights=None,
+              batch_size=None,
+              verbose=0,
+              steps=None):
+  """Test function for eager execution.
+
+  Arguments:
+      model: Model instance that is being evaluated in Eager mode.
+      inputs: List of input arrays.
+      targets: List of target arrays.
+      sample_weights: Optional list of sample weight arrays.
+      batch_size: integer batch size or `None`.
+      verbose: verbosity mode.
+      steps: Total number of steps (batches of samples)
+          before declaring predictions finished.
+          Ignored with the default value of `None`.
+
+  Returns:
+      Scalar loss (if the model has a single output and no metrics)
+      or list of scalars (if the model has multiple outputs
+      and/or metrics). The attribute `model.metrics_names` will give you
+      the display labels for the scalar outputs.
+  """
+  with backend.learning_phase_scope(0):
+    if steps is not None:
+      return iterator_test_loop(model, inputs, steps, verbose=verbose)
+    else:
+      return batch_test_loop(
+          model,
+          inputs,
+          targets,
+          batch_size=batch_size,
+          sample_weights=sample_weights,
+          verbose=verbose)
+
+
+def predict_loop(model, inputs,
+                 batch_size=32,
+                 verbose=0,
+                 steps=None):
+  """Predict function for eager execution.
+
+  Arguments:
+      model: Instance of `Model`.
+      inputs: List of input arrays.
+      batch_size: integer batch size.
+      verbose: verbosity mode.
+      steps: Total number of steps (batches of samples)
+          before declaring `_predict_loop` finished.
+          Ignored with the default value of `None`.
+
+  Returns:
+      Array of predictions (if the model has a single output)
+      or list of arrays of predictions
+      (if the model has multiple outputs).
+  """
+  with backend.learning_phase_scope(0):
+    if steps is not None:
+      return iterator_predict_loop(model, inputs, steps, verbose=verbose)
+    else:
+      return batch_predict_loop(
+          model, inputs, batch_size=batch_size, verbose=verbose)
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
rename to tensorflow/python/keras/engine/training_eager_test.py
index 5adb3ef94086f6aa3ff299c20c5b924b8438916a..d9446fd4373e7403345a4b5a8a4e35faace941a2 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
@@ -94,7 +94,7 @@ class TrainingTest(test.TestCase):
         verbose=2)
     model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
 
-  # Test with validation split
+    # Test with validation split
     model.fit(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         epochs=2,
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
similarity index 92%
rename from tensorflow/python/keras/_impl/keras/engine/training_generator.py
rename to tensorflow/python/keras/engine/training_generator.py
index 58b5bc39c10ea06f680eb030e14ecd19a3888588..d81b384f0e1810614bd98e3861b4324f0f8a4dca 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -21,12 +21,12 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import callbacks as cbks
-from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer
-from tensorflow.python.keras._impl.keras.utils.data_utils import OrderedEnqueuer
-from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import callbacks as cbks
+from tensorflow.python.keras.utils.data_utils import GeneratorEnqueuer
+from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer
+from tensorflow.python.keras.utils.data_utils import Sequence
+from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -49,9 +49,6 @@ def fit_generator(model,
   epoch = initial_epoch
 
   do_validation = bool(validation_data)
-  model._make_train_function()
-  if do_validation:
-    model._make_test_function()
 
   is_sequence = isinstance(generator, Sequence)
   if not is_sequence and use_multiprocessing and workers > 1:
@@ -155,6 +152,8 @@ def fit_generator(model,
     # Construct epoch logs.
     epoch_logs = {}
     while epoch < epochs:
+      for m in model.stateful_metric_functions:
+        m.reset_states()
       callbacks.on_epoch_begin(epoch)
       steps_done = 0
       batch_index = 0
@@ -250,9 +249,18 @@ def evaluate_generator(model,
                        steps=None,
                        max_queue_size=10,
                        workers=1,
-                       use_multiprocessing=False):
+                       use_multiprocessing=False,
+                       verbose=0):
   """See docstring for `Model.evaluate_generator`."""
-  model._make_test_function()
+  stateful_metric_indices = []
+  if hasattr(model, 'metrics'):
+    for m in model.stateful_metric_functions:
+      m.reset_states()
+    stateful_metric_indices = [
+        i for i, name in enumerate(model.metrics_names)
+        if str(name) in model.stateful_metric_names]
+  else:
+    stateful_metric_indices = []
 
   steps_done = 0
   wait_time = 0.01
@@ -293,6 +301,9 @@ def evaluate_generator(model,
       else:
         output_generator = generator
 
+    if verbose == 1:
+      progbar = Progbar(target=steps)
+
     while steps_done < steps:
       generator_output = next(output_generator)
       if not hasattr(generator_output, '__len__'):
@@ -323,6 +334,8 @@ def evaluate_generator(model,
 
       steps_done += 1
       batch_sizes.append(batch_size)
+      if verbose == 1:
+        progbar.update(steps_done)
 
   finally:
     if enqueuer is not None:
@@ -333,8 +346,11 @@ def evaluate_generator(model,
   else:
     averages = []
     for i in range(len(outs)):
-      averages.append(
-          np.average([out[i] for out in all_outs], weights=batch_sizes))
+      if i not in stateful_metric_indices:
+        averages.append(
+            np.average([out[i] for out in all_outs], weights=batch_sizes))
+      else:
+        averages.append(float(all_outs[-1][i]))
     return averages
 
 
@@ -346,8 +362,6 @@ def predict_generator(model,
                       use_multiprocessing=False,
                       verbose=0):
   """See docstring for `Model.predict_generator`."""
-  model._make_predict_function()
-
   steps_done = 0
   wait_time = 0.01
   all_outs = []
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
similarity index 95%
rename from tensorflow/python/keras/_impl/keras/engine/training_test.py
rename to tensorflow/python/keras/engine/training_test.py
index 58011a141268e588e6b746afca982f36f4d4d176..7dec0bbf8a681215d9a23db8f3419cc315a4b39c 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -23,13 +23,14 @@ import unittest
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
-from tensorflow.python.keras._impl.keras.engine.training_utils import weighted_masked_objective
-from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.engine.training_utils import weighted_masked_objective
+from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
@@ -946,6 +947,7 @@ class TestGeneratorMethods(test.TestCase):
                                  steps=5,
                                  max_queue_size=10,
                                  workers=2,
+                                 verbose=1,
                                  use_multiprocessing=True)
         model.evaluate_generator(custom_generator(),
                                  steps=5,
@@ -1340,16 +1342,12 @@ class TestTrainingWithDataTensors(test.TestCase):
                                  output_a_np)
 
       # test fit
-      out = model.fit(None,
-                      output_a_np, epochs=1, batch_size=10)
-      out = model.fit(None,
-                      output_a_np, epochs=1, batch_size=10)
+      _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=3)
+      _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=3)
 
       # test evaluate
-      out = model.evaluate(None,
-                           output_a_np, batch_size=10)
-      out = model.evaluate(None,
-                           output_a_np, batch_size=10)
+      _ = model.evaluate(None, output_a_np, steps=3)
+      _ = model.evaluate(None, output_a_np, steps=3)
 
       # test predict
       out = model.predict(None, steps=3)
@@ -1383,16 +1381,12 @@ class TestTrainingWithDataTensors(test.TestCase):
                                  output_a_np)
 
       # test fit
-      out = model.fit(None,
-                      output_a_np, epochs=1, batch_size=10)
-      out = model.fit(None,
-                      output_a_np, epochs=1, batch_size=10)
+      _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=10)
+      _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=10)
 
       # test evaluate
-      out = model.evaluate(None,
-                           output_a_np, batch_size=10)
-      out = model.evaluate(None,
-                           output_a_np, batch_size=10)
+      _ = model.evaluate(None, output_a_np, steps=10)
+      _ = model.evaluate(None, output_a_np, steps=10)
 
       # test predict
       out = model.predict(None, steps=3)
@@ -1715,40 +1709,56 @@ class TestTrainingWithDataTensors(test.TestCase):
 
 class TestTrainingWithDatasetIterators(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_training_and_eval_methods_on_iterators_single_io(self):
     with self.test_session():
       x = keras.layers.Input(shape=(3,), name='input')
       y = keras.layers.Dense(4, name='dense')(x)
       model = keras.Model(x, y)
 
-      optimizer = 'rmsprop'
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
       metrics = ['mae']
       model.compile(optimizer, loss, metrics=metrics)
 
-      inputs = np.zeros((10, 3))
-      targets = np.zeros((10, 4))
+      inputs = np.zeros((10, 3), dtype=np.float32)
+      targets = np.zeros((10, 4), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
       dataset = dataset.batch(10)
       iterator = dataset.make_one_shot_iterator()
 
-      model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=0)
-      model.evaluate(iterator, steps=2, verbose=0)
+      model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+      model.evaluate(iterator, steps=2, verbose=1)
       model.predict(iterator, steps=2)
       model.train_on_batch(iterator)
       model.test_on_batch(iterator)
+      model.predict_on_batch(iterator)
+
       # Test with validation data
       model.fit(iterator,
                 epochs=1, steps_per_epoch=2, verbose=0,
                 validation_data=iterator, validation_steps=2)
       # Test with validation split
-      with self.assertRaisesRegexp(ValueError,
-                                   'you cannot use `validation_split`'):
+      with self.assertRaisesRegexp(
+          ValueError, '`validation_split` argument is not supported '
+          'when input `x` is a dataset iterator'):
         model.fit(iterator,
                   epochs=1, steps_per_epoch=2, verbose=0,
                   validation_split=0.5, validation_steps=2)
 
+      # Test with sample weight.
+      sample_weight = np.random.random((10,))
+      with self.assertRaisesRegexp(
+          ValueError, '`sample_weight` argument is not supported '
+          'when input `x` is a dataset iterator'):
+        model.fit(
+            iterator,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            sample_weight=sample_weight)
+
       # Test invalid usage
       with self.assertRaisesRegexp(ValueError,
                                    'Instead, pass an `Iterator`'):
@@ -1759,19 +1769,54 @@ class TestTrainingWithDatasetIterators(test.TestCase):
         model.fit(iterator, iterator,
                   epochs=1, steps_per_epoch=2, verbose=0)
 
+      with self.assertRaisesRegexp(
+          ValueError, 'you should specify the `steps_per_epoch` argument'):
+        model.fit(iterator, epochs=1, verbose=0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'you should specify the `steps` argument'):
+        model.evaluate(iterator, verbose=0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'you should specify the `steps` argument'):
+        model.predict(iterator, verbose=0)
+
+  def test_get_next_op_created_once(self):
+    with self.test_session():
+      x = keras.layers.Input(shape=(3,), name='input')
+      y = keras.layers.Dense(4, name='dense')(x)
+      model = keras.Model(x, y)
+
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3), dtype=np.float32)
+      targets = np.zeros((10, 4), dtype=np.float32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+      iterator = dataset.make_one_shot_iterator()
+
+      model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+      # Finalize graph to make sure we are not appending another iterator
+      # get_next op in the graph.
+      ops.get_default_graph().finalize()
+      model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_iterators_running_out_of_data(self):
     with self.test_session():
       x = keras.layers.Input(shape=(3,), name='input')
       y = keras.layers.Dense(4, name='dense')(x)
       model = keras.Model(x, y)
 
-      optimizer = 'rmsprop'
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
       metrics = ['mae']
       model.compile(optimizer, loss, metrics=metrics)
 
-      inputs = np.zeros((10, 3))
-      targets = np.zeros((10, 4))
+      inputs = np.zeros((10, 3), dtype=np.float32)
+      targets = np.zeros((10, 4), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(2)
       dataset = dataset.batch(10)
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
similarity index 86%
rename from tensorflow/python/keras/_impl/keras/engine/training_utils.py
rename to tensorflow/python/keras/engine/training_utils.py
index 662938f421b3a338d6720b2911347664711c1ac1..7d214d61a4d04a2083119fab3db84d3a9b151df5 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -22,11 +22,12 @@ import copy
 
 import numpy as np
 
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import losses
-from tensorflow.python.keras._impl.keras import metrics as metrics_module
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import losses
+from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.ops import math_ops
 
 
@@ -65,14 +66,7 @@ def check_num_samples(ins,
   if steps is not None and batch_size is not None:
     raise ValueError(
         'If ' + steps_name + ' is set, the `batch_size` must be None.')
-
-  if not ins or has_symbolic_tensors(ins):
-    if steps is None:
-      raise ValueError('If your data is in the form of symbolic tensors, '
-                       'you should specify the `' + steps_name + '` argument '
-                       '(instead of the `batch_size` argument, '
-                       'because symbolic tensors are expected to produce '
-                       'batches of input data).')
+  if check_steps_argument(ins, steps, steps_name):
     return None
   if hasattr(ins[0], 'shape'):
     return int(ins[0].shape[0])
@@ -551,8 +545,11 @@ def standardize_weights(y,
 
 
 def has_symbolic_tensors(ls):
-  return (any(tensor_util.is_tensor(v) for v in ls)
-          and not context.executing_eagerly())
+  if context.executing_eagerly():
+    return False
+  if isinstance(ls, (list, tuple)):
+    return any(tensor_util.is_tensor(v) for v in ls)
+  return tensor_util.is_tensor(ls)
 
 
 def populate_metric_names(model):
@@ -614,3 +611,77 @@ def add_metric_name(model, metric_name, index):
     metric_name = '%s_%d' % (base_metric_name, j)
     j += 1
   model.metrics_names.append(metric_name)
+
+
+def validate_iterator_input(x, y, sample_weight, validation_split=None):
+  """Validates user input arguments when a dataset iterator is passed.
+
+  Arguments:
+    x: Input data. A `tf.data` dataset iterator.
+    y: Target data. It could be either Numpy array(s) or TensorFlow tensor(s).
+        Expected to be `None` when `x` is a dataset iterator.
+    sample_weight: An optional sample-weight array passed by the user to
+        weight the importance of each sample in `x`. Expected to be `None` when
+        `x` is a dataset iterator
+    validation_split: Float between 0 and 1. Fraction of the training data to
+        be used as validation data. Expected to be `None` when `x` is a dataset
+        iterator.
+
+  Raises:
+    ValueError: if argument `y` or `sample_weight` or `validation_split` are
+        provided by user.
+  """
+  if y is not None:
+    raise ValueError('You passed a dataset iterator (%s) as input `x` to '
+                     'your model. In that case, you should not specify '
+                     'a target (`y`) argument, since the dataset iterator '
+                     'generates both input data and target data. '
+                     'Received: %s' % (x, y))
+  if sample_weight is not None:
+    raise ValueError('`sample_weight` argument is not supported when input'
+                     ' `x` is a dataset iterator. '
+                     'Received: x=%s, sample_weight=%s' % (x, sample_weight))
+  if validation_split is not None and validation_split != 0.0:
+    raise ValueError(
+        '`validation_split` argument is not supported when '
+        'input `x` is a dataset iterator. '
+        'Received: x=%s, validation_split=%f' % (x, validation_split))
+
+
+def check_steps_argument(input_data, steps, steps_name):
+  """Validates `steps` argument based on input data's type.
+
+  The cases when `steps` value must be provided are when
+    1. input data passed is an iterator.
+    2. model was built on top of symbolic tensors, input data is not
+       required and is `None`.
+    3. input data passed is a symbolic tensor.
+
+  Arguments:
+      input_data: Input data. Can be Numpy array(s) or TensorFlow tensor(s) or
+        tf.data.Dataset iterator or `None`.
+      steps: Integer or `None`. Total number of steps (batches of samples) to
+        execute.
+      steps_name: The public API's parameter name for `steps`.
+
+  Returns:
+    boolean, True if `steps` argument is required, else False.
+
+  Raises:
+      ValueError: if `steps` argument is required for given input data type
+        but not provided.
+  """
+
+  is_x_iterator = (
+      isinstance(input_data, iterator_ops.Iterator) or
+      isinstance(input_data, iterator_ops.EagerIterator))
+
+  if (input_data is None or is_x_iterator or has_symbolic_tensors(input_data) or
+      (isinstance(input_data, list) and not input_data)):
+    if steps is None:
+      input_type_str = 'iterators' if is_x_iterator else 'data tensors'
+      raise ValueError('When using {input_type} as input to a model, you should'
+                       ' specify the `{steps_name}` argument.'.format(
+                           input_type=input_type_str, steps_name=steps_name))
+    return True
+  return False
diff --git a/tensorflow/python/keras/estimator/__init__.py b/tensorflow/python/keras/estimator/__init__.py
deleted file mode 100644
index 6f931f415810999a042b2c2060bf6494209677ed..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/estimator/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras estimator API."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.estimator import model_to_estimator
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/initializers.py b/tensorflow/python/keras/initializers.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/initializers.py
rename to tensorflow/python/keras/initializers.py
index ecb71d00e2c78ced6095aaa3a0180b454b04917a..b9b2e9ad598fabe8cbfbbcbd57d4d71ddf630df7 100644
--- a/tensorflow/python/keras/_impl/keras/initializers.py
+++ b/tensorflow/python/keras/initializers.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import six
 
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops.init_ops import Constant
 from tensorflow.python.ops.init_ops import Identity
 from tensorflow.python.ops.init_ops import Initializer  # pylint: disable=unused-import
diff --git a/tensorflow/python/keras/initializers/__init__.py b/tensorflow/python/keras/initializers/__init__.py
deleted file mode 100644
index 6b1fcfd2d9585d19ae3fd9705e128b19b1ec40e7..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/initializers/__init__.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras built-in initializers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# Initializer functions / callable classes.
-from tensorflow.python.keras._impl.keras.initializers import Constant
-from tensorflow.python.keras._impl.keras.initializers import Identity
-from tensorflow.python.keras._impl.keras.initializers import Initializer
-from tensorflow.python.keras._impl.keras.initializers import Ones
-from tensorflow.python.keras._impl.keras.initializers import Orthogonal
-from tensorflow.python.keras._impl.keras.initializers import RandomNormal
-from tensorflow.python.keras._impl.keras.initializers import RandomUniform
-from tensorflow.python.keras._impl.keras.initializers import TruncatedNormal
-from tensorflow.python.keras._impl.keras.initializers import VarianceScaling
-from tensorflow.python.keras._impl.keras.initializers import Zeros
-
-# Functional interface.
-# pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.initializers import glorot_normal
-from tensorflow.python.keras._impl.keras.initializers import glorot_uniform
-from tensorflow.python.keras._impl.keras.initializers import he_normal
-from tensorflow.python.keras._impl.keras.initializers import he_uniform
-from tensorflow.python.keras._impl.keras.initializers import lecun_normal
-from tensorflow.python.keras._impl.keras.initializers import lecun_uniform
-
-# Auxiliary utils.
-from tensorflow.python.keras._impl.keras.initializers import deserialize
-from tensorflow.python.keras._impl.keras.initializers import serialize
-from tensorflow.python.keras._impl.keras.initializers import get
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/initializers_test.py
rename to tensorflow/python/keras/initializers_test.py
index 7b4e6b4d5b115bc788469bf1afe2a43f8dd86f04..a54d6da83907b71ce5f7fd6070598545731b7428 100644
--- a/tensorflow/python/keras/_impl/keras/initializers_test.py
+++ b/tensorflow/python/keras/initializers_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.ops import init_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/python/keras/_impl/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/integration_test.py
rename to tensorflow/python/keras/integration_test.py
index 43aff67ef93c8ec495beafdd17c5557b6398671f..2e83544d97e66c3ab7efa41fb31d11752eed29b8 100644
--- a/tensorflow/python/keras/_impl/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python import keras
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.layers import core as tf_core_layers
 from tensorflow.python.ops import nn
 from tensorflow.python.platform import test
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index c7be8b918c11235b7316e125cd7a9796851ad083..8fb663a17e16f9a16c67393327347f6cc463a5b6 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -20,141 +20,147 @@ from __future__ import print_function
 
 # Generic layers.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.engine import Input
-from tensorflow.python.keras._impl.keras.engine import InputLayer
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras.engine import Input
+from tensorflow.python.keras.engine import InputLayer
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine import Layer
 
 # Advanced activations.
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import LeakyReLU
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import PReLU
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import ELU
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import ThresholdedReLU
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import Softmax
+from tensorflow.python.keras.layers.advanced_activations import LeakyReLU
+from tensorflow.python.keras.layers.advanced_activations import PReLU
+from tensorflow.python.keras.layers.advanced_activations import ELU
+from tensorflow.python.keras.layers.advanced_activations import ThresholdedReLU
+from tensorflow.python.keras.layers.advanced_activations import Softmax
 
 # Convolution layers.
-from tensorflow.python.keras._impl.keras.layers.convolutional import Conv1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2DTranspose
-from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3DTranspose
-from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConv1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConv2D
+from tensorflow.python.keras.layers.convolutional import Conv1D
+from tensorflow.python.keras.layers.convolutional import Conv2D
+from tensorflow.python.keras.layers.convolutional import Conv3D
+from tensorflow.python.keras.layers.convolutional import Conv2DTranspose
+from tensorflow.python.keras.layers.convolutional import Conv3DTranspose
+from tensorflow.python.keras.layers.convolutional import SeparableConv1D
+from tensorflow.python.keras.layers.convolutional import SeparableConv2D
 
 # Convolution layer aliases.
-from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution2D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution2DTranspose
-from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3DTranspose
-from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution2D
-from tensorflow.python.keras._impl.keras.layers.convolutional import DepthwiseConv2D
+from tensorflow.python.keras.layers.convolutional import Convolution1D
+from tensorflow.python.keras.layers.convolutional import Convolution2D
+from tensorflow.python.keras.layers.convolutional import Convolution3D
+from tensorflow.python.keras.layers.convolutional import Convolution2DTranspose
+from tensorflow.python.keras.layers.convolutional import Convolution3DTranspose
+from tensorflow.python.keras.layers.convolutional import SeparableConvolution1D
+from tensorflow.python.keras.layers.convolutional import SeparableConvolution2D
+from tensorflow.python.keras.layers.convolutional import DepthwiseConv2D
 
 # Image processing layers.
-from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling2D
-from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling3D
-from tensorflow.python.keras._impl.keras.layers.convolutional import ZeroPadding1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import ZeroPadding2D
-from tensorflow.python.keras._impl.keras.layers.convolutional import ZeroPadding3D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping1D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping2D
-from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping3D
+from tensorflow.python.keras.layers.convolutional import UpSampling1D
+from tensorflow.python.keras.layers.convolutional import UpSampling2D
+from tensorflow.python.keras.layers.convolutional import UpSampling3D
+from tensorflow.python.keras.layers.convolutional import ZeroPadding1D
+from tensorflow.python.keras.layers.convolutional import ZeroPadding2D
+from tensorflow.python.keras.layers.convolutional import ZeroPadding3D
+from tensorflow.python.keras.layers.convolutional import Cropping1D
+from tensorflow.python.keras.layers.convolutional import Cropping2D
+from tensorflow.python.keras.layers.convolutional import Cropping3D
 
 # Core layers.
-from tensorflow.python.keras._impl.keras.layers.core import Masking
-from tensorflow.python.keras._impl.keras.layers.core import Dropout
-from tensorflow.python.keras._impl.keras.layers.core import SpatialDropout1D
-from tensorflow.python.keras._impl.keras.layers.core import SpatialDropout2D
-from tensorflow.python.keras._impl.keras.layers.core import SpatialDropout3D
-from tensorflow.python.keras._impl.keras.layers.core import Activation
-from tensorflow.python.keras._impl.keras.layers.core import Reshape
-from tensorflow.python.keras._impl.keras.layers.core import Permute
-from tensorflow.python.keras._impl.keras.layers.core import Flatten
-from tensorflow.python.keras._impl.keras.layers.core import RepeatVector
-from tensorflow.python.keras._impl.keras.layers.core import Lambda
-from tensorflow.python.keras._impl.keras.layers.core import Dense
-from tensorflow.python.keras._impl.keras.layers.core import ActivityRegularization
+from tensorflow.python.keras.layers.core import Masking
+from tensorflow.python.keras.layers.core import Dropout
+from tensorflow.python.keras.layers.core import SpatialDropout1D
+from tensorflow.python.keras.layers.core import SpatialDropout2D
+from tensorflow.python.keras.layers.core import SpatialDropout3D
+from tensorflow.python.keras.layers.core import Activation
+from tensorflow.python.keras.layers.core import Reshape
+from tensorflow.python.keras.layers.core import Permute
+from tensorflow.python.keras.layers.core import Flatten
+from tensorflow.python.keras.layers.core import RepeatVector
+from tensorflow.python.keras.layers.core import Lambda
+from tensorflow.python.keras.layers.core import Dense
+from tensorflow.python.keras.layers.core import ActivityRegularization
 
 # Embedding layers.
-from tensorflow.python.keras._impl.keras.layers.embeddings import Embedding
+from tensorflow.python.keras.layers.embeddings import Embedding
 
 # Locally-connected layers.
-from tensorflow.python.keras._impl.keras.layers.local import LocallyConnected1D
-from tensorflow.python.keras._impl.keras.layers.local import LocallyConnected2D
+from tensorflow.python.keras.layers.local import LocallyConnected1D
+from tensorflow.python.keras.layers.local import LocallyConnected2D
 
 # Merge layers.
-from tensorflow.python.keras._impl.keras.layers.merge import Add
-from tensorflow.python.keras._impl.keras.layers.merge import Multiply
-from tensorflow.python.keras._impl.keras.layers.merge import Average
-from tensorflow.python.keras._impl.keras.layers.merge import Maximum
-from tensorflow.python.keras._impl.keras.layers.merge import Concatenate
-from tensorflow.python.keras._impl.keras.layers.merge import Dot
-from tensorflow.python.keras._impl.keras.layers.merge import add
-from tensorflow.python.keras._impl.keras.layers.merge import multiply
-from tensorflow.python.keras._impl.keras.layers.merge import average
-from tensorflow.python.keras._impl.keras.layers.merge import maximum
-from tensorflow.python.keras._impl.keras.layers.merge import concatenate
-from tensorflow.python.keras._impl.keras.layers.merge import dot
+from tensorflow.python.keras.layers.merge import Add
+from tensorflow.python.keras.layers.merge import Multiply
+from tensorflow.python.keras.layers.merge import Average
+from tensorflow.python.keras.layers.merge import Maximum
+from tensorflow.python.keras.layers.merge import Concatenate
+from tensorflow.python.keras.layers.merge import Dot
+from tensorflow.python.keras.layers.merge import add
+from tensorflow.python.keras.layers.merge import subtract
+from tensorflow.python.keras.layers.merge import multiply
+from tensorflow.python.keras.layers.merge import average
+from tensorflow.python.keras.layers.merge import maximum
+from tensorflow.python.keras.layers.merge import minimum
+from tensorflow.python.keras.layers.merge import concatenate
+from tensorflow.python.keras.layers.merge import dot
 
 # Noise layers.
-from tensorflow.python.keras._impl.keras.layers.noise import AlphaDropout
-from tensorflow.python.keras._impl.keras.layers.noise import GaussianNoise
-from tensorflow.python.keras._impl.keras.layers.noise import GaussianDropout
+from tensorflow.python.keras.layers.noise import AlphaDropout
+from tensorflow.python.keras.layers.noise import GaussianNoise
+from tensorflow.python.keras.layers.noise import GaussianDropout
 
 # Normalization layers.
-from tensorflow.python.keras._impl.keras.layers.normalization import BatchNormalization
+from tensorflow.python.keras.layers.normalization import BatchNormalization
 
 # Pooling layers.
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling1D
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling2D
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D
-from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling1D
-from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling2D
-from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling3D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAveragePooling1D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAveragePooling2D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAveragePooling3D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPooling1D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPooling2D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPooling3D
+from tensorflow.python.keras.layers.pooling import MaxPooling1D
+from tensorflow.python.keras.layers.pooling import MaxPooling2D
+from tensorflow.python.keras.layers.pooling import MaxPooling3D
+from tensorflow.python.keras.layers.pooling import AveragePooling1D
+from tensorflow.python.keras.layers.pooling import AveragePooling2D
+from tensorflow.python.keras.layers.pooling import AveragePooling3D
+from tensorflow.python.keras.layers.pooling import GlobalAveragePooling1D
+from tensorflow.python.keras.layers.pooling import GlobalAveragePooling2D
+from tensorflow.python.keras.layers.pooling import GlobalAveragePooling3D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPooling1D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPooling2D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPooling3D
 
 # Pooling layer aliases.
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPool1D
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPool2D
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPool3D
-from tensorflow.python.keras._impl.keras.layers.pooling import AvgPool1D
-from tensorflow.python.keras._impl.keras.layers.pooling import AvgPool2D
-from tensorflow.python.keras._impl.keras.layers.pooling import AvgPool3D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAvgPool1D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAvgPool2D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAvgPool3D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool1D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool2D
-from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool3D
+from tensorflow.python.keras.layers.pooling import MaxPool1D
+from tensorflow.python.keras.layers.pooling import MaxPool2D
+from tensorflow.python.keras.layers.pooling import MaxPool3D
+from tensorflow.python.keras.layers.pooling import AvgPool1D
+from tensorflow.python.keras.layers.pooling import AvgPool2D
+from tensorflow.python.keras.layers.pooling import AvgPool3D
+from tensorflow.python.keras.layers.pooling import GlobalAvgPool1D
+from tensorflow.python.keras.layers.pooling import GlobalAvgPool2D
+from tensorflow.python.keras.layers.pooling import GlobalAvgPool3D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPool1D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPool2D
+from tensorflow.python.keras.layers.pooling import GlobalMaxPool3D
 
 # Recurrent layers.
-from tensorflow.python.keras._impl.keras.layers.recurrent import RNN
-from tensorflow.python.keras._impl.keras.layers.recurrent import StackedRNNCells
-from tensorflow.python.keras._impl.keras.layers.recurrent import SimpleRNNCell
-from tensorflow.python.keras._impl.keras.layers.recurrent import GRUCell
-from tensorflow.python.keras._impl.keras.layers.recurrent import LSTMCell
-from tensorflow.python.keras._impl.keras.layers.recurrent import SimpleRNN
-from tensorflow.python.keras._impl.keras.layers.recurrent import GRU
-from tensorflow.python.keras._impl.keras.layers.recurrent import LSTM
+from tensorflow.python.keras.layers.recurrent import RNN
+from tensorflow.python.keras.layers.recurrent import StackedRNNCells
+from tensorflow.python.keras.layers.recurrent import SimpleRNNCell
+from tensorflow.python.keras.layers.recurrent import GRUCell
+from tensorflow.python.keras.layers.recurrent import LSTMCell
+from tensorflow.python.keras.layers.recurrent import SimpleRNN
+from tensorflow.python.keras.layers.recurrent import GRU
+from tensorflow.python.keras.layers.recurrent import LSTM
 
 # Convolutional-recurrent layers.
-from tensorflow.python.keras._impl.keras.layers.convolutional_recurrent import ConvLSTM2D
+from tensorflow.python.keras.layers.convolutional_recurrent import ConvLSTM2D
 
 # CuDNN recurrent layers.
-from tensorflow.python.keras._impl.keras.layers.cudnn_recurrent import CuDNNLSTM
-from tensorflow.python.keras._impl.keras.layers.cudnn_recurrent import CuDNNGRU
+from tensorflow.python.keras.layers.cudnn_recurrent import CuDNNLSTM
+from tensorflow.python.keras.layers.cudnn_recurrent import CuDNNGRU
 
 # Wrapper functions
-from tensorflow.python.keras._impl.keras.layers.wrappers import Wrapper
-from tensorflow.python.keras._impl.keras.layers.wrappers import Bidirectional
-from tensorflow.python.keras._impl.keras.layers.wrappers import TimeDistributed
+from tensorflow.python.keras.layers.wrappers import Wrapper
+from tensorflow.python.keras.layers.wrappers import Bidirectional
+from tensorflow.python.keras.layers.wrappers import TimeDistributed
+
+# Serialization functions
+from tensorflow.python.keras.layers.serialization import deserialize
+from tensorflow.python.keras.layers.serialization import serialize
 
 del absolute_import
 del division
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py
similarity index 94%
rename from tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
rename to tensorflow/python/keras/layers/advanced_activations.py
index 89931db3c0786b6869379e0d140e8a19e5e46d5f..8ade3c317456a88181f6005c620953817463595b 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/layers/advanced_activations.py
@@ -18,14 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras import activations
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras import activations
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine import Layer
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/layers/advanced_activations_test.py
similarity index 95%
rename from tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py
rename to tensorflow/python/keras/layers/advanced_activations_test.py
index 343b7949accf3f0c9ddc5245910aa5faad8335c6..81c76db14cd3741687bf5e2bec66e5354e9f6312 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/layers/advanced_activations_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python import keras
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/layers/convolutional.py
rename to tensorflow/python/keras/layers/convolutional.py
index 9971f12773233cf85217913f490dbde836b0b3cb..ce1c84e98d04c84aad7aa381b2536facfae2322d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/layers/convolutional.py
@@ -21,24 +21,24 @@ from __future__ import print_function
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import activations
-from tensorflow.python.keras._impl.keras import backend
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras import activations
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine import Layer
 # imports for backwards namespace compatibility
 # pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling1D
-from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling2D
-from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling3D
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling1D
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling2D
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D
+from tensorflow.python.keras.layers.pooling import AveragePooling1D
+from tensorflow.python.keras.layers.pooling import AveragePooling2D
+from tensorflow.python.keras.layers.pooling import AveragePooling3D
+from tensorflow.python.keras.layers.pooling import MaxPooling1D
+from tensorflow.python.keras.layers.pooling import MaxPooling2D
+from tensorflow.python.keras.layers.pooling import MaxPooling3D
 # pylint: enable=unused-import
-from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras.utils import conv_utils
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
@@ -1467,10 +1467,14 @@ class SeparableConv2D(SeparableConv):
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
+      dilation_rate: An integer or tuple/list of 2 integers, specifying
+          the dilation rate to use for dilated convolution.
+          Currently, specifying any `dilation_rate` value != 1 is
+          incompatible with specifying any `strides` value != 1.
       depth_multiplier: The number of depthwise convolution output channels
           for each input channel.
           The total number of depthwise convolution output
-          channels will be equal to `filterss_in * depth_multiplier`.
+          channels will be equal to `filters_in * depth_multiplier`.
       activation: Activation function to use.
           If you don't specify anything, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
@@ -1511,7 +1515,7 @@ class SeparableConv2D(SeparableConv):
                strides=(1, 1),
                padding='valid',
                data_format=None,
-               dilation_rate=1,
+               dilation_rate=(1, 1),
                depth_multiplier=1,
                activation=None,
                use_bias=True,
@@ -2095,14 +2099,14 @@ class ZeroPadding3D(Layer):
   """Zero-padding layer for 3D data (spatial or spatio-temporal).
 
   Arguments:
-      padding: int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints.
+      padding: int, or tuple of 3 ints, or tuple of 3 tuples of 2 ints.
           - If int: the same symmetric padding
               is applied to width and height.
-          - If tuple of 2 ints:
+          - If tuple of 3 ints:
               interpreted as two different
               symmetric padding values for height and width:
               `(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad)`.
-          - If tuple of 2 tuples of 2 ints:
+          - If tuple of 3 tuples of 2 ints:
               interpreted as
               `((left_dim1_pad, right_dim1_pad), (left_dim2_pad,
                 right_dim2_pad), (left_dim3_pad, right_dim3_pad))`
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
rename to tensorflow/python/keras/layers/convolutional_recurrent.py
index be25bbc043a3beb14b3afbfdd342aace2403e5fd..c731508b3c32d93895432fd5174c1f57557b10dc 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/layers/convolutional_recurrent.py
@@ -21,18 +21,19 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras import activations
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.layers.recurrent import _generate_dropout_mask
-from tensorflow.python.keras._impl.keras.layers.recurrent import RNN
-from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras import activations
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine import Layer
+from tensorflow.python.keras.layers.recurrent import _generate_dropout_mask
+from tensorflow.python.keras.layers.recurrent import _standardize_args
+from tensorflow.python.keras.layers.recurrent import RNN
+from tensorflow.python.keras.utils import conv_utils
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -167,6 +168,7 @@ class ConvRNN2D(RNN):
                                     **kwargs)
     self.input_spec = [InputSpec(ndim=5)]
     self.states = None
+    self._num_constants = None
 
   @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
@@ -214,7 +216,7 @@ class ConvRNN2D(RNN):
     # Note input_shape will be list of shapes of initial states and
     # constants if these are passed in __call__.
     if self._num_constants is not None:
-      constants_shape = input_shape[-self._num_constants:]
+      constants_shape = input_shape[-self._num_constants:]  # pylint: disable=E1130
     else:
       constants_shape = None
 
@@ -279,8 +281,8 @@ class ConvRNN2D(RNN):
       return [initial_state]
 
   def __call__(self, inputs, initial_state=None, constants=None, **kwargs):
-    inputs, initial_state, constants = self._standardize_args(
-        inputs, initial_state, constants)
+    inputs, initial_state, constants = _standardize_args(
+        inputs, initial_state, constants, self._num_constants)
 
     if initial_state is None and constants is None:
       return super(ConvRNN2D, self).__call__(inputs, **kwargs)
@@ -609,16 +611,25 @@ class ConvLSTM2DCell(Layer):
         name='recurrent_kernel',
         regularizer=self.recurrent_regularizer,
         constraint=self.recurrent_constraint)
+
     if self.use_bias:
-      self.bias = self.add_weight(shape=(self.filters * 4,),
-                                  initializer=self.bias_initializer,
-                                  name='bias',
-                                  regularizer=self.bias_regularizer,
-                                  constraint=self.bias_constraint)
       if self.unit_forget_bias:
-        bias_value = np.zeros((self.filters * 4,))
-        bias_value[self.filters: self.filters * 2] = 1.
-        K.set_value(self.bias, bias_value)
+
+        def bias_initializer(_, *args, **kwargs):
+          return K.concatenate([
+              self.bias_initializer((self.filters,), *args, **kwargs),
+              initializers.Ones()((self.filters,), *args, **kwargs),
+              self.bias_initializer((self.filters * 2,), *args, **kwargs),
+          ])
+      else:
+        bias_initializer = self.bias_initializer
+      self.bias = self.add_weight(
+          shape=(self.filters * 4,),
+          name='bias',
+          initializer=bias_initializer,
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+
     else:
       self.bias = None
 
@@ -844,10 +855,10 @@ class ConvLSTM2D(ConvRNN2D):
   Input shape:
     - if data_format='channels_first'
         5D tensor with shape:
-        `(samples,time, channels, rows, cols)`
+        `(samples, time, channels, rows, cols)`
     - if data_format='channels_last'
         5D tensor with shape:
-        `(samples,time, rows, cols, channels)`
+        `(samples, time, rows, cols, channels)`
 
   Output shape:
     - if `return_sequences`
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py b/tensorflow/python/keras/layers/convolutional_recurrent_test.py
similarity index 90%
rename from tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
rename to tensorflow/python/keras/layers/convolutional_recurrent_test.py
index 9e768b4e9552d126eac9c586d19810634fac9013..4b8f6f2a14e490c976d23463283bc4b81333ff92 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
+++ b/tensorflow/python/keras/layers/convolutional_recurrent_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python import keras
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
@@ -180,6 +180,23 @@ class ConvLSTMTest(test.TestCase):
                   'recurrent_dropout': 0.1},
           input_shape=(1, 2, 5, 5, 2))
 
+  def test_conv_lstm_cloning(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.ConvLSTM2D(5, 3, input_shape=(None, 5, 5, 3)))
+
+      test_inputs = np.random.random((2, 4, 5, 5, 3))
+      reference_outputs = model.predict(test_inputs)
+      weights = model.get_weights()
+
+    # Use a new graph to clone the model
+    with self.test_session():
+      clone = keras.models.clone_model(model)
+      clone.set_weights(weights)
+
+      outputs = clone.predict(test_inputs)
+      self.assertAllClose(reference_outputs, outputs, atol=1e-5)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
rename to tensorflow/python/keras/layers/convolutional_test.py
index 12b42676759d499c910707cb1b78e788e3c443fd..167cabaeecb0c4ce9a785e7a990aa715f2d1a5b3 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/layers/convolutional_test.py
@@ -22,10 +22,10 @@ import copy
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
similarity index 95%
rename from tensorflow/python/keras/_impl/keras/layers/core.py
rename to tensorflow/python/keras/layers/core.py
index 9c4cb0f4fda681ce3236222460cd87439ea67810..df4c3915a3097d52553557208b074f6923341673 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -26,15 +26,16 @@ import numpy as np
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import activations
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras import activations
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine import Layer
+from tensorflow.python.keras.utils import conv_utils
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
@@ -501,6 +502,17 @@ class Permute(Layer):
 class Flatten(Layer):
   """Flattens the input. Does not affect the batch size.
 
+  Arguments:
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, ..., channels)` while `channels_first` corresponds to
+          inputs with shape `(batch, channels, ...)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+
   Example:
 
   ```python
@@ -515,11 +527,19 @@ class Flatten(Layer):
   ```
   """
 
-  def __init__(self, **kwargs):
+  def __init__(self, data_format=None, **kwargs):
     super(Flatten, self).__init__(**kwargs)
+    self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(min_ndim=2)
 
   def call(self, inputs):
+    if self.data_format == 'channels_first':
+      permutation = [0]
+      permutation.extend([i for i in
+                          range(2, K.ndim(inputs))])
+      permutation.append(1)
+      inputs = array_ops.transpose(inputs, perm=permutation)
+
     outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
     if not context.executing_eagerly():
       outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
@@ -534,6 +554,11 @@ class Flatten(Layer):
       output_shape += [None]
     return tensor_shape.TensorShape(output_shape)
 
+  def get_config(self):
+    config = {'data_format': self.data_format}
+    base_config = super(Flatten, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @tf_export('keras.layers.RepeatVector')
 class RepeatVector(Layer):
diff --git a/tensorflow/python/keras/_impl/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
similarity index 94%
rename from tensorflow/python/keras/_impl/keras/layers/core_test.py
rename to tensorflow/python/keras/layers/core_test.py
index d22d8d12dc4e76998c177dbe96fb87e3fffa5175..ff8af976b99376b037af81ed81707332ccf9937e 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -124,6 +124,16 @@ class CoreLayersTest(test.TestCase):
     testing_utils.layer_test(
         keras.layers.Flatten, kwargs={}, input_shape=(3, 2, 4))
 
+    # Test channels_first
+    inputs = np.random.random((10, 3, 5, 5)).astype('float32')
+    outputs = testing_utils.layer_test(
+        keras.layers.Flatten,
+        kwargs={'data_format': 'channels_first'},
+        input_data=inputs)
+    target_outputs = np.reshape(
+        np.transpose(inputs, (0, 2, 3, 1)), (-1, 5 * 5 * 3))
+    self.assertAllClose(outputs, target_outputs)
+
   @tf_test_util.run_in_graph_and_eager_modes()
   def test_repeat_vector(self):
     testing_utils.layer_test(
diff --git a/tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent.py
rename to tensorflow/python/keras/layers/cudnn_recurrent.py
index ffb90457a85bb801d766e144f45c044e0a7e3bb0..5c4a2dbe92e75296b21bfdb628621efa03f19e02 100644
--- a/tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent.py
@@ -20,12 +20,12 @@ from __future__ import print_function
 
 import collections
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.layers.recurrent import RNN
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.layers.recurrent import RNN
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import state_ops
diff --git a/tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
similarity index 91%
rename from tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent_test.py
rename to tensorflow/python/keras/layers/cudnn_recurrent_test.py
index a06943b10830571a9aa7367dd23a59cb332f4d84..9d186f8c586bd9f626e142a855be6d2cf00d7121 100644
--- a/tensorflow/python/keras/_impl/keras/layers/cudnn_recurrent_test.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
@@ -18,57 +18,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class CuDNNTest(test.TestCase, parameterized.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes()
-  def test_cudnn_rnn_timing(self):
-    if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
-        input_size = 10
-        timesteps = 6
-        units = 2
-        num_samples = 32
-
-        for rnn_type in ['lstm', 'gru']:
-          times = []
-          for use_cudnn in [True, False]:
-            start_time = time.time()
-            inputs = keras.layers.Input(shape=(None, input_size))
-            if use_cudnn:
-              if rnn_type == 'lstm':
-                layer = keras.layers.CuDNNLSTM(units)
-              else:
-                layer = keras.layers.CuDNNGRU(units)
-            else:
-              if rnn_type == 'lstm':
-                layer = keras.layers.LSTM(units)
-              else:
-                layer = keras.layers.GRU(units)
-            outputs = layer(inputs)
-
-            optimizer = RMSPropOptimizer(learning_rate=0.001)
-            model = keras.models.Model(inputs, outputs)
-            model.compile(optimizer, 'mse')
-
-            x = np.random.random((num_samples, timesteps, input_size))
-            y = np.random.random((num_samples, units))
-            model.fit(x, y, epochs=4, batch_size=32)
-
-            times.append(time.time() - start_time)
-          self.assertGreater(times[1], times[0])
-
   @test_util.run_in_graph_and_eager_modes()
   def test_cudnn_rnn_basics(self):
     if test.is_gpu_available(cuda_only=True):
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py
similarity index 94%
rename from tensorflow/python/keras/_impl/keras/layers/embeddings.py
rename to tensorflow/python/keras/layers/embeddings.py
index f7398845d400b1fd4fedd532eb1520dff30d47a0..25eeeee9529bcb52e608eeb9468c210eea8bd8be 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/layers/embeddings.py
@@ -18,12 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import Layer
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py b/tensorflow/python/keras/layers/embeddings_test.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
rename to tensorflow/python/keras/layers/embeddings_test.py
index 6ebf5dc94adb423abae7ec9e6910fb86439410f1..fff1c5ef9882f0c479d119ddb0bf68e919c016b4 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/layers/embeddings_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/gru_test.py b/tensorflow/python/keras/layers/gru_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/layers/gru_test.py
rename to tensorflow/python/keras/layers/gru_test.py
index 48e7e14f5ab73b534ab0d1c765ad2572b2930b2b..234434f7a0205c7dda80d308e4780cd761352d77 100644
--- a/tensorflow/python/keras/_impl/keras/layers/gru_test.py
+++ b/tensorflow/python/keras/layers/gru_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/local.py b/tensorflow/python/keras/layers/local.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/layers/local.py
rename to tensorflow/python/keras/layers/local.py
index caae820fb3a8eba76c3fbbca734908514b076982..46c18b763e80b58da1ec0c2655978753af75b4f8 100644
--- a/tensorflow/python/keras/_impl/keras/layers/local.py
+++ b/tensorflow/python/keras/layers/local.py
@@ -18,15 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras import activations
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras import activations
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine import Layer
+from tensorflow.python.keras.utils import conv_utils
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/local_test.py b/tensorflow/python/keras/layers/local_test.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/layers/local_test.py
rename to tensorflow/python/keras/layers/local_test.py
index 93741d24b9a74cf9e8a83069f7c4235b1f489818..90ae1719e171b19e1c3b95fef434bd53285c858c 100644
--- a/tensorflow/python/keras/_impl/keras/layers/local_test.py
+++ b/tensorflow/python/keras/layers/local_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py b/tensorflow/python/keras/layers/lstm_test.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/layers/lstm_test.py
rename to tensorflow/python/keras/layers/lstm_test.py
index 11a5e0aeaacfa7520361ae41ac3d40607e8a9050..87cb344bf82b73b6af9830a4428a5ba099135324 100644
--- a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/layers/lstm_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/layers/merge.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/layers/merge.py
rename to tensorflow/python/keras/layers/merge.py
index 2b6cf7c8a94ff40ea35e2bbfe13e6c26024857b4..683e3e0ed1ce9a1fc56dcae0c0c8841148f008d5 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/layers/merge.py
@@ -20,9 +20,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge_test.py b/tensorflow/python/keras/layers/merge_test.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/layers/merge_test.py
rename to tensorflow/python/keras/layers/merge_test.py
index b2fe06f93e33ed63d6a2aa29522ecb552f582440..8a097cf7f57d06155f26e3099554e34a54186189 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge_test.py
+++ b/tensorflow/python/keras/layers/merge_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/layers/noise.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/layers/noise.py
rename to tensorflow/python/keras/layers/noise.py
index addac5b137430d8f74efa126423cb39b15382502..a895caa25b91702d92002f84fe44b5b5c3a8ca0c 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise.py
+++ b/tensorflow/python/keras/layers/noise.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.engine import Layer
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise_test.py b/tensorflow/python/keras/layers/noise_test.py
similarity index 93%
rename from tensorflow/python/keras/_impl/keras/layers/noise_test.py
rename to tensorflow/python/keras/layers/noise_test.py
index af4f031ec95bb56b72c1f1018e0e529d8ff55564..bde2185f03bd45c1c9fecbd6fe5544a17e9c04ef 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise_test.py
+++ b/tensorflow/python/keras/layers/noise_test.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/layers/normalization.py
rename to tensorflow/python/keras/layers/normalization.py
index c16fc07fb4ecda66bd8bcc70dce5d753c73f5dd9..c0dc5220f1ea63930e787fee4d8ff95e2c4cb321 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -22,13 +22,13 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine import Layer
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
similarity index 74%
rename from tensorflow/python/keras/_impl/keras/layers/normalization_test.py
rename to tensorflow/python/keras/layers/normalization_test.py
index fa9277e3d1e5bb0b9633abc46a96a11816dddb2d..b22f3bd1529812f6b5f63efe5cf6b6133db97f07 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python import keras
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
@@ -168,6 +168,76 @@ class NormalizationLayersTest(test.TestCase):
       new_model.compile('sgd', 'mse')
       new_model.train_on_batch(x, x)
 
+  def test_that_trainable_disables_updates(self):
+    with self.test_session():
+      val_a = np.random.random((10, 4))
+      val_out = np.random.random((10, 4))
+
+      a = keras.layers.Input(shape=(4,))
+      layer = keras.layers.BatchNormalization(input_shape=(4,))
+      b = layer(a)
+      model = keras.models.Model(a, b)
+
+      model.trainable = False
+      assert not model.updates
+
+      model.compile('sgd', 'mse')
+      assert not model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
+      model.trainable = True
+      model.compile('sgd', 'mse')
+      assert model.updates
+
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      assert np.abs(np.sum(x1 - x2)) > 1e-5
+
+      layer.trainable = False
+      model.compile('sgd', 'mse')
+      assert not model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
+  def test_batchnorm_trainable(self):
+    """Tests that batchnorm layer is trainable when learning phase is enabled.
+
+    Computes mean and std for current inputs then
+    applies batch normalization using them.
+    """
+    with self.test_session():
+      bn_mean = 0.5
+      bn_std = 10.
+      val_a = np.expand_dims(np.arange(10.), axis=1)
+
+      def get_model(bn_mean, bn_std):
+        inp = keras.layers.Input(shape=(1,))
+        x = keras.layers.BatchNormalization()(inp)
+        model1 = keras.models.Model(inp, x)
+        model1.set_weights([
+            np.array([1.]),
+            np.array([0.]),
+            np.array([bn_mean]),
+            np.array([bn_std**2])
+        ])
+        return model1
+
+      # Simulates training-mode with trainable layer.
+      # Should use mini-batch statistics.
+      keras.backend.set_learning_phase(1)
+      model = get_model(bn_mean, bn_std)
+      model.compile(loss='mse', optimizer='rmsprop')
+      out = model.predict(val_a)
+      self.assertAllClose(
+          (val_a - np.mean(val_a)) / np.std(val_a), out, atol=1e-3)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling.py b/tensorflow/python/keras/layers/pooling.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/layers/pooling.py
rename to tensorflow/python/keras/layers/pooling.py
index 86bc8a680a529a9ea17592a42207fab58adeebce..10a82b285eff6f6b414e67441ceb88976ca2368f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling.py
+++ b/tensorflow/python/keras/layers/pooling.py
@@ -19,10 +19,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.keras import backend
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine import Layer
+from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling_test.py b/tensorflow/python/keras/layers/pooling_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/layers/pooling_test.py
rename to tensorflow/python/keras/layers/pooling_test.py
index 2c08b647ea0fafb7519240b0c81e8fa77f034f7f..cbd58a22879975b7dbaab8290f59cee573b272cd 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
+++ b/tensorflow/python/keras/layers/pooling_test.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/layers/recurrent.py
rename to tensorflow/python/keras/layers/recurrent.py
index caf9e6f46f51c77bcd37953821c54d4f3145fb5b..7e509fb45182653d938adfd679e204cc7ea1e900 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -24,15 +24,15 @@ import numpy as np
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import activations
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras import activations
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import constraints
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine import Layer
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -153,7 +153,7 @@ class StackedRNNCells(Layer):
 
   @classmethod
   def from_config(cls, config, custom_objects=None):
-    from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+    from tensorflow.python.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
     cells = []
     for cell_config in config.pop('cells'):
       cells.append(
@@ -519,9 +519,10 @@ class RNN(Layer):
       return [K.tile(initial_state, [1, self.cell.state_size])]
 
   def __call__(self, inputs, initial_state=None, constants=None, **kwargs):
-    inputs, initial_state, constants = self._standardize_args(
-        inputs, initial_state, constants)
-
+    inputs, initial_state, constants = _standardize_args(inputs,
+                                                         initial_state,
+                                                         constants,
+                                                         self._num_constants)
     if initial_state is None and constants is None:
       return super(RNN, self).__call__(inputs, **kwargs)
 
@@ -661,46 +662,6 @@ class RNN(Layer):
     else:
       return output
 
-  def _standardize_args(self, inputs, initial_state, constants):
-    """Standardize `__call__` to a single list of tensor inputs.
-
-    When running a model loaded from file, the input tensors
-    `initial_state` and `constants` can be passed to `RNN.__call__` as part
-    of `inputs` instead of by the dedicated keyword arguments. This method
-    makes sure the arguments are separated and that `initial_state` and
-    `constants` are lists of tensors (or None).
-
-    Arguments:
-        inputs: tensor or list/tuple of tensors
-        initial_state: tensor or list of tensors or None
-        constants: tensor or list of tensors or None
-
-    Returns:
-        inputs: tensor
-        initial_state: list of tensors or None
-        constants: list of tensors or None
-    """
-    if isinstance(inputs, list):
-      assert initial_state is None and constants is None
-      if self._num_constants is not None:
-        constants = inputs[-self._num_constants:]  # pylint: disable=invalid-unary-operand-type
-        inputs = inputs[:-self._num_constants]  # pylint: disable=invalid-unary-operand-type
-      if len(inputs) > 1:
-        initial_state = inputs[1:]
-      inputs = inputs[0]
-
-    def to_list_or_none(x):
-      if x is None or isinstance(x, list):
-        return x
-      if isinstance(x, tuple):
-        return list(x)
-      return [x]
-
-    initial_state = to_list_or_none(initial_state)
-    constants = to_list_or_none(constants)
-
-    return inputs, initial_state, constants
-
   def reset_states(self, states=None):
     if not self.stateful:
       raise AttributeError('Layer must be stateful.')
@@ -773,7 +734,7 @@ class RNN(Layer):
 
   @classmethod
   def from_config(cls, config, custom_objects=None):
-    from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+    from tensorflow.python.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
     cell = deserialize_layer(config.pop('cell'), custom_objects=custom_objects)
     num_constants = config.pop('num_constants', None)
     layer = cls(cell, **config)
@@ -914,13 +875,13 @@ class SimpleRNNCell(Layer):
     prev_output = states[0]
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
+          array_ops.ones_like(inputs),
           self.dropout,
           training=training)
     if (0 < self.recurrent_dropout < 1 and
         self._recurrent_dropout_mask is None):
       self._recurrent_dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs, self.units),
+          array_ops.ones_like(prev_output),
           self.recurrent_dropout,
           training=training)
 
@@ -1333,14 +1294,14 @@ class GRUCell(Layer):
 
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
+          array_ops.ones_like(inputs),
           self.dropout,
           training=training,
           count=3)
     if (0 < self.recurrent_dropout < 1 and
         self._recurrent_dropout_mask is None):
       self._recurrent_dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs, self.units),
+          array_ops.ones_like(h_tm1),
           self.recurrent_dropout,
           training=training,
           count=3)
@@ -1873,14 +1834,14 @@ class LSTMCell(Layer):
   def call(self, inputs, states, training=None):
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
+          array_ops.ones_like(inputs),
           self.dropout,
           training=training,
           count=4)
     if (0 < self.recurrent_dropout < 1 and
         self._recurrent_dropout_mask is None):
       self._recurrent_dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs, self.units),
+          array_ops.ones_like(states[0]),
           self.recurrent_dropout,
           training=training,
           count=4)
@@ -2254,12 +2215,7 @@ class LSTM(RNN):
     return cls(**config)
 
 
-def _generate_dropout_ones(inputs, dims):
-  return K.ones((array_ops.shape(inputs)[0], dims))
-
-
 def _generate_dropout_mask(ones, rate, training=None, count=1):
-
   def dropped_inputs():
     return K.dropout(ones, rate)
 
@@ -2605,3 +2561,47 @@ class Recurrent(Layer):
     }
     base_config = super(Recurrent, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+
+def _standardize_args(inputs, initial_state, constants, num_constants):
+  """Standardizes `__call__` to a single list of tensor inputs.
+
+  When running a model loaded from a file, the input tensors
+  `initial_state` and `constants` can be passed to `RNN.__call__()` as part
+  of `inputs` instead of by the dedicated keyword arguments. This method
+  makes sure the arguments are separated and that `initial_state` and
+  `constants` are lists of tensors (or None).
+
+  Arguments:
+      inputs: Tensor or list/tuple of tensors. which may include constants
+        and initial states. In that case `num_constant` must be specified.
+      initial_state: Tensor or list of tensors or None, initial states.
+      constants: Tensor or list of tensors or None, constant tensors.
+      num_constants: Expected number of constants (if constants are passed as
+        part of the `inputs` list.
+
+  Returns:
+      inputs: Single tensor.
+      initial_state: List of tensors or None.
+      constants: List of tensors or None.
+  """
+  if isinstance(inputs, list):
+    assert initial_state is None and constants is None
+    if num_constants is not None:
+      constants = inputs[-num_constants:]
+      inputs = inputs[:-num_constants]
+    if len(inputs) > 1:
+      initial_state = inputs[1:]
+    inputs = inputs[0]
+
+  def to_list_or_none(x):
+    if x is None or isinstance(x, list):
+      return x
+    if isinstance(x, tuple):
+      return list(x)
+    return [x]
+
+  initial_state = to_list_or_none(initial_state)
+  constants = to_list_or_none(constants)
+
+  return inputs, initial_state, constants
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
rename to tensorflow/python/keras/layers/recurrent_test.py
index 4c68c18825a47d87806a7a09d4054f974d569e00..802374d2d28d792c1e32bf5095b928f569144b49 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -23,7 +23,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
diff --git a/tensorflow/python/keras/_impl/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py
similarity index 58%
rename from tensorflow/python/keras/_impl/keras/layers/serialization.py
rename to tensorflow/python/keras/layers/serialization.py
index 8151ad7fdddefe08e7af0563bdf27ab335d7d1f8..be306c0af765dd79bcc2b7651d97957c1cf80519 100644
--- a/tensorflow/python/keras/_impl/keras/layers/serialization.py
+++ b/tensorflow/python/keras/layers/serialization.py
@@ -20,22 +20,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.engine import Input
-from tensorflow.python.keras._impl.keras.engine import InputLayer
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import *
-from tensorflow.python.keras._impl.keras.layers.convolutional import *
-from tensorflow.python.keras._impl.keras.layers.convolutional_recurrent import *
-from tensorflow.python.keras._impl.keras.layers.core import *
-from tensorflow.python.keras._impl.keras.layers.cudnn_recurrent import *
-from tensorflow.python.keras._impl.keras.layers.embeddings import *
-from tensorflow.python.keras._impl.keras.layers.local import *
-from tensorflow.python.keras._impl.keras.layers.merge import *
-from tensorflow.python.keras._impl.keras.layers.noise import *
-from tensorflow.python.keras._impl.keras.layers.normalization import *
-from tensorflow.python.keras._impl.keras.layers.pooling import *
-from tensorflow.python.keras._impl.keras.layers.recurrent import *
-from tensorflow.python.keras._impl.keras.layers.wrappers import *
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.keras.engine import Input
+from tensorflow.python.keras.engine import InputLayer
+from tensorflow.python.keras.layers.advanced_activations import *
+from tensorflow.python.keras.layers.convolutional import *
+from tensorflow.python.keras.layers.convolutional_recurrent import *
+from tensorflow.python.keras.layers.core import *
+from tensorflow.python.keras.layers.cudnn_recurrent import *
+from tensorflow.python.keras.layers.embeddings import *
+from tensorflow.python.keras.layers.local import *
+from tensorflow.python.keras.layers.merge import *
+from tensorflow.python.keras.layers.noise import *
+from tensorflow.python.keras.layers.normalization import *
+from tensorflow.python.keras.layers.pooling import *
+from tensorflow.python.keras.layers.recurrent import *
+from tensorflow.python.keras.layers.wrappers import *
+from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 
 
 def serialize(layer):
@@ -53,7 +53,7 @@ def deserialize(config, custom_objects=None):
   Returns:
       Layer instance (may be Model, Sequential, Layer...)
   """
-  from tensorflow.python.keras._impl.keras import models  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.keras import models  # pylint: disable=g-import-not-at-top
   globs = globals()  # All layers.
   globs['Model'] = models.Model
   globs['Sequential'] = models.Sequential
diff --git a/tensorflow/python/keras/_impl/keras/layers/serialization_test.py b/tensorflow/python/keras/layers/serialization_test.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/layers/serialization_test.py
rename to tensorflow/python/keras/layers/serialization_test.py
index 787160d1e71f570479144c5afd45cd41f38f0e91..5872185ef7c30aa50e8ca5aac32cc1804369017c 100644
--- a/tensorflow/python/keras/_impl/keras/layers/serialization_test.py
+++ b/tensorflow/python/keras/layers/serialization_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py b/tensorflow/python/keras/layers/simplernn_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/layers/simplernn_test.py
rename to tensorflow/python/keras/layers/simplernn_test.py
index 8c7189cd4718450a85c015e08ab3a58cc5d86531..3d24b0d5045d9c264f32adedaa0e91cdc5cbb0cf 100644
--- a/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py
+++ b/tensorflow/python/keras/layers/simplernn_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
similarity index 81%
rename from tensorflow/python/keras/_impl/keras/layers/wrappers.py
rename to tensorflow/python/keras/layers/wrappers.py
index 91b8c1148bec568a3f26a4798ee120ed7a91faf1..7759561ef94c4a81552ef7b40ea71e49bbb743ae 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -22,11 +22,12 @@ from __future__ import print_function
 import copy
 
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.engine import InputSpec
+from tensorflow.python.keras.engine import Layer
+from tensorflow.python.keras.layers.recurrent import _standardize_args
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -103,7 +104,7 @@ class Wrapper(Layer):
 
   @classmethod
   def from_config(cls, config, custom_objects=None):
-    from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+    from tensorflow.python.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
     layer = deserialize_layer(
         config.pop('layer'), custom_objects=custom_objects)
     return cls(layer, **config)
@@ -201,7 +202,7 @@ class TimeDistributed(Wrapper):
           step,
           inputs,
           initial_states=[],
-          input_length=input_shape[0],
+          input_length=input_shape[1],
           unroll=False)
       y = outputs
     else:
@@ -284,6 +285,7 @@ class Bidirectional(Wrapper):
     self.return_state = layer.return_state
     self.supports_masking = True
     self._trainable = True
+    self._num_constants = None
     super(Bidirectional, self).__init__(layer, **kwargs)
     self.input_spec = layer.input_spec
 
@@ -326,37 +328,51 @@ class Bidirectional(Wrapper):
       return [output_shape] + state_shape + copy.copy(state_shape)
     return output_shape
 
-  def __call__(self, inputs, initial_state=None, **kwargs):
+  def __call__(self, inputs, initial_state=None, constants=None, **kwargs):
+    """`Bidirectional.__call__` implements the same API as the wrapped `RNN`."""
+    inputs, initial_state, constants = _standardize_args(
+        inputs, initial_state, constants, self._num_constants)
+
     if isinstance(inputs, list):
       if len(inputs) > 1:
         initial_state = inputs[1:]
       inputs = inputs[0]
 
-    if initial_state is None:
+    if initial_state is None and constants is None:
       return super(Bidirectional, self).__call__(inputs, **kwargs)
 
-    # Standardize `initial_state` into list
-    if isinstance(initial_state, tuple):
-      initial_state = list(initial_state)
-    elif not isinstance(initial_state, list):
-      initial_state = [initial_state]
-
-    # Check if `initial_state` can be splitted into half
-    num_states = len(initial_state)
-    if num_states % 2 > 0:
-      raise ValueError(
-          'When passing `initial_state` to a Bidirectional RNN, the state '
-          'should be a list containing the states of the underlying RNNs. '
-          'Found: ' + str(initial_state))
-
-    # Applies the same workaround as in `RNN.__call__`, without handling
-    # constants
-    kwargs['initial_state'] = initial_state
-    additional_inputs = initial_state
-    additional_specs = [InputSpec(shape=K.int_shape(state))
-                        for state in initial_state]
-    self.forward_layer.state_spec = additional_specs[:num_states // 2]
-    self.backward_layer.state_spec = additional_specs[num_states // 2:]
+    # Applies the same workaround as in `RNN.__call__`
+    additional_inputs = []
+    additional_specs = []
+    if initial_state is not None:
+      # Check if `initial_state` can be splitted into half
+      num_states = len(initial_state)
+      if num_states % 2 > 0:
+        raise ValueError(
+            'When passing `initial_state` to a Bidirectional RNN, '
+            'the state should be a list containing the states of '
+            'the underlying RNNs. '
+            'Found: ' + str(initial_state))
+
+      kwargs['initial_state'] = initial_state
+      additional_inputs += initial_state
+      state_specs = [InputSpec(shape=K.int_shape(state))
+                     for state in initial_state]
+      self.forward_layer.state_spec = state_specs[:num_states // 2]
+      self.backward_layer.state_spec = state_specs[num_states // 2:]
+      additional_specs += state_specs
+    if constants is not None:
+      kwargs['constants'] = constants
+      additional_inputs += constants
+      constants_spec = [InputSpec(shape=K.int_shape(constant))
+                        for constant in constants]
+      self.forward_layer.constants_spec = constants_spec
+      self.backward_layer.constants_spec = constants_spec
+      additional_specs += constants_spec
+
+      self._num_constants = len(constants)
+      self.forward_layer._num_constants = self._num_constants
+      self.backward_layer._num_constants = self._num_constants
 
     is_keras_tensor = K.is_keras_tensor(additional_inputs[0])
     for tensor in additional_inputs:
@@ -381,12 +397,19 @@ class Bidirectional(Wrapper):
     else:
       return super(Bidirectional, self).__call__(inputs, **kwargs)
 
-  def call(self, inputs, training=None, mask=None, initial_state=None):
+  def call(self, inputs,
+           training=None,
+           mask=None,
+           initial_state=None,
+           constants=None):
+    """`Bidirectional.call` implements the same API as the wrapped `RNN`."""
     kwargs = {}
     if generic_utils.has_arg(self.layer.call, 'training'):
       kwargs['training'] = training
     if generic_utils.has_arg(self.layer.call, 'mask'):
       kwargs['mask'] = mask
+    if generic_utils.has_arg(self.layer.call, 'constants'):
+      kwargs['constants'] = constants
 
     if initial_state is not None and generic_utils.has_arg(
         self.layer.call, 'initial_state'):
@@ -444,13 +467,23 @@ class Bidirectional(Wrapper):
     self.built = True
 
   def compute_mask(self, inputs, mask):
+    if isinstance(mask, list):
+      mask = mask[0]
     if self.return_sequences:
       if not self.merge_mode:
-        return [mask, mask]
+        output_mask = [mask, mask]
       else:
-        return mask
+        output_mask = mask
     else:
-      return None
+      output_mask = [None, None] if not self.merge_mode else None
+
+    if self.return_state:
+      states = self.forward_layer.states
+      state_mask = [None for _ in states]
+      if isinstance(output_mask, list):
+        return output_mask + state_mask * 2
+      return [output_mask] + state_mask * 2
+    return output_mask
 
   @property
   def trainable_weights(self):
@@ -488,5 +521,15 @@ class Bidirectional(Wrapper):
 
   def get_config(self):
     config = {'merge_mode': self.merge_mode}
+    if self._num_constants is not None:
+      config['num_constants'] = self._num_constants
     base_config = super(Bidirectional, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    num_constants = config.pop('num_constants', None)
+    layer = super(Bidirectional, cls).from_config(config,
+                                                  custom_objects=custom_objects)
+    layer._num_constants = num_constants
+    return layer
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
similarity index 74%
rename from tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
rename to tensorflow/python/keras/layers/wrappers_test.py
index 8fcf66e90ff1289a06a996768ae5de2f1548a27c..5eab6aba8a5f9a7e70f55685a9cd9ae6e0cf024d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -18,14 +18,55 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
+
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.keras._impl import keras
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
+class _RNNCellWithConstants(keras.layers.Layer):
+
+  def __init__(self, units, **kwargs):
+    self.units = units
+    self.state_size = units
+    super(_RNNCellWithConstants, self).__init__(**kwargs)
+
+  def build(self, input_shape):
+    [input_shape, constant_shape] = input_shape
+
+    self.input_kernel = self.add_weight(
+        shape=(input_shape[-1], self.units),
+        initializer='uniform',
+        name='kernel')
+    self.recurrent_kernel = self.add_weight(
+        shape=(self.units, self.units),
+        initializer='uniform',
+        name='recurrent_kernel')
+    self.constant_kernel = self.add_weight(
+        shape=(constant_shape[-1], self.units),
+        initializer='uniform',
+        name='constant_kernel')
+    self.built = True
+
+  def call(self, inputs, states, constants):
+    [prev_output] = states
+    [constant] = constants
+    h_input = keras.backend.dot(inputs, self.input_kernel)
+    h_state = keras.backend.dot(prev_output, self.recurrent_kernel)
+    h_const = keras.backend.dot(constant, self.constant_kernel)
+    output = h_input + h_state + h_const
+    return output, [output]
+
+  def get_config(self):
+    config = {'units': self.units}
+    base_config = super(_RNNCellWithConstants, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 class TimeDistributedTest(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes()
@@ -383,6 +424,100 @@ class BidirectionalTest(test.TestCase):
       layer.trainable = True
       assert len(layer.trainable_weights) == 6
 
+  def test_Bidirectional_with_constants(self):
+    with self.test_session():
+      # Test basic case.
+      x = keras.Input((5, 5))
+      c = keras.Input((3,))
+      cell = _RNNCellWithConstants(32)
+      custom_objects = {'_RNNCellWithConstants': _RNNCellWithConstants}
+      with keras.utils.CustomObjectScope(custom_objects):
+        layer = keras.layers.Bidirectional(keras.layers.RNN(cell))
+      y = layer(x, constants=c)
+      model = keras.Model([x, c], y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          [np.zeros((6, 5, 5)), np.zeros((6, 3))],
+          np.zeros((6, 64))
+      )
+
+      # Test basic case serialization.
+      x_np = np.random.random((6, 5, 5))
+      c_np = np.random.random((6, 3))
+      y_np = model.predict([x_np, c_np])
+      weights = model.get_weights()
+      config = layer.get_config()
+
+      with keras.utils.CustomObjectScope(custom_objects):
+        layer = keras.layers.Bidirectional.from_config(copy.deepcopy(config))
+      y = layer(x, constants=c)
+      model = keras.Model([x, c], y)
+      model.set_weights(weights)
+      y_np_2 = model.predict([x_np, c_np])
+      self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+      # Test flat list inputs
+      with keras.utils.CustomObjectScope(custom_objects):
+        layer = keras.layers.Bidirectional.from_config(copy.deepcopy(config))
+      y = layer([x, c])
+      model = keras.Model([x, c], y)
+      model.set_weights(weights)
+      y_np_3 = model.predict([x_np, c_np])
+      self.assertAllClose(y_np, y_np_3, atol=1e-4)
+
+  def test_Bidirectional_with_constants_layer_passing_initial_state(self):
+    with self.test_session():
+      # Test basic case.
+      x = keras.Input((5, 5))
+      c = keras.Input((3,))
+      s_for = keras.Input((32,))
+      s_bac = keras.Input((32,))
+      cell = _RNNCellWithConstants(32)
+      custom_objects = {'_RNNCellWithConstants': _RNNCellWithConstants}
+      with keras.utils.CustomObjectScope(custom_objects):
+        layer = keras.layers.Bidirectional(keras.layers.RNN(cell))
+      y = layer(x, initial_state=[s_for, s_bac], constants=c)
+      model = keras.Model([x, s_for, s_bac, c], y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          [np.zeros((6, 5, 5)),
+           np.zeros((6, 32)),
+           np.zeros((6, 32)),
+           np.zeros((6, 3))],
+          np.zeros((6, 64))
+      )
+
+      # Test basic case serialization.
+      x_np = np.random.random((6, 5, 5))
+      s_fw_np = np.random.random((6, 32))
+      s_bk_np = np.random.random((6, 32))
+      c_np = np.random.random((6, 3))
+      y_np = model.predict([x_np, s_fw_np, s_bk_np, c_np])
+      weights = model.get_weights()
+      config = layer.get_config()
+
+      with keras.utils.CustomObjectScope(custom_objects):
+        layer = keras.layers.Bidirectional.from_config(copy.deepcopy(config))
+      y = layer(x, initial_state=[s_for, s_bac], constants=c)
+      model = keras.Model([x, s_for, s_bac, c], y)
+      model.set_weights(weights)
+      y_np_2 = model.predict([x_np, s_fw_np, s_bk_np, c_np])
+      self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+      # Verify that state is used
+      y_np_2_different_s = model.predict(
+          [x_np, s_fw_np + 10., s_bk_np + 10., c_np])
+      assert np.mean(y_np - y_np_2_different_s) != 0
+
+      # Test flat list inputs
+      with keras.utils.CustomObjectScope(custom_objects):
+        layer = keras.layers.Bidirectional.from_config(copy.deepcopy(config))
+      y = layer([x, s_for, s_bac, c])
+      model = keras.Model([x, s_for, s_bac, c], y)
+      model.set_weights(weights)
+      y_np_3 = model.predict([x_np, s_fw_np, s_bk_np, c_np])
+      self.assertAllClose(y_np, y_np_3, atol=1e-4)
+
 
 def _to_list(ls):
   if isinstance(ls, list):
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/losses.py
similarity index 95%
rename from tensorflow/python/keras/_impl/keras/losses.py
rename to tensorflow/python/keras/losses.py
index 1d634d38013164659f7360fce45704c19083f475..d82ebd9c314c0427da86f6f6f617b7b282240c3d 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -21,9 +21,9 @@ from __future__ import print_function
 
 import six
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
diff --git a/tensorflow/python/keras/losses/__init__.py b/tensorflow/python/keras/losses/__init__.py
deleted file mode 100644
index 66721b694f5fd5fae7ca521ff56d4c6c6bce79b5..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/losses/__init__.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras built-in loss functions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# Loss functions.
-from tensorflow.python.keras._impl.keras.losses import binary_crossentropy
-from tensorflow.python.keras._impl.keras.losses import categorical_crossentropy
-from tensorflow.python.keras._impl.keras.losses import categorical_hinge
-from tensorflow.python.keras._impl.keras.losses import cosine_proximity
-from tensorflow.python.keras._impl.keras.losses import hinge
-from tensorflow.python.keras._impl.keras.losses import kullback_leibler_divergence
-from tensorflow.python.keras._impl.keras.losses import logcosh
-from tensorflow.python.keras._impl.keras.losses import mean_absolute_error
-from tensorflow.python.keras._impl.keras.losses import mean_absolute_percentage_error
-from tensorflow.python.keras._impl.keras.losses import mean_squared_error
-from tensorflow.python.keras._impl.keras.losses import mean_squared_logarithmic_error
-from tensorflow.python.keras._impl.keras.losses import poisson
-from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crossentropy
-from tensorflow.python.keras._impl.keras.losses import squared_hinge
-
-# Auxiliary utils.
-# pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.losses import deserialize
-from tensorflow.python.keras._impl.keras.losses import serialize
-from tensorflow.python.keras._impl.keras.losses import get
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/losses_test.py b/tensorflow/python/keras/losses_test.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/losses_test.py
rename to tensorflow/python/keras/losses_test.py
index 1884c0fdca79801ecd7d8cd21dae8b745ed0f6b6..3098a6d071a77ec26a132f445ab16949e90339f2 100644
--- a/tensorflow/python/keras/_impl/keras/losses_test.py
+++ b/tensorflow/python/keras/losses_test.py
@@ -23,7 +23,7 @@ import shutil
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 try:
diff --git a/tensorflow/python/keras/_impl/keras/metrics.py b/tensorflow/python/keras/metrics.py
similarity index 72%
rename from tensorflow/python/keras/_impl/keras/metrics.py
rename to tensorflow/python/keras/metrics.py
index 747c3e65157ded6b0d227c6d6667b9092d0eed44..e03d7dfe93585efd06f4701a8d20f61fc314d564 100644
--- a/tensorflow/python/keras/_impl/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -21,22 +21,22 @@ from __future__ import print_function
 
 import six
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.losses import binary_crossentropy
-from tensorflow.python.keras._impl.keras.losses import categorical_crossentropy
-from tensorflow.python.keras._impl.keras.losses import cosine_proximity
-from tensorflow.python.keras._impl.keras.losses import hinge
-from tensorflow.python.keras._impl.keras.losses import kullback_leibler_divergence
-from tensorflow.python.keras._impl.keras.losses import logcosh
-from tensorflow.python.keras._impl.keras.losses import mean_absolute_error
-from tensorflow.python.keras._impl.keras.losses import mean_absolute_percentage_error
-from tensorflow.python.keras._impl.keras.losses import mean_squared_error
-from tensorflow.python.keras._impl.keras.losses import mean_squared_logarithmic_error
-from tensorflow.python.keras._impl.keras.losses import poisson
-from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crossentropy
-from tensorflow.python.keras._impl.keras.losses import squared_hinge
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.losses import binary_crossentropy
+from tensorflow.python.keras.losses import categorical_crossentropy
+from tensorflow.python.keras.losses import cosine_proximity
+from tensorflow.python.keras.losses import hinge
+from tensorflow.python.keras.losses import kullback_leibler_divergence
+from tensorflow.python.keras.losses import logcosh
+from tensorflow.python.keras.losses import mean_absolute_error
+from tensorflow.python.keras.losses import mean_absolute_percentage_error
+from tensorflow.python.keras.losses import mean_squared_error
+from tensorflow.python.keras.losses import mean_squared_logarithmic_error
+from tensorflow.python.keras.losses import poisson
+from tensorflow.python.keras.losses import sparse_categorical_crossentropy
+from tensorflow.python.keras.losses import squared_hinge
+from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
diff --git a/tensorflow/python/keras/metrics/__init__.py b/tensorflow/python/keras/metrics/__init__.py
deleted file mode 100644
index 59faf037bce0f087d244a2faaeb52713bdc3b772..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/metrics/__init__.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras built-in metrics functions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# Metrics functions.
-from tensorflow.python.keras._impl.keras.metrics import binary_accuracy
-from tensorflow.python.keras._impl.keras.metrics import binary_crossentropy
-from tensorflow.python.keras._impl.keras.metrics import categorical_accuracy
-from tensorflow.python.keras._impl.keras.metrics import categorical_crossentropy
-from tensorflow.python.keras._impl.keras.metrics import cosine_proximity
-from tensorflow.python.keras._impl.keras.metrics import hinge
-from tensorflow.python.keras._impl.keras.metrics import kullback_leibler_divergence
-from tensorflow.python.keras._impl.keras.metrics import mean_absolute_error
-from tensorflow.python.keras._impl.keras.metrics import mean_absolute_percentage_error
-from tensorflow.python.keras._impl.keras.metrics import mean_squared_error
-from tensorflow.python.keras._impl.keras.metrics import mean_squared_logarithmic_error
-from tensorflow.python.keras._impl.keras.metrics import poisson
-from tensorflow.python.keras._impl.keras.metrics import sparse_categorical_crossentropy
-from tensorflow.python.keras._impl.keras.metrics import sparse_top_k_categorical_accuracy
-from tensorflow.python.keras._impl.keras.metrics import squared_hinge
-from tensorflow.python.keras._impl.keras.metrics import top_k_categorical_accuracy
-
-# Auxiliary utils.
-# pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.metrics import deserialize
-from tensorflow.python.keras._impl.keras.metrics import serialize
-from tensorflow.python.keras._impl.keras.metrics import get
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
similarity index 75%
rename from tensorflow/python/keras/_impl/keras/metrics_test.py
rename to tensorflow/python/keras/metrics_test.py
index 13cef9781278109a9e726409aae5c4e325b2a5f0..15e793f5fcf0b416978095da370fbdaabd1490a6 100644
--- a/tensorflow/python/keras/_impl/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
@@ -92,6 +92,7 @@ class KerasMetricsTest(test.TestCase):
         def __init__(self, name='true_positives', **kwargs):
           super(BinaryTruePositives, self).__init__(name=name, **kwargs)
           self.true_positives = keras.backend.variable(value=0, dtype='int32')
+          self.stateful = True
 
         def reset_states(self):
           keras.backend.set_value(self.true_positives, 0)
@@ -132,10 +133,17 @@ class KerasMetricsTest(test.TestCase):
                     metrics=['acc', metric_fn])
 
       # Test fit, evaluate
-      samples = 1000
+      samples = 100
       x = np.random.random((samples, 2))
       y = np.random.randint(2, size=(samples, 1))
-      model.fit(x, y, epochs=1, batch_size=10)
+      val_samples = 10
+      val_x = np.random.random((val_samples, 2))
+      val_y = np.random.randint(2, size=(val_samples, 1))
+
+      history = model.fit(x, y,
+                          epochs=1,
+                          batch_size=10,
+                          validation_data=(val_x, val_y))
       outs = model.evaluate(x, y, batch_size=10)
       preds = model.predict(x)
 
@@ -145,6 +153,37 @@ class KerasMetricsTest(test.TestCase):
       # Test correctness (e.g. updates should have been run)
       self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5)
 
+      # Test correctness of the validation metric computation
+      val_preds = model.predict(val_x)
+      val_outs = model.evaluate(val_x, val_y, batch_size=10)
+      self.assertAllClose(
+          val_outs[2], ref_true_pos(val_y, val_preds), atol=1e-5)
+      self.assertAllClose(
+          val_outs[2], history.history['val_true_positives'][-1], atol=1e-5)
+
+      # Test with generators
+      gen = [(np.array([x0]), np.array([y0])) for x0, y0 in zip(x, y)]
+      val_gen = [(np.array([x0]), np.array([y0]))
+                 for x0, y0 in zip(val_x, val_y)]
+      history = model.fit_generator(iter(gen),
+                                    epochs=1,
+                                    steps_per_epoch=samples,
+                                    validation_data=iter(val_gen),
+                                    validation_steps=val_samples)
+      outs = model.evaluate_generator(iter(gen), steps=samples)
+      preds = model.predict_generator(iter(gen), steps=samples)
+
+      # Test correctness of the metric results
+      self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5)
+
+      # Test correctness of the validation metric computation
+      val_preds = model.predict_generator(iter(val_gen), steps=val_samples)
+      val_outs = model.evaluate_generator(iter(val_gen), steps=val_samples)
+      self.assertAllClose(
+          val_outs[2], ref_true_pos(val_y, val_preds), atol=1e-5)
+      self.assertAllClose(
+          val_outs[2], history.history['val_true_positives'][-1], atol=1e-5)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
similarity index 93%
rename from tensorflow/python/keras/_impl/keras/model_subclassing_test.py
rename to tensorflow/python/keras/model_subclassing_test.py
index 295ad47f6be46443ce94f8c5c6228df50c5f693e..558854ab97bec203281162a03a8d513e487b3dfb 100644
--- a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -23,12 +23,15 @@ import os
 import numpy as np
 import six
 
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras._impl import keras
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 try:
@@ -248,6 +251,26 @@ class ModelSubclassingTest(test.TestCase):
       model.fit([x1, x2], [y1, y2], epochs=2, steps_per_epoch=10, verbose=0)
       _ = model.evaluate(steps=10, verbose=0)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_single_io_workflow_with_dataset_iterators(self):
+    num_classes = 2
+    num_samples = 10
+    input_dim = 50
+
+    with self.test_session():
+      model = SimpleTestModel(num_classes=num_classes, use_dp=True, use_bn=True)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+      x = np.ones((num_samples, input_dim))
+      y = np.zeros((num_samples, num_classes))
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+      iterator = dataset.make_one_shot_iterator()
+
+      model.fit(iterator, epochs=2, steps_per_epoch=10, verbose=0)
+      _ = model.evaluate(iterator, steps=10, verbose=0)
+
   def test_multi_io_workflow_with_numpy_arrays_and_custom_placeholders(self):
 
     num_classes = (2, 3)
@@ -583,6 +606,22 @@ class ModelSubclassingTest(test.TestCase):
     loss = model.train_on_batch(x, y)
     self.assertGreater(loss, 0.1)
 
+  def test_no_dependency(self):
+    class Foo(keras.Model):
+
+      def __init__(self):
+        super(Foo, self).__init__()
+        self.isdep = keras.layers.Dense(1)
+        self.notdep = checkpointable.NoDependency(keras.layers.Dense(2))
+        self.notdep_var = checkpointable.NoDependency(
+            resource_variable_ops.ResourceVariable(1., name='notdep_var'))
+
+    m = Foo()
+    self.assertEqual([m.isdep, m.notdep], m.layers)
+    self.assertEqual(1, len(m._checkpoint_dependencies))
+    self.assertIs(m.isdep, m._checkpoint_dependencies[0].ref)
+    self.assertEqual('notdep_var:0', m.notdep_var.name)
+
 
 class CustomCallModel(keras.Model):
 
diff --git a/tensorflow/python/keras/_impl/keras/models.py b/tensorflow/python/keras/models.py
similarity index 94%
rename from tensorflow/python/keras/_impl/keras/models.py
rename to tensorflow/python/keras/models.py
index 9602e7ba39b290f33c7ca9d0d1b5b35838667531..21217fdca14eabaa425903d5370731eb94fdeec6 100644
--- a/tensorflow/python/keras/_impl/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -19,14 +19,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.engine import saving
-from tensorflow.python.keras._impl.keras.engine import sequential
-from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.keras._impl.keras.engine.input_layer import Input
-from tensorflow.python.keras._impl.keras.engine.input_layer import InputLayer
-from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.engine import saving
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.engine.input_layer import Input
+from tensorflow.python.keras.engine.input_layer import InputLayer
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils.generic_utils import has_arg
 
 
 # API entries importable from `keras.models`:
diff --git a/tensorflow/python/keras/models/__init__.py b/tensorflow/python/keras/models/__init__.py
deleted file mode 100644
index 2fb4ac0960d38f28a1c9c897a0f1aedf57e048ac..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/models/__init__.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras models API."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.models import load_model
-from tensorflow.python.keras._impl.keras.models import Model
-from tensorflow.python.keras._impl.keras.models import model_from_config
-from tensorflow.python.keras._impl.keras.models import model_from_json
-from tensorflow.python.keras._impl.keras.models import model_from_yaml
-from tensorflow.python.keras._impl.keras.models import save_model
-from tensorflow.python.keras._impl.keras.models import Sequential
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/models_test.py b/tensorflow/python/keras/models_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/models_test.py
rename to tensorflow/python/keras/models_test.py
index 5978ddd987c63b9d87a31be6837172f08512ef73..01fb41b8ee134e29fdd9852b6c2f10a6bd79ef1c 100644
--- a/tensorflow/python/keras/_impl/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/optimizers.py
rename to tensorflow/python/keras/optimizers.py
index 9f383deb725ac69bf2f17f3627010c4e1f567ef0..febbda4df6c6e2ad67d30f7337fbd518da3d2439 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -26,9 +26,9 @@ from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
diff --git a/tensorflow/python/keras/optimizers/__init__.py b/tensorflow/python/keras/optimizers/__init__.py
deleted file mode 100644
index 44f47bc47f4a0e31aaf2ac8f67cfdbef410d8c44..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/optimizers/__init__.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras built-in optimizers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# Optimizer classes.
-from tensorflow.python.keras._impl.keras.optimizers import Adadelta
-from tensorflow.python.keras._impl.keras.optimizers import Adagrad
-from tensorflow.python.keras._impl.keras.optimizers import Adam
-from tensorflow.python.keras._impl.keras.optimizers import Adamax
-from tensorflow.python.keras._impl.keras.optimizers import Nadam
-from tensorflow.python.keras._impl.keras.optimizers import Optimizer
-from tensorflow.python.keras._impl.keras.optimizers import RMSprop
-from tensorflow.python.keras._impl.keras.optimizers import SGD
-
-# Auxiliary utils.
-# pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.optimizers import deserialize
-from tensorflow.python.keras._impl.keras.optimizers import serialize
-from tensorflow.python.keras._impl.keras.optimizers import get
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/optimizers_test.py
rename to tensorflow/python/keras/optimizers_test.py
index 57636afbf089f27c00cc56c46fdb3ea50f89cc6b..92b0cf326158adb1c6124384571a075196dbd3cc 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python import keras
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training.adam import AdamOptimizer
 
diff --git a/tensorflow/python/keras/preprocessing/__init__.py b/tensorflow/python/keras/preprocessing/__init__.py
index 8fa3911a7a8833f4b296519c84662cf39ea2dc88..e6704eeaa1f953be68e7ccdbc7e8bd60c62a61d8 100644
--- a/tensorflow/python/keras/preprocessing/__init__.py
+++ b/tensorflow/python/keras/preprocessing/__init__.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 # ==============================================================================
 """Keras data preprocessing utils."""
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image.py b/tensorflow/python/keras/preprocessing/image.py
similarity index 79%
rename from tensorflow/python/keras/_impl/keras/preprocessing/image.py
rename to tensorflow/python/keras/preprocessing/image.py
index 6299445c34b99f20d7ae5090fc979d0ac2611109..aa425df6a8bdb29b90a6d7000d126b771247c19f 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/preprocessing/image.py
@@ -29,8 +29,8 @@ import re
 import threading
 
 import numpy as np
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -217,6 +217,16 @@ def random_zoom(x,
 
 @tf_export('keras.preprocessing.image.random_channel_shift')
 def random_channel_shift(x, intensity, channel_axis=0):
+  """Perform a random channel shift.
+
+  Arguments:
+      x: Input tensor. Must be 3D.
+      intensity: Transformation intensity.
+      channel_axis: Index of axis for channels in the input tensor.
+
+  Returns:
+      Numpy image tensor.
+  """
   x = np.rollaxis(x, channel_axis, 0)
   min_x, max_x = np.min(x), np.max(x)
   channel_images = [
@@ -451,54 +461,149 @@ def list_pictures(directory, ext='jpg|jpeg|bmp|png|ppm'):
 
 @tf_export('keras.preprocessing.image.ImageDataGenerator')
 class ImageDataGenerator(object):
-  """Generate minibatches of image data with real-time data augmentation.
+  """Generates batches of tensor image data with real-time data augmentation.
+  The data will be looped over (in batches).
 
   Arguments:
-      featurewise_center: set input mean to 0 over the dataset.
-      samplewise_center: set each sample mean to 0.
-      featurewise_std_normalization: divide inputs by std of the dataset.
-      samplewise_std_normalization: divide each input by its std.
-      zca_whitening: apply ZCA whitening.
+      featurewise_center: boolean, set input mean to 0 over the dataset,
+          feature-wise.
+      samplewise_center: boolean, set each sample mean to 0.
+      featurewise_std_normalization: boolean, divide inputs by std
+          of the dataset, feature-wise.
+      samplewise_std_normalization: boolean, divide each input by its std.
       zca_epsilon: epsilon for ZCA whitening. Default is 1e-6.
-      rotation_range: degrees (0 to 180).
-      width_shift_range: fraction of total width, if < 1, or pixels if >= 1.
-      height_shift_range: fraction of total height, if < 1, or pixels if >= 1.
-      brightness_range: the range of brightness to apply
-      shear_range: shear intensity (shear angle in degrees).
-      zoom_range: amount of zoom. if scalar z, zoom will be randomly picked
-          in the range [1-z, 1+z]. A sequence of two can be passed instead
-          to select this range.
-      channel_shift_range: shift range for each channel.
-      fill_mode: points outside the boundaries are filled according to the
-          given mode ('constant', 'nearest', 'reflect' or 'wrap'). Default
-          is 'nearest'.
-          Points outside the boundaries of the input are filled according to the
-            given mode:
+      zca_whitening: boolean, apply ZCA whitening.
+      rotation_range: int, degree range for random rotations.
+      width_shift_range: float, 1-D array-like or int
+          float: fraction of total width, if < 1, or pixels if >= 1.
+          1-D array-like: random elements from the array.
+          int: integer number of pixels from interval
+              `(-width_shift_range, +width_shift_range)`
+          With `width_shift_range=2` possible values are integers [-1, 0, +1],
+          same as with `width_shift_range=[-1, 0, +1]`,
+          while with `width_shift_range=1.0` possible values are floats in
+          the interval [-1.0, +1.0).
+      shear_range: float, shear Intensity
+          (Shear angle in counter-clockwise direction in degrees)
+      zoom_range: float or [lower, upper], Range for random zoom.
+          If a float, `[lower, upper] = [1-zoom_range, 1+zoom_range]`.
+      channel_shift_range: float, range for random channel shifts.
+      fill_mode: One of {"constant", "nearest", "reflect" or "wrap"}.
+          Default is 'nearest'. Points outside the boundaries of the input
+          are filled according to the given mode:
               'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k)
               'nearest':  aaaaaaaa|abcd|dddddddd
               'reflect':  abcddcba|abcd|dcbaabcd
               'wrap':  abcdabcd|abcd|abcdabcd
-      cval: value used for points outside the boundaries when fill_mode is
-          'constant'. Default is 0.
-      horizontal_flip: whether to randomly flip images horizontally.
-      vertical_flip: whether to randomly flip images vertically.
-      rescale: rescaling factor. If None or 0, no rescaling is applied,
-          otherwise we multiply the data by the value provided. This is
-          applied after the `preprocessing_function` (if any provided)
-          but before any other transformation.
+      cval: float or int, value used for points outside the boundaries
+          when `fill_mode = "constant"`.
+      horizontal_flip: boolean, randomly flip inputs horizontally.
+      vertical_flip: boolean, randomly flip inputs vertically.
+      rescale: rescaling factor. Defaults to None. If None or 0, no rescaling
+          is applied, otherwise we multiply the data by the value provided
+          (before applying any other transformation).
       preprocessing_function: function that will be implied on each input.
-          The function will run before any other modification on it.
+          The function will run after the image is resized and augmented.
           The function should take one argument:
           one image (Numpy tensor with rank 3),
           and should output a Numpy tensor with the same shape.
-      data_format: 'channels_first' or 'channels_last'. In 'channels_first'
-        mode, the channels dimension
-          (the depth) is at index 1, in 'channels_last' mode it is at index 3.
+      data_format: One of {"channels_first", "channels_last"}.
+          "channels_last" mode means that the images should have shape
+              `(samples, height, width, channels)`,
+          "channels_first" mode means that the images should have shape
+              `(samples, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
+              Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
-      validation_split: fraction of images reserved for validation (strictly
-        between 0 and 1).
+      validation_split: float, fraction of images reserved for validation
+          (strictly between 0 and 1).
+
+  Examples:
+      Example of using `.flow(x, y)`:
+      ```python
+      (x_train, y_train), (x_test, y_test) = cifar10.load_data()
+      y_train = np_utils.to_categorical(y_train, num_classes)
+      y_test = np_utils.to_categorical(y_test, num_classes)
+      datagen = ImageDataGenerator(
+          featurewise_center=True,
+          featurewise_std_normalization=True,
+          rotation_range=20,
+          width_shift_range=0.2,
+          height_shift_range=0.2,
+          horizontal_flip=True)
+      # compute quantities required for featurewise normalization
+      # (std, mean, and principal components if ZCA whitening is applied)
+      datagen.fit(x_train)
+      # fits the model on batches with real-time data augmentation:
+      model.fit_generator(datagen.flow(x_train, y_train, batch_size=32),
+                          steps_per_epoch=len(x_train) / 32, epochs=epochs)
+      # here's a more "manual" example
+      for e in range(epochs):
+          print('Epoch', e)
+          batches = 0
+          for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32):
+              model.fit(x_batch, y_batch)
+              batches += 1
+              if batches >= len(x_train) / 32:
+                  # we need to break the loop by hand because
+                  # the generator loops indefinitely
+                  break
+      ```
+      Example of using `.flow_from_directory(directory)`:
+      ```python
+      train_datagen = ImageDataGenerator(
+          rescale=1./255,
+          shear_range=0.2,
+          zoom_range=0.2,
+          horizontal_flip=True)
+      test_datagen = ImageDataGenerator(rescale=1./255)
+      train_generator = train_datagen.flow_from_directory(
+          'data/train',
+          target_size=(150, 150),
+          batch_size=32,
+          class_mode='binary')
+      validation_generator = test_datagen.flow_from_directory(
+          'data/validation',
+          target_size=(150, 150),
+          batch_size=32,
+          class_mode='binary')
+      model.fit_generator(
+          train_generator,
+          steps_per_epoch=2000,
+          epochs=50,
+          validation_data=validation_generator,
+          validation_steps=800)
+      ```
+      Example of transforming images and masks together.
+      ```python
+      # we create two instances with the same arguments
+      data_gen_args = dict(featurewise_center=True,
+                           featurewise_std_normalization=True,
+                           rotation_range=90.,
+                           width_shift_range=0.1,
+                           height_shift_range=0.1,
+                           zoom_range=0.2)
+      image_datagen = ImageDataGenerator(**data_gen_args)
+      mask_datagen = ImageDataGenerator(**data_gen_args)
+      # Provide the same seed and keyword arguments to the fit and flow methods
+      seed = 1
+      image_datagen.fit(images, augment=True, seed=seed)
+      mask_datagen.fit(masks, augment=True, seed=seed)
+      image_generator = image_datagen.flow_from_directory(
+          'data/images',
+          class_mode=None,
+          seed=seed)
+      mask_generator = mask_datagen.flow_from_directory(
+          'data/masks',
+          class_mode=None,
+          seed=seed)
+      # combine generators into one which yields image and masks
+      train_generator = zip(image_generator, mask_generator)
+      model.fit_generator(
+          train_generator,
+          steps_per_epoch=2000,
+          epochs=50)
+      ```
   """
 
   def __init__(self,
@@ -613,6 +718,31 @@ class ImageDataGenerator(object):
            save_prefix='',
            save_format='png',
            subset=None):
+    """Generates batches of augmented/normalized data with given numpy arrays.
+
+    Arguments:
+        x: data. Should have rank 4.
+            In case of grayscale data, the channels axis should have value 1
+            and in case of RGB data, it should have value 3.
+        y: labels.
+        batch_size: int (default: 32).
+        shuffle: boolean (default: True).
+        seed: int (default: None).
+        save_to_dir: None or str (default: None).
+            This allows you to optionally specify a directory
+            to which to save the augmented pictures being generated
+            (useful for visualizing what you are doing).
+        save_prefix: str (default: `''`). Prefix to use for filenames of
+            saved pictures (only relevant if `save_to_dir` is set).
+        save_format: one of "png", "jpeg". Default: "png".
+            (only relevant if `save_to_dir` is set)
+        subset: Subset of data (`"training"` or `"validation"`) if
+            `validation_split` is set in `ImageDataGenerator`.
+
+    Returns:
+        An Iterator yielding tuples of `(x, y)` where `x` is a numpy array of
+          image data and `y` is a numpy array of corresponding labels.
+    """
     return NumpyArrayIterator(
         x,
         y,
@@ -641,6 +771,65 @@ class ImageDataGenerator(object):
                           follow_links=False,
                           subset=None,
                           interpolation='nearest'):
+    """Generates batches of augmented/normalized data given directory path.
+
+    Arguments:
+        directory: path to the target directory. It should contain one
+            subdirectory per class. Any PNG, JPG, BMP, PPM or TIF images
+            inside each of the subdirectories directory tree will be included
+            in the generator. See [this script]
+            (https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d)
+            for more details.
+        target_size: tuple of integers `(height, width)`, default: `(256,
+            256)`. The dimensions to which all images found will be resized.
+        color_mode: one of "grayscale", "rbg". Default: "rgb". Whether the
+            images will be converted to have 1 or 3 color channels.
+        classes: optional list of class subdirectories (e.g. `['dogs',
+            'cats']`). Default: None. If not provided, the list of classes
+            will be automatically inferred from the subdirectory
+            names/structure under `directory`, where each subdirectory will be
+            treated as a different class (and the order of the classes, which
+            will map to the label indices, will be alphanumeric). The
+            dictionary containing the mapping from class names to class
+            indices can be obtained via the attribute `class_indices`.
+        class_mode: one of "categorical", "binary", "sparse", "input" or
+            None. Default: "categorical". Determines the type of label arrays
+            that are returned: "categorical" will be 2D one-hot encoded
+            labels, "binary" will be 1D binary labels, "sparse" will be 1D
+            integer labels, "input" will be images identical to input images
+            (mainly used to work with autoencoders). If None, no labels are
+            returned (the generator will only yield batches of image data,
+            which is useful to use `model.predict_generator()`,
+            `model.evaluate_generator()`, etc.). Please note that in case of
+            class_mode None, the data still needs to reside in a subdirectory
+            of `directory` for it to work correctly.
+        batch_size: size of the batches of data (default: 32).
+        shuffle: whether to shuffle the data (default: True)
+        seed: optional random seed for shuffling and transformations.
+        save_to_dir: None or str (default: None). This allows you to
+            optionally specify a directory to which to save the augmented
+            pictures being generated (useful for visualizing what you are doing)
+        save_prefix: str. Prefix to use for filenames of saved pictures
+            (only relevant if `save_to_dir` is set).
+        save_format: one of "png", "jpeg" (only relevant if `save_to_dir` is
+            set). Default: "png".
+        follow_links: whether to follow symlinks inside class subdirectories
+            (default: False).
+        subset: Subset of data (`"training"` or `"validation"`) if
+          ` validation_split` is set in `ImageDataGenerator`.
+        interpolation: Interpolation method used to resample the image if
+            the target size is different from that of the loaded image.
+            Supported methods are `"nearest"`, `"bilinear"`, and `"bicubic"`.
+            If PIL version 1.1.3 or newer is installed, `"lanczos"` is also
+            supported. If PIL version 3.4.0 or newer is installed, `"box"` and
+            `"hamming"` are also supported. By default, `"nearest"` is used.
+
+    Returns:
+        A DirectoryIterator yielding tuples of `(x, y)` where `x` is a
+        numpy array containing a batch of images with shape
+        `(batch_size, *target_size, channels)` and `y` is a numpy
+        array of corresponding labels.
+    """
     return DirectoryIterator(
         directory,
         self,
@@ -669,7 +858,7 @@ class ImageDataGenerator(object):
         The inputs, normalized.
     """
     if self.preprocessing_function:
-      x = self.image_data_generator.preprocessing_function(x)
+      x = self.preprocessing_function(x)
     if self.rescale:
       x *= self.rescale
     if self.samplewise_center:
@@ -737,15 +926,24 @@ class ImageDataGenerator(object):
       theta = 0
 
     if self.height_shift_range:
-      tx = np.random.uniform(-self.height_shift_range, self.height_shift_range)
-      if self.height_shift_range < 1:
+      try:  # 1-D array-like or int
+        tx = np.random.choice(self.height_shift_range)
+        tx *= np.random.choice([-1, 1])
+      except ValueError:  # floating point
+        tx = np.random.uniform(-self.height_shift_range,
+                               self.height_shift_range)
+      if np.max(self.height_shift_range) < 1:
         tx *= x.shape[img_row_axis]
     else:
       tx = 0
 
     if self.width_shift_range:
-      ty = np.random.uniform(-self.width_shift_range, self.width_shift_range)
-      if self.width_shift_range < 1:
+      try:  # 1-D array-like or int
+        ty = np.random.choice(self.width_shift_range)
+        ty *= np.random.choice([-1, 1])
+      except ValueError:  # floating point
+        ty = np.random.uniform(-self.width_shift_range, self.width_shift_range)
+      if np.max(self.width_shift_range) < 1:
         ty *= x.shape[img_col_axis]
     else:
       ty = 0
@@ -809,24 +1007,25 @@ class ImageDataGenerator(object):
     return x
 
   def fit(self, x, augment=False, rounds=1, seed=None):
-    """Fits internal statistics to some sample data.
+    """Computes the internal data statistics based on an array of sample data.
 
-    Required for featurewise_center, featurewise_std_normalization
-    and zca_whitening.
+    These are statistics related to the data-dependent transformations.
+    Only required if featurewise_center or featurewise_std_normalization or
+    zca_whitening.
 
     Arguments:
-        x: Numpy array, the data to fit on. Should have rank 4.
-            In case of grayscale data,
-            the channels axis should have value 1, and in case
-            of RGB data, it should have value 3.
-        augment: Whether to fit on randomly augmented samples
-        rounds: If `augment`,
-            how many augmentation passes to do over the data
-        seed: random seed.
+        x: sample data. Should have rank 4.
+            In case of grayscale data, the channels axis should have value 1
+            and in case of RGB data, it should have value 3.
+        augment: Boolean (default: False). Whether to fit on randomly
+            augmented samples.
+        rounds: int (default: 1). If augment, how many augmentation passes
+            over the data to use.
+        seed: int (default: None). Random seed.
 
     Raises:
-        ValueError: in case of invalid input `x`.
-        ImportError: if Scipy is not available.
+        ValueError: If input rank is not 4.
+        ImportError: If scipy is not imported.
     """
     x = np.asarray(x, dtype=K.floatx())
     if x.ndim != 4:
diff --git a/tensorflow/python/keras/preprocessing/image/__init__.py b/tensorflow/python/keras/preprocessing/image/__init__.py
deleted file mode 100644
index 6aba5fc8252e1acf604a89a4e66c2a7db080aa73..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/preprocessing/image/__init__.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras data preprocessing utils for image data."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.preprocessing.image import apply_transform
-from tensorflow.python.keras._impl.keras.preprocessing.image import array_to_img
-from tensorflow.python.keras._impl.keras.preprocessing.image import DirectoryIterator
-from tensorflow.python.keras._impl.keras.preprocessing.image import flip_axis
-from tensorflow.python.keras._impl.keras.preprocessing.image import ImageDataGenerator
-from tensorflow.python.keras._impl.keras.preprocessing.image import img_to_array
-from tensorflow.python.keras._impl.keras.preprocessing.image import Iterator
-from tensorflow.python.keras._impl.keras.preprocessing.image import load_img
-from tensorflow.python.keras._impl.keras.preprocessing.image import NumpyArrayIterator
-from tensorflow.python.keras._impl.keras.preprocessing.image import random_brightness
-from tensorflow.python.keras._impl.keras.preprocessing.image import random_channel_shift
-from tensorflow.python.keras._impl.keras.preprocessing.image import random_rotation
-from tensorflow.python.keras._impl.keras.preprocessing.image import random_shear
-from tensorflow.python.keras._impl.keras.preprocessing.image import random_shift
-from tensorflow.python.keras._impl.keras.preprocessing.image import random_zoom
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py b/tensorflow/python/keras/preprocessing/image_test.py
similarity index 93%
rename from tensorflow/python/keras/_impl/keras/preprocessing/image_test.py
rename to tensorflow/python/keras/preprocessing/image_test.py
index 001fee91f9ed609c0b3cd88d4079e75c0e585b02..275808a6155b26159259584653cb48697af9f318 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py
+++ b/tensorflow/python/keras/preprocessing/image_test.py
@@ -24,7 +24,7 @@ import tempfile
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 try:
@@ -246,7 +246,37 @@ class TestImage(test.TestCase):
     self.assertEqual(len(dir_iterator.class_indices), num_classes)
     self.assertEqual(len(dir_iterator.classes), count)
     self.assertEqual(set(dir_iterator.filenames), set(filenames))
-    _ = dir_iterator.next()
+
+    def preprocessing_function(x):
+      """This will fail if not provided by a Numpy array.
+
+      Note: This is made to enforce backward compatibility.
+
+      Args:
+          x: A numpy array.
+
+      Returns:
+          An array of zeros with the same shape as the given array.
+      """
+      self.assertEqual(x.shape, (26, 26, 3))
+      self.assertIs(type(x), np.ndarray)
+      return np.zeros_like(x)
+
+    # Test usage as Sequence
+    generator = keras.preprocessing.image.ImageDataGenerator(
+        preprocessing_function=preprocessing_function)
+    dir_seq = generator.flow_from_directory(
+        str(temp_dir),
+        target_size=(26, 26),
+        color_mode='rgb',
+        batch_size=3,
+        class_mode='categorical')
+    self.assertEqual(len(dir_seq), count // 3 + 1)
+    x1, y1 = dir_seq[1]
+    self.assertEqual(x1.shape, (3, 26, 26, 3))
+    self.assertEqual(y1.shape, (3, num_classes))
+    x1, y1 = dir_seq[5]
+    self.assertTrue((x1 == 0).all())
 
   def directory_iterator_with_validation_split_test_helper(
       self, validation_split):
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py b/tensorflow/python/keras/preprocessing/sequence.py
similarity index 95%
rename from tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
rename to tensorflow/python/keras/preprocessing/sequence.py
index e68c171d9c7e33d7e932f5d5b7f15859faa2348b..e0924f837a79dbdf31bee09667b43f70a1273b4b 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
+++ b/tensorflow/python/keras/preprocessing/sequence.py
@@ -23,7 +23,7 @@ import random
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 
-from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
+from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -357,9 +357,15 @@ class TimeseriesGenerator(Sequence):
     self.reverse = reverse
     self.batch_size = batch_size
 
+    if self.start_index > self.end_index:
+      raise ValueError('`start_index+length=%i > end_index=%i` '
+                       'is disallowed, as no part of the sequence '
+                       'would be left to be used as current step.' %
+                       (self.start_index, self.end_index))
+
   def __len__(self):
     length = int(
-        np.ceil((self.end_index - self.start_index) /
+        np.ceil((self.end_index - self.start_index + 1) /
                 (self.batch_size * self.stride)))
     return length if length >= 0 else 0
 
@@ -373,11 +379,12 @@ class TimeseriesGenerator(Sequence):
   def __getitem__(self, index):
     if self.shuffle:
       rows = np.random.randint(
-          self.start_index, self.end_index, size=self.batch_size)
+          self.start_index, self.end_index + 1, size=self.batch_size)
     else:
       i = self.start_index + self.batch_size * self.stride * index
-      rows = np.arange(i, min(i + self.batch_size * self.stride,
-                              self.end_index), self.stride)
+      rows = np.arange(
+          i, min(i + self.batch_size * self.stride, self.end_index + 1),
+          self.stride)
 
     samples, targets = self._empty_batch(len(rows))
     for j in range(len(rows)):
diff --git a/tensorflow/python/keras/preprocessing/sequence/__init__.py b/tensorflow/python/keras/preprocessing/sequence/__init__.py
deleted file mode 100644
index b7a7149cc40654c878e3c0db1fc78d8912abf498..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/preprocessing/sequence/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras data preprocessing utils for sequence data."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.preprocessing.sequence import make_sampling_table
-from tensorflow.python.keras._impl.keras.preprocessing.sequence import pad_sequences
-from tensorflow.python.keras._impl.keras.preprocessing.sequence import skipgrams
-from tensorflow.python.keras._impl.keras.preprocessing.sequence import TimeseriesGenerator
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/sequence_test.py b/tensorflow/python/keras/preprocessing/sequence_test.py
similarity index 75%
rename from tensorflow/python/keras/_impl/keras/preprocessing/sequence_test.py
rename to tensorflow/python/keras/preprocessing/sequence_test.py
index b9bfdd000484665e8771f4bccef59738e5c26120..ab6a09106b5f3c8bc340a25ebe3fc82be3f71cd2 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/sequence_test.py
+++ b/tensorflow/python/keras/preprocessing/sequence_test.py
@@ -18,9 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from math import ceil
+
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
@@ -146,7 +148,7 @@ class TestSequence(test.TestCase):
         start_index=10,
         end_index=30,
         batch_size=2)
-    self.assertEqual(len(data_gen), 5)
+    self.assertEqual(len(data_gen), 6)
     self.assertAllClose(data_gen[0][0],
                         np.array([[[10], [12], [14], [16], [18]],
                                   [[11], [13], [15], [17], [19]]]))
@@ -163,13 +165,74 @@ class TestSequence(test.TestCase):
         end_index=30,
         batch_size=2)
 
-    self.assertEqual(len(data_gen), 5)
+    self.assertEqual(len(data_gen), 6)
     self.assertAllClose(data_gen[0][0],
                         np.array(
                             [np.array(data[10:19:2]),
                              np.array(data[11:20:2])]))
     self.assertAllClose(data_gen[0][1], np.array([targets[20], targets[21]]))
 
+    with self.assertRaises(ValueError) as context:
+      keras.preprocessing.sequence.TimeseriesGenerator(data, targets, length=50)
+    error = str(context.exception)
+    self.assertIn('`start_index+length=50 > end_index=49` is disallowed', error)
+
+  def test_TimeSeriesGenerator_doesnt_miss_any_sample(self):
+    x = np.array([[i] for i in range(10)])
+
+    for length in range(3, 10):
+      g = keras.preprocessing.sequence.TimeseriesGenerator(
+          x, x, length=length, batch_size=1)
+      expected = max(0, len(x) - length)
+      actual = len(g)
+      self.assertEqual(expected, actual)
+
+      if actual > 0:
+        # All elements in range(length, 10) should be used as current step
+        expected = np.arange(length, 10).reshape(-1, 1)
+
+        y = np.concatenate([g[ix][1] for ix in range(len(g))], axis=0)
+        self.assertAllClose(y, expected)
+
+    x = np.array([[i] for i in range(23)])
+
+    strides = (1, 1, 5, 7, 3, 5, 3)
+    lengths = (3, 3, 4, 3, 1, 3, 7)
+    batch_sizes = (6, 6, 6, 5, 6, 6, 6)
+    shuffles = (False, True, True, False, False, False, False)
+
+    for stride, length, batch_size, shuffle in zip(strides, lengths,
+                                                   batch_sizes, shuffles):
+      g = keras.preprocessing.sequence.TimeseriesGenerator(
+          x,
+          x,
+          length=length,
+          sampling_rate=1,
+          stride=stride,
+          start_index=0,
+          end_index=None,
+          shuffle=shuffle,
+          reverse=False,
+          batch_size=batch_size)
+      if shuffle:
+        # all batches have the same size when shuffle is True.
+        expected_sequences = ceil(
+            (23 - length) / float(batch_size * stride)) * batch_size
+      else:
+        # last batch will be different if `(samples - length) / stride`
+        # is not a multiple of `batch_size`.
+        expected_sequences = ceil((23 - length) / float(stride))
+
+      expected_batches = ceil(expected_sequences / float(batch_size))
+
+      y = [g[ix][1] for ix in range(len(g))]
+
+      actual_sequences = sum(len(iy) for iy in y)
+      actual_batches = len(y)
+
+      self.assertEqual(expected_sequences, actual_sequences)
+      self.assertEqual(expected_batches, actual_batches)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text.py b/tensorflow/python/keras/preprocessing/text.py
similarity index 85%
rename from tensorflow/python/keras/_impl/keras/preprocessing/text.py
rename to tensorflow/python/keras/preprocessing/text.py
index f652f318f3d6dae20b1113a50cd02930abb851af..f3b57de257a58663f7eb30efb27638ce16b5c431 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text.py
+++ b/tensorflow/python/keras/preprocessing/text.py
@@ -42,13 +42,15 @@ def text_to_word_sequence(text,
                           filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                           lower=True,
                           split=' '):
-  """Converts a text to a sequence of words (or tokens).
+  r"""Converts a text to a sequence of words (or tokens).
 
   Arguments:
       text: Input text (string).
-      filters: Sequence of characters to filter out.
-      lower: Whether to convert the input to lowercase.
-      split: Sentence split marker (string).
+      filters: list (or concatenation) of characters to filter out, such as
+          punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+          includes basic punctuation, tabs, and newlines.
+      lower: boolean, whether to convert the input to lowercase.
+      split: string, separator for word splitting.
 
   Returns:
       A list of words (or tokens).
@@ -56,12 +58,21 @@ def text_to_word_sequence(text,
   if lower:
     text = text.lower()
 
-  if sys.version_info < (3,) and isinstance(text, unicode):
-    translate_map = dict((ord(c), unicode(split)) for c in filters)
+  if sys.version_info < (3,):
+    if isinstance(text, unicode):
+      translate_map = dict((ord(c), unicode(split)) for c in filters)
+      text = text.translate(translate_map)
+    elif len(split) == 1:
+      translate_map = maketrans(filters, split * len(filters))
+      text = text.translate(translate_map)
+    else:
+      for c in filters:
+        text = text.replace(c, split)
   else:
-    translate_map = maketrans(filters, split * len(filters))
+    translate_dict = dict((c, split) for c in filters)
+    translate_map = maketrans(translate_dict)
+    text = text.translate(translate_map)
 
-  text = text.translate(translate_map)
   seq = text.split(split)
   return [i for i in seq if i]
 
@@ -72,20 +83,23 @@ def one_hot(text,
             filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
             lower=True,
             split=' '):
-  """One-hot encodes a text into a list of word indexes of size n.
+  r"""One-hot encodes a text into a list of word indexes of size n.
 
   This is a wrapper to the `hashing_trick` function using `hash` as the
   hashing function; unicity of word to index mapping non-guaranteed.
 
   Arguments:
       text: Input text (string).
-      n: Dimension of the hashing space.
-      filters: Sequence of characters to filter out.
-      lower: Whether to convert the input to lowercase.
-      split: Sentence split marker (string).
+      n: int, size of vocabulary.
+      filters: list (or concatenation) of characters to filter out, such as
+          punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+          includes basic punctuation, tabs, and newlines.
+      lower: boolean, whether to set the text to lowercase.
+      split: string, separator for word splitting.
 
   Returns:
-      A list of integer word indices (unicity non-guaranteed).
+      List of integers in [1, n].
+      Each integer encodes a word (unicity non-guaranteed).
   """
   return hashing_trick(
       text, n, hash_function=hash, filters=filters, lower=lower, split=split)
@@ -98,19 +112,21 @@ def hashing_trick(text,
                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                   lower=True,
                   split=' '):
-  """Converts a text to a sequence of indexes in a fixed-size hashing space.
+  r"""Converts a text to a sequence of indexes in a fixed-size hashing space.
 
   Arguments:
       text: Input text (string).
       n: Dimension of the hashing space.
-      hash_function: if `None` uses python `hash` function, can be 'md5' or
+      hash_function: defaults to python `hash` function, can be 'md5' or
           any function that takes in input a string and returns a int.
-          Note that `hash` is not a stable hashing function, so
+          Note that 'hash' is not a stable hashing function, so
           it is not consistent across different runs, while 'md5'
           is a stable hashing function.
-      filters: Sequence of characters to filter out.
-      lower: Whether to convert the input to lowercase.
-      split: Sentence split marker (string).
+      filters: list (or concatenation) of characters to filter out, such as
+          punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+          includes basic punctuation, tabs, and newlines.
+      lower: boolean, whether to set the text to lowercase.
+      split: string, separator for word splitting.
 
   Returns:
       A list of integer word indices (unicity non-guaranteed).
@@ -150,7 +166,7 @@ class Tokenizer(object):
           filtered from the texts. The default is all punctuation, plus
           tabs and line breaks, minus the `'` character.
       lower: boolean. Whether to convert the texts to lowercase.
-      split: character or string to use for token splitting.
+      split: string, separator for word splitting.
       char_level: if True, every character will be treated as a token.
       oov_token: if given, it will be added to word_index and used to
           replace out-of-vocabulary words during text_to_sequence calls
diff --git a/tensorflow/python/keras/preprocessing/text/__init__.py b/tensorflow/python/keras/preprocessing/text/__init__.py
deleted file mode 100644
index 000ad68a0c01e9067f8852836ba5d502deb3fcd4..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/preprocessing/text/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras data preprocessing utils for text data."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.preprocessing.text import hashing_trick
-from tensorflow.python.keras._impl.keras.preprocessing.text import one_hot
-from tensorflow.python.keras._impl.keras.preprocessing.text import text_to_word_sequence
-from tensorflow.python.keras._impl.keras.preprocessing.text import Tokenizer
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py b/tensorflow/python/keras/preprocessing/text_test.py
similarity index 89%
rename from tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
rename to tensorflow/python/keras/preprocessing/text_test.py
index c6a267e57e4e2dc04156483d1cf85a42a78eb395..566fd3bb1a36392fdf30da4f4a46dd076acdd1e0 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
+++ b/tensorflow/python/keras/preprocessing/text_test.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
@@ -114,11 +114,21 @@ class TestText(test.TestCase):
     seq = keras.preprocessing.text.text_to_word_sequence(text)
     self.assertEqual(seq, ['hello', 'world'])
 
+  def test_text_to_word_sequence_multichar_split(self):
+    text = 'hello!stop?world!'
+    seq = keras.preprocessing.text.text_to_word_sequence(text, split='stop')
+    self.assertEqual(seq, ['hello', 'world'])
+
   def test_text_to_word_sequence_unicode(self):
     text = u'ali! veli? kırk dokuz elli'
     seq = keras.preprocessing.text.text_to_word_sequence(text)
     self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])
 
+  def test_text_to_word_sequence_unicode_multichar_split(self):
+    text = u'ali!stopveli?stopkırkstopdokuzstopelli'
+    seq = keras.preprocessing.text.text_to_word_sequence(text, split='stop')
+    self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])
+
   def test_tokenizer_unicode(self):
     texts = [
         u'ali veli kırk dokuz elli', u'ali veli kırk dokuz elli veli kırk dokuz'
diff --git a/tensorflow/python/keras/_impl/keras/regularizers.py b/tensorflow/python/keras/regularizers.py
similarity index 92%
rename from tensorflow/python/keras/_impl/keras/regularizers.py
rename to tensorflow/python/keras/regularizers.py
index 74c37d370ea630ca3c3e5e0945828f63928572e1..28b6ad4c65a2919323b81c89de6e5a3d4b5d3ff3 100644
--- a/tensorflow/python/keras/_impl/keras/regularizers.py
+++ b/tensorflow/python/keras/regularizers.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import six
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/regularizers/__init__.py b/tensorflow/python/keras/regularizers/__init__.py
deleted file mode 100644
index 3e707ccab577b5e28febd83d91f84d7b1c0d5d82..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/regularizers/__init__.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras built-in regularizers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# Regularizer functions / callable classes.
-from tensorflow.python.keras._impl.keras.regularizers import L1L2
-from tensorflow.python.keras._impl.keras.regularizers import Regularizer
-
-# Functional interface.
-# pylint: disable=g-bad-import-order
-from tensorflow.python.keras._impl.keras.regularizers import l1
-from tensorflow.python.keras._impl.keras.regularizers import l2
-from tensorflow.python.keras._impl.keras.regularizers import l1_l2
-
-# Auxiliary utils.
-from tensorflow.python.keras._impl.keras.regularizers import deserialize
-from tensorflow.python.keras._impl.keras.regularizers import serialize
-from tensorflow.python.keras._impl.keras.regularizers import get
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/regularizers_test.py b/tensorflow/python/keras/regularizers_test.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/regularizers_test.py
rename to tensorflow/python/keras/regularizers_test.py
index c4f04833ba51d85c6e174cca0f546133253bddee..e2075785d8061a44da1fbf1b435a15ec6a652e11 100644
--- a/tensorflow/python/keras/_impl/keras/regularizers_test.py
+++ b/tensorflow/python/keras/regularizers_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python import keras
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/testing_utils.py
rename to tensorflow/python/keras/testing_utils.py
index 60799ee1e038b4466351248bb5de7c8fc0de02a2..e7cb45d5e110dcb749ae2b1b86dd8dd5b8ded4ef 100644
--- a/tensorflow/python/keras/_impl/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl import keras
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 from tensorflow.python.util import tf_inspect
 
@@ -37,7 +37,6 @@ def get_test_data(train_samples,
     test_samples: Integer, how many test samples to generate.
     input_shape: Tuple of integers, shape of the inputs.
     num_classes: Integer, number of classes for the data and targets.
-      Only relevant if `classification=True`.
 
   Returns:
     A tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
@@ -45,7 +44,7 @@ def get_test_data(train_samples,
   num_sample = train_samples + test_samples
   templates = 2 * num_classes * np.random.random((num_classes,) + input_shape)
   y = np.random.randint(0, num_classes, size=(num_sample,))
-  x = np.zeros((num_sample,) + input_shape)
+  x = np.zeros((num_sample,) + input_shape, dtype=np.float32)
   for i in range(num_sample):
     x[i] = templates[y[i]] + np.random.normal(loc=0, scale=1., size=input_shape)
   return ((x[:train_samples], y[:train_samples]),
diff --git a/tensorflow/python/keras/utils/__init__.py b/tensorflow/python/keras/utils/__init__.py
index 2f74cf031d0520c8d874b7269c52e3b9e1b9931b..69337b6a8d52abd4caf2ada518fde51c407f8103 100644
--- a/tensorflow/python/keras/utils/__init__.py
+++ b/tensorflow/python/keras/utils/__init__.py
@@ -18,22 +18,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer
-from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
-from tensorflow.python.keras._impl.keras.utils.data_utils import SequenceEnqueuer
-from tensorflow.python.keras._impl.keras.utils.generic_utils import custom_object_scope
-from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import get_custom_objects
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
-from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.io_utils import HDF5Matrix
-from tensorflow.python.keras._impl.keras.utils.layer_utils import convert_all_kernels_in_model
-from tensorflow.python.keras._impl.keras.utils.multi_gpu_utils import multi_gpu_model
-from tensorflow.python.keras._impl.keras.utils.np_utils import normalize
-from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
-from tensorflow.python.keras._impl.keras.utils.vis_utils import plot_model
+from tensorflow.python.keras.utils.data_utils import GeneratorEnqueuer
+from tensorflow.python.keras.utils.data_utils import get_file
+from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer
+from tensorflow.python.keras.utils.data_utils import Sequence
+from tensorflow.python.keras.utils.data_utils import SequenceEnqueuer
+from tensorflow.python.keras.utils.generic_utils import custom_object_scope
+from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.keras.utils.generic_utils import get_custom_objects
+from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras.utils.io_utils import HDF5Matrix
+from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model
+from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model
+from tensorflow.python.keras.utils.np_utils import normalize
+from tensorflow.python.keras.utils.np_utils import to_categorical
+from tensorflow.python.keras.utils.vis_utils import plot_model
 
 del absolute_import
 del division
diff --git a/tensorflow/python/keras/_impl/keras/utils/conv_utils.py b/tensorflow/python/keras/utils/conv_utils.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/utils/conv_utils.py
rename to tensorflow/python/keras/utils/conv_utils.py
index 8882a3a46bcb9de7283a67f001e67ed8644a0cf7..5419e7ae0583abcf2e09d0bcc5b9526f2a9969bf 100644
--- a/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/utils/conv_utils.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 
-from tensorflow.python.keras._impl.keras import backend
+from tensorflow.python.keras import backend
 
 
 def convert_data_format(data_format, ndim):
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils.py b/tensorflow/python/keras/utils/data_utils.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/utils/data_utils.py
rename to tensorflow/python/keras/utils/data_utils.py
index 4c49544c6a63c4e5a0b79d31b074ad352c512bfa..a1f89d9d43400983baaf81d47aeb480d4d8f30c4 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/utils/data_utils.py
@@ -39,7 +39,7 @@ from six.moves.urllib.error import HTTPError
 from six.moves.urllib.error import URLError
 from six.moves.urllib.request import urlopen
 
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.util.tf_export import tf_export
 
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py b/tensorflow/python/keras/utils/data_utils_test.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/utils/data_utils_test.py
rename to tensorflow/python/keras/utils/data_utils_test.py
index 677e98e871d4a148b13c1aa22696917ed8dc90f9..395df7e0e786d510e785c3ed099905a91e09a149 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py
+++ b/tensorflow/python/keras/utils/data_utils_test.py
@@ -29,7 +29,7 @@ import numpy as np
 from six.moves.urllib.parse import urljoin
 from six.moves.urllib.request import pathname2url
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py b/tensorflow/python/keras/utils/generic_utils.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/utils/generic_utils.py
rename to tensorflow/python/keras/utils/generic_utils.py
index db184d278cfd1022fafcd55d7bb6a6544c82656d..a69893955f4f1cd7d4fafb1746019a59c240dd09 100644
--- a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/utils/generic_utils.py
@@ -349,7 +349,10 @@ class Progbar(object):
           self._values[k][0] += v * (current - self._seen_so_far)
           self._values[k][1] += (current - self._seen_so_far)
       else:
-        self._values[k] = v
+        # Stateful metrics output a numeric value. This representation
+        # means "take an average from a single value" but keeps the
+        # numeric formatting.
+        self._values[k] = [v, 1]
     self._seen_so_far = current
 
     now = time.time()
diff --git a/tensorflow/python/keras/_impl/keras/utils/generic_utils_test.py b/tensorflow/python/keras/utils/generic_utils_test.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/utils/generic_utils_test.py
rename to tensorflow/python/keras/utils/generic_utils_test.py
index d57692f4f41753fc38ead2ace7e989b499bc23ff..87bc19eb37d15d35bb8ad0f5d086404f9c4f55ca 100644
--- a/tensorflow/python/keras/_impl/keras/utils/generic_utils_test.py
+++ b/tensorflow/python/keras/utils/generic_utils_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/utils/io_utils.py
similarity index 100%
rename from tensorflow/python/keras/_impl/keras/utils/io_utils.py
rename to tensorflow/python/keras/utils/io_utils.py
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils_test.py b/tensorflow/python/keras/utils/io_utils_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/utils/io_utils_test.py
rename to tensorflow/python/keras/utils/io_utils_test.py
index cfeba188d3cadfa08efbd07fcbd46776b691e06f..3895dca68e37e1597b93d8eeded7e5cfb0d3e338 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils_test.py
+++ b/tensorflow/python/keras/utils/io_utils_test.py
@@ -23,7 +23,7 @@ import shutil
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 try:
diff --git a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py b/tensorflow/python/keras/utils/layer_utils.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/utils/layer_utils.py
rename to tensorflow/python/keras/utils/layer_utils.py
index 902972ecbb8fd69a9252b7e19e32bee5e33e4f97..bd61f8e9cccb1a0ba25dd9a449453df837b89ad2 100644
--- a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/utils/layer_utils.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.utils.conv_utils import convert_kernel
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils.conv_utils import convert_kernel
 from tensorflow.python.util.tf_export import tf_export
 
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils.py b/tensorflow/python/keras/utils/multi_gpu_utils.py
similarity index 77%
rename from tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils.py
rename to tensorflow/python/keras/utils/multi_gpu_utils.py
index 231ace2a0b4a4f25cebf06a5216cf3d30aadc49b..e5442f04e316c6c2ec6f814cf8ae2aad546dc7d7 100644
--- a/tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils.py
@@ -18,8 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.engine.training import Model
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.engine.training import Model
 from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -34,7 +34,7 @@ def _normalize_device_name(name):
 
 
 @tf_export('keras.utils.multi_gpu_model')
-def multi_gpu_model(model, gpus):
+def multi_gpu_model(model, gpus, cpu_merge=True, cpu_relocation=False):
   """Replicates a model on different GPUs.
 
   Specifically, this function implements single-machine
@@ -61,12 +61,18 @@ def multi_gpu_model(model, gpus):
           (see usage example below).
       gpus: Integer >= 2, number of on GPUs on which to create
           model replicas.
+      cpu_merge: A boolean value to identify whether to force
+          merging model weights under the scope of the CPU or not.
+      cpu_relocation: A boolean value to identify whether to
+          create the model's weights under the scope of the CPU.
+          If the model is not defined under any preceding device
+          scope, you can still rescue it by activating this option.
 
   Returns:
       A Keras `Model` instance which can be used just like the initial
       `model` argument, but which distributes its workload on multiple GPUs.
 
-  Example:
+  Example 1: Training models with weights merge on CPU
 
   ```python
       import tensorflow as tf
@@ -107,12 +113,45 @@ def multi_gpu_model(model, gpus):
       model.save('my_model.h5')
   ```
 
+  Example 2: Training models with weights merge on CPU using cpu_relocation
+
+  ```python
+       ..
+       # Not needed to change the device scope for model definition:
+       model = Xception(weights=None, ..)
+
+       try:
+           model = multi_gpu_model(model, cpu_relocation=True)
+           print("Training using multiple GPUs..")
+       except:
+           print("Training using single GPU or CPU..")
+
+       model.compile(..)
+       ..
+  ```
+
+  Example 3: Training models with weights merge on GPU (recommended for NV-link)
+
+  ```python
+       ..
+       # Not needed to change the device scope for model definition:
+       model = Xception(weights=None, ..)
+
+       try:
+           model = multi_gpu_model(model, cpu_merge=False)
+           print("Training using multiple GPUs..")
+       except:
+           print("Training using single GPU or CPU..")
+       model.compile(..)
+       ..
+  ```
+
   Raises:
     ValueError: if the `gpus` argument does not match available devices.
   """
   # pylint: disable=g-import-not-at-top
-  from tensorflow.python.keras._impl.keras.layers.core import Lambda
-  from tensorflow.python.keras._impl.keras.layers.merge import concatenate
+  from tensorflow.python.keras.layers.core import Lambda
+  from tensorflow.python.keras.layers.merge import concatenate
 
   if isinstance(gpus, (list, tuple)):
     if len(gpus) <= 1:
@@ -166,6 +205,12 @@ def multi_gpu_model(model, gpus):
     start = stride * i
     return array_ops.slice(data, start, size)
 
+  # Relocate the model definition under CPU device scope if needed
+  if cpu_relocation:
+    from tensorflow.python.keras.models import clone_model  # pylint: disable=g-import-not-at-top
+    with ops.device('/cpu:0'):
+      model = clone_model(model)
+
   all_outputs = []
   for i in range(len(model.outputs)):
     all_outputs.append([])
@@ -199,8 +244,8 @@ def multi_gpu_model(model, gpus):
         for o in range(len(outputs)):
           all_outputs[o].append(outputs[o])
 
-  # Merge outputs on CPU.
-  with ops.device('/cpu:0'):
+  # Merge outputs under expected scope.
+  with ops.device('/cpu:0' if cpu_merge else '/gpu:%d' % target_gpu_ids[0]):
     merged = []
     for name, outputs in zip(model.output_names, all_outputs):
       merged.append(concatenate(outputs, axis=0, name=name))
diff --git a/tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
similarity index 99%
rename from tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils_test.py
rename to tensorflow/python/keras/utils/multi_gpu_utils_test.py
index 0a38d6b5228fe791ce14adc7e37e0b7a6926fadf..77792d14f53d009c0bfc17273c034c37039106bf 100644
--- a/tensorflow/python/keras/_impl/keras/utils/multi_gpu_utils_test.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import data
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/np_utils.py b/tensorflow/python/keras/utils/np_utils.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/utils/np_utils.py
rename to tensorflow/python/keras/utils/np_utils.py
index a611be08aaed824ebb278b4b28ef52ea1872563b..9d9c72b162700cb3bca2cf83d56db30f8df1deb9 100644
--- a/tensorflow/python/keras/_impl/keras/utils/np_utils.py
+++ b/tensorflow/python/keras/utils/np_utils.py
@@ -43,7 +43,7 @@ def to_categorical(y, num_classes=None):
   if not num_classes:
     num_classes = np.max(y) + 1
   n = y.shape[0]
-  categorical = np.zeros((n, num_classes))
+  categorical = np.zeros((n, num_classes), dtype=np.float32)
   categorical[np.arange(n), y] = 1
   output_shape = input_shape + (num_classes,)
   categorical = np.reshape(categorical, output_shape)
diff --git a/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py b/tensorflow/python/keras/utils/np_utils_test.py
similarity index 97%
rename from tensorflow/python/keras/_impl/keras/utils/np_utils_test.py
rename to tensorflow/python/keras/utils/np_utils_test.py
index 1e974c2ef2aee3b6a83ad777673505f8c75b2b58..d77e76ff3ecb70bb19c0485e6e32940554e893d5 100644
--- a/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py
+++ b/tensorflow/python/keras/utils/np_utils_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
+from tensorflow.python import keras
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py
similarity index 100%
rename from tensorflow/python/keras/_impl/keras/utils/tf_utils.py
rename to tensorflow/python/keras/utils/tf_utils.py
diff --git a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py
similarity index 96%
rename from tensorflow/python/keras/_impl/keras/utils/vis_utils.py
rename to tensorflow/python/keras/utils/vis_utils.py
index 4761cece82c727e4962d0374f8efb80dfaeac3c6..7a454ac8314acdfa3c3e61c080acdd9efdf3acdc 100644
--- a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/utils/vis_utils.py
@@ -65,8 +65,8 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   Returns:
       A `pydot.Dot` instance representing the Keras model.
   """
-  from tensorflow.python.keras._impl.keras.layers.wrappers import Wrapper
-  from tensorflow.python.keras._impl.keras.models import Sequential
+  from tensorflow.python.keras.layers.wrappers import Wrapper
+  from tensorflow.python.keras.models import Sequential
 
   _check_pydot()
   dot = pydot.Dot()
@@ -77,7 +77,6 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   if isinstance(model, Sequential):
     if not model.built:
       model.build()
-    model = model.model
   layers = model.layers
 
   # Create graph nodes.
diff --git a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py b/tensorflow/python/keras/wrappers/scikit_learn.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
rename to tensorflow/python/keras/wrappers/scikit_learn.py
index 2884dc84cc5d99511947e6f0f97b0bf8a505221f..4462d94ecdb10c6f7306de1f552151e209394bac 100644
--- a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
+++ b/tensorflow/python/keras/wrappers/scikit_learn.py
@@ -23,9 +23,9 @@ import types
 
 import numpy as np
 
-from tensorflow.python.keras._impl.keras.models import Sequential
-from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
+from tensorflow.python.keras.models import Sequential
+from tensorflow.python.keras.utils.generic_utils import has_arg
+from tensorflow.python.keras.utils.np_utils import to_categorical
 from tensorflow.python.util.tf_export import tf_export
 
 
diff --git a/tensorflow/python/keras/wrappers/scikit_learn/__init__.py b/tensorflow/python/keras/wrappers/scikit_learn/__init__.py
deleted file mode 100644
index a46f859273ea0117e29a403057f9f81bc758dd52..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/wrappers/scikit_learn/__init__.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras scikit-learn API wrapper."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.keras._impl.keras.wrappers.scikit_learn import KerasClassifier
-from tensorflow.python.keras._impl.keras.wrappers.scikit_learn import KerasRegressor
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn_test.py b/tensorflow/python/keras/wrappers/scikit_learn_test.py
similarity index 98%
rename from tensorflow/python/keras/_impl/keras/wrappers/scikit_learn_test.py
rename to tensorflow/python/keras/wrappers/scikit_learn_test.py
index b20a84ee88b5b2b70ca2f718fbe86ffd6e949461..c322efdedf10e961f7590591ba0048e42492aaff 100644
--- a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn_test.py
+++ b/tensorflow/python/keras/wrappers/scikit_learn_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python import keras
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 INPUT_DIM = 5
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index c892b6ee9a0071a66556f78779ed91646f88d185..83b353600ad47c59759fff81c64fe5e8c2ed7926 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -741,6 +741,18 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "regex_full_match_op_test",
+    size = "small",
+    srcs = ["regex_full_match_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
 tf_py_test(
     name = "save_restore_ops_test",
     size = "small",
@@ -1222,6 +1234,7 @@ cuda_py_test(
     shard_count = 10,
     tags = [
         "noasan",  # times out
+        "optonly",  # times out
     ],
 )
 
@@ -2363,6 +2376,9 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    tags = [
+        "optonly",  # flaky timeouts unless optimized
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 7acca0a4a09deb89c9453aa1389c422a2ed43d81..08bf2d9c644bcde2a80e6138557dae6e19383dfd 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -998,11 +998,9 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
     v = resource_variable_ops.ResourceVariable(init_val)
     with self.test_session() as sess:
       sess.run(v.initializer)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "l-value dtype int32 does not match r-value dtype int64"):
+      with self.assertRaises(ValueError):
         sess.run(v[:].assign(too_large_val))
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaises(ValueError):
         sess.run(v[:].assign(too_small_val))
 
 
@@ -1042,7 +1040,6 @@ class ShapeSizeRankTest(test_util.TensorFlowTestCase):
         self.evaluate(array_ops.size(tensor, out_type=dtypes.int64)).dtype)
 
 
-@test_util.with_c_api
 class SequenceMaskTest(test_util.TensorFlowTestCase):
 
   def testExceptions(self):
@@ -1065,10 +1062,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
       # test dtype and default maxlen:
       res = array_ops.sequence_mask(constant_op.constant([0, 1, 4]),
                                     dtype=dtypes.float32)
-      if ops._USE_C_API:
-        self.assertAllEqual(res.get_shape().as_list(), [3, 4])
-      else:
-        self.assertAllEqual(res.get_shape().as_list(), [3, None])
+      self.assertAllEqual(res.get_shape().as_list(), [3, 4])
       self.assertAllEqual(
           res.eval(),
           [[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
@@ -1078,10 +1072,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
     with self.test_session():
       res = array_ops.sequence_mask(
           constant_op.constant([0, 1, 4]))
-      if ops._USE_C_API:
-        self.assertAllEqual(res.get_shape().as_list(), [3, 4])
-      else:
-        self.assertAllEqual(res.get_shape().as_list(), [3, None])
+      self.assertAllEqual(res.get_shape().as_list(), [3, 4])
       self.assertAllEqual(
           res.eval(),
           [[False, False, False, False],
@@ -1100,10 +1091,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
       # test dtype and default maxlen:
       res = array_ops.sequence_mask(
           constant_op.constant([[0, 1, 4], [1, 2, 3]]), dtype=dtypes.float32)
-      if ops._USE_C_API:
-        self.assertAllEqual(res.get_shape().as_list(), [2, 3, 4])
-      else:
-        self.assertAllEqual(res.get_shape().as_list(), [2, 3, None])
+      self.assertAllEqual(res.get_shape().as_list(), [2, 3, 4])
       self.assertAllEqual(
           res.eval(),
           [[[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]],
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
index 54f33f336015cc9cb50658941b8e157cc1b94df9..92cd53a031e73d4ff4ac50c2465f32a2c20545a7 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -792,6 +792,28 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
 class PredictionOpsTest(test_util.TensorFlowTestCase):
   """Tests prediction ops for inference."""
 
+  def testPredictionOnEmptyEnsemble(self):
+    """Tests that prediction on a empty ensemble does not fail."""
+    with self.test_session() as session:
+      # Create an empty ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto='')
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+      expected_logits = [[0.0], [0.0]]
+
+      # Prediction should work fine.
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
   def testPredictionMultipleTree(self):
     """Tests the predictions work when we have multiple trees."""
     with self.test_session() as session:
@@ -893,16 +915,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       #            logit= 0.1*1.14+0.2*7.0-1*7.0
       expected_logits = [[6.114], [-5.486]]
 
-      # Do with parallelization, e.g. EVAL
-      predict_op = boosted_trees_ops.predict(
-          tree_ensemble_handle,
-          bucketized_features=[feature_0_values, feature_1_values],
-          logits_dimension=1)
-
-      logits = session.run(predict_op)
-      self.assertAllClose(expected_logits, logits)
-
-      # Do without parallelization, e.g. INFER - the result is the same
+      # Prediction should work fine.
       predict_op = boosted_trees_ops.predict(
           tree_ensemble_handle,
           bucketized_features=[feature_0_values, feature_1_values],
diff --git a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
index d6c004774746dd28a7b376eb2e0564e5b71e5b40..13b804875e94a9f8acc9c441ba2525876a3ef58f 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
@@ -1379,7 +1379,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         }
         post_pruned_nodes_meta {
           new_node_id: 0
-          logit_change: -24.0143
+          logit_change: -24.014299
         }
       }
       tree_metadata {
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index e08123b0417912c479476d8147d832d1715b8882..fb52d10475fa47f37b1ee7de97b49878b5d13341 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -18,9 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
@@ -414,6 +417,16 @@ class ClipTest(test.TestCase):
 
     self.assertAllClose(np_ans, tf_ans)
 
+  def testClipByValueEmptyTensor(self):
+    # Test case for GitHub issue 19337
+    zero = array_ops.placeholder(dtype=dtypes.float32, shape=None)
+    x = clip_ops.clip_by_value(zero, zero, zero)
+    y = clip_ops.clip_by_value(zero, 1.0, 1.0)
+    z = clip_ops.clip_by_value(zero, zero, 1.0)
+    w = clip_ops.clip_by_value(zero, 1.0, zero)
+    with self.test_session(use_gpu=True) as sess:
+      sess.run([x, y, z, w], feed_dict={zero: np.zeros((7, 0))})
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 77e6f5f1a0d645101584f0eeec2ced80459197ba..68873df97ea2a632d98de4936a20a1f81bce93e9 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -38,7 +38,6 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
@@ -123,7 +122,6 @@ def isum(s, maximum_iterations=None):
   return r_s
 
 
-@test_util.with_c_api
 class ControlFlowTest(test.TestCase):
 
   def testRefIdentity(self):
@@ -1847,6 +1845,23 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(math_ops.less(1, 2), fn1, lambda: x)
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
 
+  def testGradInWhileWrtInitialLoopVal(self):
+    with self.test_session():
+      x = array_ops.placeholder(dtypes.float32, shape=(), name="x")
+      y = x + 1
+
+      def body(i, v):
+        z = v * 2
+        return i + 1, gradients_impl.gradients(z, x)[0]
+
+      with self.assertRaisesRegexp(
+          ValueError,
+          "Cannot compute gradient inside while loop with respect to op 'x'. "
+          "We do not support taking the gradient wrt or through the initial "
+          "value of a loop variable. Gradients can be computed through "
+          "loop invariants or wrt the input parameters to the loop body."):
+        control_flow_ops.while_loop(lambda i, x: i < 3, body, [0, y])
+
   def testWhileGradInWhile(self):
     with self.test_session():
       n = ops.convert_to_tensor(1.0, name="n")
@@ -2930,7 +2945,6 @@ class ControlFlowTest(test.TestCase):
           1)
 
 
-@test_util.with_c_api
 class ControlFlowContextCheckTest(test.TestCase):
 
   def _getWhileTensor(self):
@@ -3050,7 +3064,6 @@ class ControlFlowContextCheckTest(test.TestCase):
           math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
 
 
-@test_util.with_c_api
 class TupleTest(test.TestCase):
 
   def testTensors(self):
@@ -3136,7 +3149,6 @@ class TupleTest(test.TestCase):
       self.assertEquals(1, var.eval())
 
 
-@test_util.with_c_api
 class AssertTest(test.TestCase):
 
   def testGuardedAssertDoesNotCopyWhenTrue(self):
@@ -3176,7 +3188,6 @@ class AssertTest(test.TestCase):
       self.assertEqual([], guarded_memcpy_nodestat_names)
 
 
-@test_util.with_c_api
 class WhileOpBenchmark(test.Benchmark):
   """Evaluate the performance of while_loop op."""
 
@@ -3291,7 +3302,6 @@ class WhileOpBenchmark(test.Benchmark):
         name="unroll_same_device", iters=iters, wall_time=duration)
 
 
-@test_util.with_c_api
 class EagerTest(test.TestCase):
 
   def testCond(self):
diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py
index 39e96f74b0461da0cf499e303b30a4a41aae4899..762c445da05008a78fec1ec9e1cc7186e1539134 100644
--- a/tensorflow/python/kernel_tests/control_flow_util_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_util_test.py
@@ -19,9 +19,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.platform import test
 
@@ -66,6 +70,111 @@ class ControlFlowUtilTest(test.TestCase):
 
     self.assertFalse(control_flow_util.IsLoopExit(test_ops.int_output().op))
 
+  def build_test_graph(self):
+    g = ops.Graph()
+    with g.as_default():
+
+      def while_loop(x):
+
+        def b(x):
+          with ops.name_scope("NestedCond"):
+            return control_flow_ops.cond(
+                math_ops.less(x, 100), lambda: math_ops.add(x, 1),
+                lambda: math_ops.add(x, 2))
+
+        c = lambda x: math_ops.less(x, 10000)
+        with ops.name_scope("OuterWhile"):
+          return control_flow_ops.while_loop(c, b, [x])
+
+      x = array_ops.placeholder(dtypes.int32)
+      with ops.name_scope("OuterCond"):
+        control_flow_ops.cond(
+            math_ops.less(x, 1000), lambda: while_loop(x),
+            lambda: math_ops.add(x, 2))
+    return g
+
+  def testIsCondSwitch(self):
+    g = self.build_test_graph()
+
+    cond_switch = [
+        "OuterCond/cond/Switch",
+        "OuterCond/cond/OuterWhile/while/Switch",
+        "OuterCond/cond/OuterWhile/while/NestedCond/cond/Switch",
+        "OuterCond/cond/OuterWhile/while/NestedCond/cond/Add/Switch",
+        "OuterCond/cond/OuterWhile/while/NestedCond/cond/Add_1/Switch",
+        "OuterCond/cond/Add/Switch",
+    ]
+    for n in g.get_operations():
+      if control_flow_util.IsSwitch(n):
+        self.assertTrue(
+            control_flow_util.IsCondSwitch(n) != control_flow_util.IsLoopSwitch(
+                n))
+      if n.name in cond_switch:
+        self.assertTrue(control_flow_util.IsSwitch(n))
+        self.assertTrue(
+            control_flow_util.IsCondSwitch(n),
+            msg="Mismatch for {}".format(n.name))
+        self.assertFalse(
+            control_flow_util.IsLoopSwitch(n),
+            msg="Mismatch for {}".format(n.name))
+      else:
+        self.assertFalse(
+            control_flow_util.IsCondSwitch(n),
+            msg="Mismatch for {}".format(n.name))
+
+  def testIsLoopSwitch(self):
+    g = self.build_test_graph()
+
+    loop_switch = ["OuterCond/cond/OuterWhile/while/Switch_1"]
+    for n in g.get_operations():
+      if control_flow_util.IsSwitch(n):
+        self.assertTrue(
+            control_flow_util.IsCondSwitch(n) != control_flow_util.IsLoopSwitch(
+                n))
+      if n.name in loop_switch:
+        self.assertTrue(control_flow_util.IsSwitch(n))
+        self.assertFalse(
+            control_flow_util.IsCondSwitch(n),
+            msg="Mismatch for {}".format(n.name))
+        self.assertTrue(
+            control_flow_util.IsLoopSwitch(n),
+            msg="Mismatch for {}".format(n.name))
+      else:
+        self.assertFalse(
+            control_flow_util.IsLoopSwitch(n),
+            msg="Mismatch for {}".format(n.name))
+
+  def testIsCondMerge(self):
+    g = self.build_test_graph()
+    cond_merges = [
+        "OuterCond/cond/OuterWhile/while/NestedCond/cond/Merge",
+        "OuterCond/cond/Merge"
+    ]
+    for n in g.get_operations():
+      if n.name in cond_merges:
+        self.assertTrue(control_flow_util.IsMerge(n))
+        self.assertTrue(control_flow_util.IsCondMerge(n))
+        self.assertFalse(control_flow_util.IsLoopMerge(n))
+      else:
+        self.assertFalse(control_flow_util.IsCondMerge(n))
+        self.assertTrue(not control_flow_util.IsMerge(n) or
+                        control_flow_util.IsLoopMerge(n))
+
+  def testIsLoopMerge(self):
+    g = self.build_test_graph()
+    loop_merges = [
+        "OuterCond/cond/OuterWhile/while/Merge",
+    ]
+    for n in g.get_operations():
+      if n.name in loop_merges:
+        self.assertTrue(control_flow_util.IsMerge(n))
+        self.assertFalse(control_flow_util.IsCondMerge(n))
+        self.assertTrue(control_flow_util.IsLoopMerge(n))
+      else:
+        self.assertFalse(control_flow_util.IsLoopMerge(n))
+        self.assertTrue(not control_flow_util.IsMerge(n) or
+                        control_flow_util.IsCondMerge(n))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index e2e6205911caa06b52f21658a91a53d60a0130ff..fcba456004407b1045400119e278b78f78518dfb 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -31,9 +31,7 @@ class Conv1DTest(test.TestCase):
 
   def testBasic(self):
     """Test that argument passing to conv1d is handled properly."""
-    # TODO(yongtang): dtypes.float64 can only be enabled once conv2d support
-    # dtypes.float64, as conv1d implicitly calls conv2d after expand_dims.
-    for dtype in [dtypes.float16, dtypes.float32]:
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant([1, 2, 3, 4], dtype=dtype)
       x = array_ops.expand_dims(x, 0)  # Add batch dimension
       x = array_ops.expand_dims(x, 2)  # And depth dimension
diff --git a/tensorflow/python/kernel_tests/conv2d_transpose_test.py b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
index b692d3da609fd97a55b8f5fce3334b8e9d97c827..27804be65ca9a596a520c5e4a6a83a517d10b4dd 100644
--- a/tensorflow/python/kernel_tests/conv2d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
@@ -23,6 +23,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
@@ -292,6 +293,7 @@ class Conv2DTransposeTest(test.TestCase):
 
         self.assertAllClose(cache_values, value)
 
+  @test_util.enable_c_shapes
   def testConv2DTransposeShapeInference(self):
     # Test case for 8972
     initializer = random_ops.truncated_normal(
@@ -301,7 +303,8 @@ class Conv2DTransposeTest(test.TestCase):
     f_shape = array_ops.stack([array_ops.shape(x)[0], 10, 5, 5])
     output = nn_ops.conv2d_transpose(
         x, f, f_shape, strides=[1, 1, 1, 1], padding="SAME")
-    self.assertEqual(output.get_shape().as_list(), [None, 10, 5, 5])
+    self.assertEqual(output.get_shape().as_list(), [3, 10, 5, 5])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/conv3d_transpose_test.py b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
index 8973a450fa246e3c924f4352d72a1bbd4f7851ea..289ae29fcec7249b6a0f7f891fa63e53cdc6a713 100644
--- a/tensorflow/python/kernel_tests/conv3d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
@@ -131,6 +131,23 @@ class Conv3DTransposeTest(test.TestCase):
     nn_ops.conv3d_transpose(
         x_value, f_value, y_shape, strides, data_format='NCDHW')
 
+  def testConv3DTransposeOutputShapeType(self):
+    # Test case for GitHub issue 18887
+    for dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session():
+        x_shape = [2, 5, 6, 4, 3]
+        y_shape = [2, 5, 6, 4, 2]
+        f_shape = [3, 3, 3, 2, 3]
+        strides = [1, 1, 1, 1, 1]
+        x_value = constant_op.constant(
+            1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+        f_value = constant_op.constant(
+            1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+        output = nn_ops.conv3d_transpose(
+            x_value, f_value, constant_op.constant(y_shape, dtype=dtype),
+            strides=strides, padding="SAME")
+        output.eval()
+
   def testConv3DTransposeValid(self):
     with self.test_session():
       strides = [1, 2, 2, 2, 1]
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 659dc0419a061b89e37330c95c4e19d179e116da..5e223b18281ed9c06a3f72a16b6d22290851f37b 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -355,7 +355,7 @@ class DepthwiseConv2DTest(test.TestCase):
     graph = ops.get_default_graph()
     with self.test_session(graph=graph, use_gpu=use_gpu) as sess:
       tolerance = {
-          dtypes.float16: 2e-0,
+          dtypes.float16: 4e-0,
           dtypes.float32: 5e-4,
           dtypes.float64: 1e-12,
       }[data_type]
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
index f3cc9636f91f7d3573d8a66d6b1b4936e49a9141..cf2e8832fd5225e4d4be617a97b355bb410084c2 100644
--- a/tensorflow/python/kernel_tests/distributions/BUILD
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -41,6 +41,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    shard_count = 3,
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
index 09812db8166567403dc966ac9cb4304be0740e50..095d1cde1530f15fd2a7ff4cb7f56424f276be5a 100644
--- a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions import bernoulli
 from tensorflow.python.ops.distributions import kullback_leibler
@@ -56,59 +57,65 @@ def entropy(p):
 
 class BernoulliTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testP(self):
     p = [0.2, 0.4]
     dist = bernoulli.Bernoulli(probs=p)
     with self.test_session():
-      self.assertAllClose(p, dist.probs.eval())
+      self.assertAllClose(p, self.evaluate(dist.probs))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testLogits(self):
     logits = [-42., 42.]
     dist = bernoulli.Bernoulli(logits=logits)
     with self.test_session():
-      self.assertAllClose(logits, dist.logits.eval())
+      self.assertAllClose(logits, self.evaluate(dist.logits))
 
     if not special:
       return
 
     with self.test_session():
-      self.assertAllClose(special.expit(logits), dist.probs.eval())
+      self.assertAllClose(special.expit(logits), self.evaluate(dist.probs))
 
     p = [0.01, 0.99, 0.42]
     dist = bernoulli.Bernoulli(probs=p)
     with self.test_session():
-      self.assertAllClose(special.logit(p), dist.logits.eval())
+      self.assertAllClose(special.logit(p), self.evaluate(dist.logits))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testInvalidP(self):
     invalid_ps = [1.01, 2.]
     for p in invalid_ps:
       with self.test_session():
         with self.assertRaisesOpError("probs has components greater than 1"):
           dist = bernoulli.Bernoulli(probs=p, validate_args=True)
-          dist.probs.eval()
+          self.evaluate(dist.probs)
 
     invalid_ps = [-0.01, -3.]
     for p in invalid_ps:
       with self.test_session():
         with self.assertRaisesOpError("Condition x >= 0"):
           dist = bernoulli.Bernoulli(probs=p, validate_args=True)
-          dist.probs.eval()
+          self.evaluate(dist.probs)
 
     valid_ps = [0.0, 0.5, 1.0]
     for p in valid_ps:
       with self.test_session():
         dist = bernoulli.Bernoulli(probs=p)
-        self.assertEqual(p, dist.probs.eval())  # Should not fail
+        self.assertEqual(p, self.evaluate(dist.probs))  # Should not fail
 
+  @test_util.run_in_graph_and_eager_modes()
   def testShapes(self):
     with self.test_session():
       for batch_shape in ([], [1], [2, 3, 4]):
         dist = make_bernoulli(batch_shape)
         self.assertAllEqual(batch_shape, dist.batch_shape.as_list())
-        self.assertAllEqual(batch_shape, dist.batch_shape_tensor().eval())
+        self.assertAllEqual(batch_shape,
+                            self.evaluate(dist.batch_shape_tensor()))
         self.assertAllEqual([], dist.event_shape.as_list())
-        self.assertAllEqual([], dist.event_shape_tensor().eval())
+        self.assertAllEqual([], self.evaluate(dist.event_shape_tensor()))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testDtype(self):
     dist = make_bernoulli([])
     self.assertEqual(dist.dtype, dtypes.int32)
@@ -126,6 +133,7 @@ class BernoulliTest(test.TestCase):
     self.assertEqual(dist64.dtype, dist64.sample(5).dtype)
     self.assertEqual(dist64.dtype, dist64.mode().dtype)
 
+  @test_util.run_in_graph_and_eager_modes()
   def _testPmf(self, **kwargs):
     dist = bernoulli.Bernoulli(**kwargs)
     with self.test_session():
@@ -147,8 +155,9 @@ class BernoulliTest(test.TestCase):
       # pylint: enable=bad-continuation
 
       for x, expected_pmf in zip(xs, expected_pmfs):
-        self.assertAllClose(dist.prob(x).eval(), expected_pmf)
-        self.assertAllClose(dist.log_prob(x).eval(), np.log(expected_pmf))
+        self.assertAllClose(self.evaluate(dist.prob(x)), expected_pmf)
+        self.assertAllClose(
+            self.evaluate(dist.log_prob(x)), np.log(expected_pmf))
 
   def testPmfCorrectBroadcastDynamicShape(self):
     with self.test_session():
@@ -165,15 +174,17 @@ class BernoulliTest(test.TestCase):
               p: [0.2, 0.3, 0.4]
           }), [[0.2, 0.7, 0.4]])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPmfInvalid(self):
     p = [0.1, 0.2, 0.7]
     with self.test_session():
       dist = bernoulli.Bernoulli(probs=p, validate_args=True)
       with self.assertRaisesOpError("must be non-negative."):
-        dist.prob([1, 1, -1]).eval()
+        self.evaluate(dist.prob([1, 1, -1]))
       with self.assertRaisesOpError("Elements cannot exceed 1."):
-        dist.prob([2, 0, 1]).eval()
+        self.evaluate(dist.prob([2, 0, 1]))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPmfWithP(self):
     p = [[0.2, 0.4], [0.3, 0.6]]
     self._testPmf(probs=p)
@@ -203,7 +214,7 @@ class BernoulliTest(test.TestCase):
 
     with self.test_session():
       dist = bernoulli.Bernoulli(probs=0.5)
-      self.assertEqual(2, len(dist.log_prob([[1], [1]]).eval().shape))
+      self.assertEqual(2, len(self.evaluate(dist.log_prob([[1], [1]])).shape))
 
     with self.test_session():
       dist = bernoulli.Bernoulli(probs=0.5)
@@ -215,25 +226,31 @@ class BernoulliTest(test.TestCase):
       dist = bernoulli.Bernoulli(probs=[[0.5], [0.5]])
       self.assertEqual((2, 1), dist.log_prob(1).get_shape())
 
+  @test_util.run_in_graph_and_eager_modes()
   def testBoundaryConditions(self):
     with self.test_session():
       dist = bernoulli.Bernoulli(probs=1.0)
-      self.assertAllClose(np.nan, dist.log_prob(0).eval())
-      self.assertAllClose([np.nan], [dist.log_prob(1).eval()])
+      self.assertAllClose(np.nan, self.evaluate(dist.log_prob(0)))
+      self.assertAllClose([np.nan], [self.evaluate(dist.log_prob(1))])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testEntropyNoBatch(self):
     p = 0.2
     dist = bernoulli.Bernoulli(probs=p)
     with self.test_session():
-      self.assertAllClose(dist.entropy().eval(), entropy(p))
+      self.assertAllClose(self.evaluate(dist.entropy()), entropy(p))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testEntropyWithBatch(self):
     p = [[0.1, 0.7], [0.2, 0.6]]
     dist = bernoulli.Bernoulli(probs=p, validate_args=False)
     with self.test_session():
-      self.assertAllClose(dist.entropy().eval(), [[entropy(0.1), entropy(0.7)],
-                                                  [entropy(0.2), entropy(0.6)]])
+      self.assertAllClose(
+          self.evaluate(dist.entropy()),
+          [[entropy(0.1), entropy(0.7)], [entropy(0.2),
+                                          entropy(0.6)]])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSampleN(self):
     with self.test_session():
       p = [0.2, 0.6]
@@ -242,7 +259,7 @@ class BernoulliTest(test.TestCase):
       samples = dist.sample(n)
       samples.set_shape([n, 2])
       self.assertEqual(samples.dtype, dtypes.int32)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertTrue(np.all(sample_values >= 0))
       self.assertTrue(np.all(sample_values <= 1))
       # Note that the standard error for the sample mean is ~ sqrt(p * (1 - p) /
@@ -262,51 +279,54 @@ class BernoulliTest(test.TestCase):
       n = 1000
       seed = 42
       self.assertAllEqual(
-          dist.sample(n, seed).eval(), dist.sample(n, seed).eval())
+          self.evaluate(dist.sample(n, seed)),
+          self.evaluate(dist.sample(n, seed)))
       n = array_ops.placeholder(dtypes.int32)
       sample, sample = sess.run([dist.sample(n, seed), dist.sample(n, seed)],
                                 feed_dict={n: 1000})
       self.assertAllEqual(sample, sample)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testMean(self):
     with self.test_session():
       p = np.array([[0.2, 0.7], [0.5, 0.4]], dtype=np.float32)
       dist = bernoulli.Bernoulli(probs=p)
-      self.assertAllEqual(dist.mean().eval(), p)
+      self.assertAllEqual(self.evaluate(dist.mean()), p)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testVarianceAndStd(self):
     var = lambda p: p * (1. - p)
     with self.test_session():
       p = [[0.2, 0.7], [0.5, 0.4]]
       dist = bernoulli.Bernoulli(probs=p)
       self.assertAllClose(
-          dist.variance().eval(),
+          self.evaluate(dist.variance()),
           np.array(
               [[var(0.2), var(0.7)], [var(0.5), var(0.4)]], dtype=np.float32))
       self.assertAllClose(
-          dist.stddev().eval(),
+          self.evaluate(dist.stddev()),
           np.array(
               [[np.sqrt(var(0.2)), np.sqrt(var(0.7))],
                [np.sqrt(var(0.5)), np.sqrt(var(0.4))]],
               dtype=np.float32))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testBernoulliBernoulliKL(self):
-    with self.test_session() as sess:
-      batch_size = 6
-      a_p = np.array([0.5] * batch_size, dtype=np.float32)
-      b_p = np.array([0.4] * batch_size, dtype=np.float32)
+    batch_size = 6
+    a_p = np.array([0.5] * batch_size, dtype=np.float32)
+    b_p = np.array([0.4] * batch_size, dtype=np.float32)
 
-      a = bernoulli.Bernoulli(probs=a_p)
-      b = bernoulli.Bernoulli(probs=b_p)
+    a = bernoulli.Bernoulli(probs=a_p)
+    b = bernoulli.Bernoulli(probs=b_p)
 
-      kl = kullback_leibler.kl_divergence(a, b)
-      kl_val = sess.run(kl)
+    kl = kullback_leibler.kl_divergence(a, b)
+    kl_val = self.evaluate(kl)
 
-      kl_expected = (a_p * np.log(a_p / b_p) + (1. - a_p) * np.log(
-          (1. - a_p) / (1. - b_p)))
+    kl_expected = (a_p * np.log(a_p / b_p) + (1. - a_p) * np.log(
+        (1. - a_p) / (1. - b_p)))
 
-      self.assertEqual(kl.get_shape(), (batch_size,))
-      self.assertAllClose(kl_val, kl_expected)
+    self.assertEqual(kl.get_shape(), (batch_size,))
+    self.assertAllClose(kl_val, kl_expected)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/distributions/beta_test.py b/tensorflow/python/kernel_tests/distributions/beta_test.py
index ab5041a6eb477ce231acbd1e6041c354ee17409b..4bc8303ebb6939f3f8e2637120b6510c225c2f12 100644
--- a/tensorflow/python/kernel_tests/distributions/beta_test.py
+++ b/tensorflow/python/kernel_tests/distributions/beta_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops.distributions import beta as beta_lib
@@ -45,6 +46,7 @@ special = try_import("scipy.special")
 stats = try_import("scipy.stats")
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BetaTest(test.TestCase):
 
   def testSimpleShapes(self):
@@ -52,8 +54,8 @@ class BetaTest(test.TestCase):
       a = np.random.rand(3)
       b = np.random.rand(3)
       dist = beta_lib.Beta(a, b)
-      self.assertAllEqual([], dist.event_shape_tensor().eval())
-      self.assertAllEqual([3], dist.batch_shape_tensor().eval())
+      self.assertAllEqual([], self.evaluate(dist.event_shape_tensor()))
+      self.assertAllEqual([3], self.evaluate(dist.batch_shape_tensor()))
       self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([3]), dist.batch_shape)
 
@@ -62,8 +64,8 @@ class BetaTest(test.TestCase):
       a = np.random.rand(3, 2, 2)
       b = np.random.rand(3, 2, 2)
       dist = beta_lib.Beta(a, b)
-      self.assertAllEqual([], dist.event_shape_tensor().eval())
-      self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval())
+      self.assertAllEqual([], self.evaluate(dist.event_shape_tensor()))
+      self.assertAllEqual([3, 2, 2], self.evaluate(dist.batch_shape_tensor()))
       self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
       self.assertEqual(
           tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape)
@@ -73,8 +75,8 @@ class BetaTest(test.TestCase):
       a = np.random.rand(3, 2, 2)
       b = np.random.rand(2, 2)
       dist = beta_lib.Beta(a, b)
-      self.assertAllEqual([], dist.event_shape_tensor().eval())
-      self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval())
+      self.assertAllEqual([], self.evaluate(dist.event_shape_tensor()))
+      self.assertAllEqual([3, 2, 2], self.evaluate(dist.batch_shape_tensor()))
       self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
       self.assertEqual(
           tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape)
@@ -85,7 +87,7 @@ class BetaTest(test.TestCase):
     with self.test_session():
       dist = beta_lib.Beta(a, b)
       self.assertEqual([1, 3], dist.concentration1.get_shape())
-      self.assertAllClose(a, dist.concentration1.eval())
+      self.assertAllClose(a, self.evaluate(dist.concentration1))
 
   def testBetaProperty(self):
     a = [[1., 2, 3]]
@@ -93,24 +95,24 @@ class BetaTest(test.TestCase):
     with self.test_session():
       dist = beta_lib.Beta(a, b)
       self.assertEqual([1, 3], dist.concentration0.get_shape())
-      self.assertAllClose(b, dist.concentration0.eval())
+      self.assertAllClose(b, self.evaluate(dist.concentration0))
 
   def testPdfXProper(self):
     a = [[1., 2, 3]]
     b = [[2., 4, 3]]
     with self.test_session():
       dist = beta_lib.Beta(a, b, validate_args=True)
-      dist.prob([.1, .3, .6]).eval()
-      dist.prob([.2, .3, .5]).eval()
+      self.evaluate(dist.prob([.1, .3, .6]))
+      self.evaluate(dist.prob([.2, .3, .5]))
       # Either condition can trigger.
       with self.assertRaisesOpError("sample must be positive"):
-        dist.prob([-1., 0.1, 0.5]).eval()
+        self.evaluate(dist.prob([-1., 0.1, 0.5]))
       with self.assertRaisesOpError("sample must be positive"):
-        dist.prob([0., 0.1, 0.5]).eval()
+        self.evaluate(dist.prob([0., 0.1, 0.5]))
       with self.assertRaisesOpError("sample must be less than `1`"):
-        dist.prob([.1, .2, 1.2]).eval()
+        self.evaluate(dist.prob([.1, .2, 1.2]))
       with self.assertRaisesOpError("sample must be less than `1`"):
-        dist.prob([.1, .2, 1.0]).eval()
+        self.evaluate(dist.prob([.1, .2, 1.0]))
 
   def testPdfTwoBatches(self):
     with self.test_session():
@@ -119,7 +121,7 @@ class BetaTest(test.TestCase):
       x = [.5, .5]
       dist = beta_lib.Beta(a, b)
       pdf = dist.prob(x)
-      self.assertAllClose([1., 3. / 2], pdf.eval())
+      self.assertAllClose([1., 3. / 2], self.evaluate(pdf))
       self.assertEqual((2,), pdf.get_shape())
 
   def testPdfTwoBatchesNontrivialX(self):
@@ -129,7 +131,7 @@ class BetaTest(test.TestCase):
       x = [.3, .7]
       dist = beta_lib.Beta(a, b)
       pdf = dist.prob(x)
-      self.assertAllClose([1, 63. / 50], pdf.eval())
+      self.assertAllClose([1, 63. / 50], self.evaluate(pdf))
       self.assertEqual((2,), pdf.get_shape())
 
   def testPdfUniformZeroBatch(self):
@@ -140,7 +142,7 @@ class BetaTest(test.TestCase):
       x = np.array([.1, .2, .3, .5, .8], dtype=np.float32)
       dist = beta_lib.Beta(a, b)
       pdf = dist.prob(x)
-      self.assertAllClose([1.] * 5, pdf.eval())
+      self.assertAllClose([1.] * 5, self.evaluate(pdf))
       self.assertEqual((5,), pdf.get_shape())
 
   def testPdfAlphaStretchedInBroadcastWhenSameRank(self):
@@ -150,7 +152,7 @@ class BetaTest(test.TestCase):
       x = [[.5, .5], [.3, .7]]
       dist = beta_lib.Beta(a, b)
       pdf = dist.prob(x)
-      self.assertAllClose([[1., 3. / 2], [1., 63. / 50]], pdf.eval())
+      self.assertAllClose([[1., 3. / 2], [1., 63. / 50]], self.evaluate(pdf))
       self.assertEqual((2, 2), pdf.get_shape())
 
   def testPdfAlphaStretchedInBroadcastWhenLowerRank(self):
@@ -159,7 +161,7 @@ class BetaTest(test.TestCase):
       b = [1., 2]
       x = [[.5, .5], [.2, .8]]
       pdf = beta_lib.Beta(a, b).prob(x)
-      self.assertAllClose([[1., 3. / 2], [1., 24. / 25]], pdf.eval())
+      self.assertAllClose([[1., 3. / 2], [1., 24. / 25]], self.evaluate(pdf))
       self.assertEqual((2, 2), pdf.get_shape())
 
   def testPdfXStretchedInBroadcastWhenSameRank(self):
@@ -168,7 +170,7 @@ class BetaTest(test.TestCase):
       b = [[1., 2], [2., 3]]
       x = [[.5, .5]]
       pdf = beta_lib.Beta(a, b).prob(x)
-      self.assertAllClose([[1., 3. / 2], [3. / 2, 15. / 8]], pdf.eval())
+      self.assertAllClose([[1., 3. / 2], [3. / 2, 15. / 8]], self.evaluate(pdf))
       self.assertEqual((2, 2), pdf.get_shape())
 
   def testPdfXStretchedInBroadcastWhenLowerRank(self):
@@ -177,7 +179,7 @@ class BetaTest(test.TestCase):
       b = [[1., 2], [2., 3]]
       x = [.5, .5]
       pdf = beta_lib.Beta(a, b).prob(x)
-      self.assertAllClose([[1., 3. / 2], [3. / 2, 15. / 8]], pdf.eval())
+      self.assertAllClose([[1., 3. / 2], [3. / 2, 15. / 8]], self.evaluate(pdf))
       self.assertEqual((2, 2), pdf.get_shape())
 
   def testBetaMean(self):
@@ -189,7 +191,7 @@ class BetaTest(test.TestCase):
       if not stats:
         return
       expected_mean = stats.beta.mean(a, b)
-      self.assertAllClose(expected_mean, dist.mean().eval())
+      self.assertAllClose(expected_mean, self.evaluate(dist.mean()))
 
   def testBetaVariance(self):
     with session.Session():
@@ -200,7 +202,7 @@ class BetaTest(test.TestCase):
       if not stats:
         return
       expected_variance = stats.beta.var(a, b)
-      self.assertAllClose(expected_variance, dist.variance().eval())
+      self.assertAllClose(expected_variance, self.evaluate(dist.variance()))
 
   def testBetaMode(self):
     with session.Session():
@@ -209,7 +211,7 @@ class BetaTest(test.TestCase):
       expected_mode = (a - 1) / (a + b - 2)
       dist = beta_lib.Beta(a, b)
       self.assertEqual(dist.mode().get_shape(), (3,))
-      self.assertAllClose(expected_mode, dist.mode().eval())
+      self.assertAllClose(expected_mode, self.evaluate(dist.mode()))
 
   def testBetaModeInvalid(self):
     with session.Session():
@@ -217,13 +219,13 @@ class BetaTest(test.TestCase):
       b = np.array([2., 4, 1.2])
       dist = beta_lib.Beta(a, b, allow_nan_stats=False)
       with self.assertRaisesOpError("Condition x < y.*"):
-        dist.mode().eval()
+        self.evaluate(dist.mode())
 
       a = np.array([2., 2, 3])
       b = np.array([1., 4, 1.2])
       dist = beta_lib.Beta(a, b, allow_nan_stats=False)
       with self.assertRaisesOpError("Condition x < y.*"):
-        dist.mode().eval()
+        self.evaluate(dist.mode())
 
   def testBetaModeEnableAllowNanStats(self):
     with session.Session():
@@ -234,7 +236,7 @@ class BetaTest(test.TestCase):
       expected_mode = (a - 1) / (a + b - 2)
       expected_mode[0] = np.nan
       self.assertEqual((3,), dist.mode().get_shape())
-      self.assertAllClose(expected_mode, dist.mode().eval())
+      self.assertAllClose(expected_mode, self.evaluate(dist.mode()))
 
       a = np.array([2., 2, 3])
       b = np.array([1., 4, 1.2])
@@ -243,7 +245,7 @@ class BetaTest(test.TestCase):
       expected_mode = (a - 1) / (a + b - 2)
       expected_mode[0] = np.nan
       self.assertEqual((3,), dist.mode().get_shape())
-      self.assertAllClose(expected_mode, dist.mode().eval())
+      self.assertAllClose(expected_mode, self.evaluate(dist.mode()))
 
   def testBetaEntropy(self):
     with session.Session():
@@ -254,7 +256,7 @@ class BetaTest(test.TestCase):
       if not stats:
         return
       expected_entropy = stats.beta.entropy(a, b)
-      self.assertAllClose(expected_entropy, dist.entropy().eval())
+      self.assertAllClose(expected_entropy, self.evaluate(dist.entropy()))
 
   def testBetaSample(self):
     with self.test_session():
@@ -263,7 +265,7 @@ class BetaTest(test.TestCase):
       beta = beta_lib.Beta(a, b)
       n = constant_op.constant(100000)
       samples = beta.sample(n)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertEqual(sample_values.shape, (100000,))
       self.assertFalse(np.any(sample_values < 0.0))
       if not stats:
@@ -291,13 +293,13 @@ class BetaTest(test.TestCase):
       beta1 = beta_lib.Beta(concentration1=a_val,
                             concentration0=b_val,
                             name="beta1")
-      samples1 = beta1.sample(n_val, seed=123456).eval()
+      samples1 = self.evaluate(beta1.sample(n_val, seed=123456))
 
       random_seed.set_random_seed(654321)
       beta2 = beta_lib.Beta(concentration1=a_val,
                             concentration0=b_val,
                             name="beta2")
-      samples2 = beta2.sample(n_val, seed=123456).eval()
+      samples2 = self.evaluate(beta2.sample(n_val, seed=123456))
 
       self.assertAllClose(samples1, samples2)
 
@@ -308,7 +310,7 @@ class BetaTest(test.TestCase):
       beta = beta_lib.Beta(a, b)
       n = constant_op.constant(100000)
       samples = beta.sample(n)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertEqual(sample_values.shape, (100000, 3, 2, 2))
       self.assertFalse(np.any(sample_values < 0.0))
       if not stats:
@@ -325,7 +327,7 @@ class BetaTest(test.TestCase):
         a = 10. * np.random.random(shape).astype(dt)
         b = 10. * np.random.random(shape).astype(dt)
         x = np.random.random(shape).astype(dt)
-        actual = beta_lib.Beta(a, b).cdf(x).eval()
+        actual = self.evaluate(beta_lib.Beta(a, b).cdf(x))
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
         if not stats:
@@ -339,7 +341,7 @@ class BetaTest(test.TestCase):
         a = 10. * np.random.random(shape).astype(dt)
         b = 10. * np.random.random(shape).astype(dt)
         x = np.random.random(shape).astype(dt)
-        actual = math_ops.exp(beta_lib.Beta(a, b).log_cdf(x)).eval()
+        actual = self.evaluate(math_ops.exp(beta_lib.Beta(a, b).log_cdf(x)))
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
         if not stats:
@@ -350,46 +352,47 @@ class BetaTest(test.TestCase):
     with self.test_session():
       a, b = -4.2, -9.1
       dist = beta_lib.BetaWithSoftplusConcentration(a, b)
-      self.assertAllClose(nn_ops.softplus(a).eval(), dist.concentration1.eval())
-      self.assertAllClose(nn_ops.softplus(b).eval(), dist.concentration0.eval())
+      self.assertAllClose(
+          self.evaluate(nn_ops.softplus(a)), self.evaluate(dist.concentration1))
+      self.assertAllClose(
+          self.evaluate(nn_ops.softplus(b)), self.evaluate(dist.concentration0))
 
   def testBetaBetaKL(self):
-    with self.test_session() as sess:
-      for shape in [(10,), (4, 5)]:
-        a1 = 6.0 * np.random.random(size=shape) + 1e-4
-        b1 = 6.0 * np.random.random(size=shape) + 1e-4
-        a2 = 6.0 * np.random.random(size=shape) + 1e-4
-        b2 = 6.0 * np.random.random(size=shape) + 1e-4
-        # Take inverse softplus of values to test BetaWithSoftplusConcentration
-        a1_sp = np.log(np.exp(a1) - 1.0)
-        b1_sp = np.log(np.exp(b1) - 1.0)
-        a2_sp = np.log(np.exp(a2) - 1.0)
-        b2_sp = np.log(np.exp(b2) - 1.0)
-
-        d1 = beta_lib.Beta(concentration1=a1, concentration0=b1)
-        d2 = beta_lib.Beta(concentration1=a2, concentration0=b2)
-        d1_sp = beta_lib.BetaWithSoftplusConcentration(concentration1=a1_sp,
-                                                       concentration0=b1_sp)
-        d2_sp = beta_lib.BetaWithSoftplusConcentration(concentration1=a2_sp,
-                                                       concentration0=b2_sp)
-
-        if not special:
-          return
-        kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1) +
-                       (a1 - a2) * special.digamma(a1) +
-                       (b1 - b2) * special.digamma(b1) +
-                       (a2 - a1 + b2 - b1) * special.digamma(a1 + b1))
-
-        for dist1 in [d1, d1_sp]:
-          for dist2 in [d2, d2_sp]:
-            kl = kullback_leibler.kl_divergence(dist1, dist2)
-            kl_val = sess.run(kl)
-            self.assertEqual(kl.get_shape(), shape)
-            self.assertAllClose(kl_val, kl_expected)
-
-        # Make sure KL(d1||d1) is 0
-        kl_same = sess.run(kullback_leibler.kl_divergence(d1, d1))
-        self.assertAllClose(kl_same, np.zeros_like(kl_expected))
+    for shape in [(10,), (4, 5)]:
+      a1 = 6.0 * np.random.random(size=shape) + 1e-4
+      b1 = 6.0 * np.random.random(size=shape) + 1e-4
+      a2 = 6.0 * np.random.random(size=shape) + 1e-4
+      b2 = 6.0 * np.random.random(size=shape) + 1e-4
+      # Take inverse softplus of values to test BetaWithSoftplusConcentration
+      a1_sp = np.log(np.exp(a1) - 1.0)
+      b1_sp = np.log(np.exp(b1) - 1.0)
+      a2_sp = np.log(np.exp(a2) - 1.0)
+      b2_sp = np.log(np.exp(b2) - 1.0)
+
+      d1 = beta_lib.Beta(concentration1=a1, concentration0=b1)
+      d2 = beta_lib.Beta(concentration1=a2, concentration0=b2)
+      d1_sp = beta_lib.BetaWithSoftplusConcentration(concentration1=a1_sp,
+                                                     concentration0=b1_sp)
+      d2_sp = beta_lib.BetaWithSoftplusConcentration(concentration1=a2_sp,
+                                                     concentration0=b2_sp)
+
+      if not special:
+        return
+      kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1) +
+                     (a1 - a2) * special.digamma(a1) +
+                     (b1 - b2) * special.digamma(b1) +
+                     (a2 - a1 + b2 - b1) * special.digamma(a1 + b1))
+
+      for dist1 in [d1, d1_sp]:
+        for dist2 in [d2, d2_sp]:
+          kl = kullback_leibler.kl_divergence(dist1, dist2)
+          kl_val = self.evaluate(kl)
+          self.assertEqual(kl.get_shape(), shape)
+          self.assertAllClose(kl_val, kl_expected)
+
+      # Make sure KL(d1||d1) is 0
+      kl_same = self.evaluate(kullback_leibler.kl_divergence(d1, d1))
+      self.assertAllClose(kl_same, np.zeros_like(kl_expected))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/distributions/bijector_test.py b/tensorflow/python/kernel_tests/distributions/bijector_test.py
index 33db014279de2625380ec367b3fc5a96b5f9c4d6..a7fe336e6a8e88f1130bce3fec9f6d253246c37e 100644
--- a/tensorflow/python/kernel_tests/distributions/bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bijector_test.py
@@ -24,12 +24,14 @@ import numpy as np
 import six
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import bijector
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BaseBijectorTest(test.TestCase):
   """Tests properties of the Bijector base-class."""
 
@@ -47,42 +49,38 @@ class BaseBijectorTest(test.TestCase):
       def __init__(self):
         super(_BareBonesBijector, self).__init__(forward_min_event_ndims=0)
 
-    with self.test_session() as sess:
-      bij = _BareBonesBijector()
-      self.assertEqual([], bij.graph_parents)
-      self.assertEqual(False, bij.is_constant_jacobian)
-      self.assertEqual(False, bij.validate_args)
-      self.assertEqual(None, bij.dtype)
-      self.assertEqual("bare_bones_bijector", bij.name)
-
-      for shape in [[], [1, 2], [1, 2, 3]]:
-        [
-            forward_event_shape_,
-            inverse_event_shape_,
-        ] = sess.run([
-            bij.inverse_event_shape_tensor(shape),
-            bij.forward_event_shape_tensor(shape),
-        ])
-        self.assertAllEqual(shape, forward_event_shape_)
-        self.assertAllEqual(shape, bij.forward_event_shape(shape))
-        self.assertAllEqual(shape, inverse_event_shape_)
-        self.assertAllEqual(shape, bij.inverse_event_shape(shape))
-
-      with self.assertRaisesRegexp(
-          NotImplementedError, "inverse not implemented"):
-        bij.inverse(0)
-
-      with self.assertRaisesRegexp(
-          NotImplementedError, "forward not implemented"):
-        bij.forward(0)
-
-      with self.assertRaisesRegexp(
-          NotImplementedError, "inverse_log_det_jacobian not implemented"):
-        bij.inverse_log_det_jacobian(0, event_ndims=0)
-
-      with self.assertRaisesRegexp(
-          NotImplementedError, "forward_log_det_jacobian not implemented"):
-        bij.forward_log_det_jacobian(0, event_ndims=0)
+    bij = _BareBonesBijector()
+    self.assertEqual([], bij.graph_parents)
+    self.assertEqual(False, bij.is_constant_jacobian)
+    self.assertEqual(False, bij.validate_args)
+    self.assertEqual(None, bij.dtype)
+    self.assertEqual("bare_bones_bijector", bij.name)
+
+    for shape in [[], [1, 2], [1, 2, 3]]:
+      forward_event_shape_ = self.evaluate(
+          bij.inverse_event_shape_tensor(shape))
+      inverse_event_shape_ = self.evaluate(
+          bij.forward_event_shape_tensor(shape))
+      self.assertAllEqual(shape, forward_event_shape_)
+      self.assertAllEqual(shape, bij.forward_event_shape(shape))
+      self.assertAllEqual(shape, inverse_event_shape_)
+      self.assertAllEqual(shape, bij.inverse_event_shape(shape))
+
+    with self.assertRaisesRegexp(
+        NotImplementedError, "inverse not implemented"):
+      bij.inverse(0)
+
+    with self.assertRaisesRegexp(
+        NotImplementedError, "forward not implemented"):
+      bij.forward(0)
+
+    with self.assertRaisesRegexp(
+        NotImplementedError, "inverse_log_det_jacobian not implemented"):
+      bij.inverse_log_det_jacobian(0, event_ndims=0)
+
+    with self.assertRaisesRegexp(
+        NotImplementedError, "forward_log_det_jacobian not implemented"):
+      bij.forward_log_det_jacobian(0, event_ndims=0)
 
 
 class IntentionallyMissingError(Exception):
diff --git a/tensorflow/python/kernel_tests/distributions/dirichlet_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
index a2f1de5aaf3a75c1cfac820cc4494af34d082250..3bcfae0deb5e14b1be0f491bf8c5c7fcf9bbcf1d 100644
--- a/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import dirichlet as dirichlet_lib
@@ -41,14 +42,15 @@ def try_import(name):  # pylint: disable=invalid-name
 stats = try_import("scipy.stats")
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class DirichletTest(test.TestCase):
 
   def testSimpleShapes(self):
     with self.test_session():
       alpha = np.random.rand(3)
       dist = dirichlet_lib.Dirichlet(alpha)
-      self.assertEqual(3, dist.event_shape_tensor().eval())
-      self.assertAllEqual([], dist.batch_shape_tensor().eval())
+      self.assertEqual(3, self.evaluate(dist.event_shape_tensor()))
+      self.assertAllEqual([], self.evaluate(dist.batch_shape_tensor()))
       self.assertEqual(tensor_shape.TensorShape([3]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([]), dist.batch_shape)
 
@@ -56,8 +58,8 @@ class DirichletTest(test.TestCase):
     with self.test_session():
       alpha = np.random.rand(3, 2, 2)
       dist = dirichlet_lib.Dirichlet(alpha)
-      self.assertEqual(2, dist.event_shape_tensor().eval())
-      self.assertAllEqual([3, 2], dist.batch_shape_tensor().eval())
+      self.assertEqual(2, self.evaluate(dist.event_shape_tensor()))
+      self.assertAllEqual([3, 2], self.evaluate(dist.batch_shape_tensor()))
       self.assertEqual(tensor_shape.TensorShape([2]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([3, 2]), dist.batch_shape)
 
@@ -66,22 +68,22 @@ class DirichletTest(test.TestCase):
     with self.test_session():
       dist = dirichlet_lib.Dirichlet(alpha)
       self.assertEqual([1, 3], dist.concentration.get_shape())
-      self.assertAllClose(alpha, dist.concentration.eval())
+      self.assertAllClose(alpha, self.evaluate(dist.concentration))
 
   def testPdfXProper(self):
     alpha = [[1., 2, 3]]
     with self.test_session():
       dist = dirichlet_lib.Dirichlet(alpha, validate_args=True)
-      dist.prob([.1, .3, .6]).eval()
-      dist.prob([.2, .3, .5]).eval()
+      self.evaluate(dist.prob([.1, .3, .6]))
+      self.evaluate(dist.prob([.2, .3, .5]))
       # Either condition can trigger.
       with self.assertRaisesOpError("samples must be positive"):
-        dist.prob([-1., 1.5, 0.5]).eval()
+        self.evaluate(dist.prob([-1., 1.5, 0.5]))
       with self.assertRaisesOpError("samples must be positive"):
-        dist.prob([0., .1, .9]).eval()
+        self.evaluate(dist.prob([0., .1, .9]))
       with self.assertRaisesOpError(
           "sample last-dimension must sum to `1`"):
-        dist.prob([.1, .2, .8]).eval()
+        self.evaluate(dist.prob([.1, .2, .8]))
 
   def testPdfZeroBatches(self):
     with self.test_session():
@@ -89,7 +91,7 @@ class DirichletTest(test.TestCase):
       x = [.5, .5]
       dist = dirichlet_lib.Dirichlet(alpha)
       pdf = dist.prob(x)
-      self.assertAllClose(1., pdf.eval())
+      self.assertAllClose(1., self.evaluate(pdf))
       self.assertEqual((), pdf.get_shape())
 
   def testPdfZeroBatchesNontrivialX(self):
@@ -98,7 +100,7 @@ class DirichletTest(test.TestCase):
       x = [.3, .7]
       dist = dirichlet_lib.Dirichlet(alpha)
       pdf = dist.prob(x)
-      self.assertAllClose(7. / 5, pdf.eval())
+      self.assertAllClose(7. / 5, self.evaluate(pdf))
       self.assertEqual((), pdf.get_shape())
 
   def testPdfUniformZeroBatches(self):
@@ -108,7 +110,7 @@ class DirichletTest(test.TestCase):
       x = [[.2, .5, .3], [.3, .4, .3]]
       dist = dirichlet_lib.Dirichlet(alpha)
       pdf = dist.prob(x)
-      self.assertAllClose([2., 2.], pdf.eval())
+      self.assertAllClose([2., 2.], self.evaluate(pdf))
       self.assertEqual((2), pdf.get_shape())
 
   def testPdfAlphaStretchedInBroadcastWhenSameRank(self):
@@ -117,7 +119,7 @@ class DirichletTest(test.TestCase):
       x = [[.5, .5], [.3, .7]]
       dist = dirichlet_lib.Dirichlet(alpha)
       pdf = dist.prob(x)
-      self.assertAllClose([1., 7. / 5], pdf.eval())
+      self.assertAllClose([1., 7. / 5], self.evaluate(pdf))
       self.assertEqual((2), pdf.get_shape())
 
   def testPdfAlphaStretchedInBroadcastWhenLowerRank(self):
@@ -125,7 +127,7 @@ class DirichletTest(test.TestCase):
       alpha = [1., 2]
       x = [[.5, .5], [.2, .8]]
       pdf = dirichlet_lib.Dirichlet(alpha).prob(x)
-      self.assertAllClose([1., 8. / 5], pdf.eval())
+      self.assertAllClose([1., 8. / 5], self.evaluate(pdf))
       self.assertEqual((2), pdf.get_shape())
 
   def testPdfXStretchedInBroadcastWhenSameRank(self):
@@ -133,7 +135,7 @@ class DirichletTest(test.TestCase):
       alpha = [[1., 2], [2., 3]]
       x = [[.5, .5]]
       pdf = dirichlet_lib.Dirichlet(alpha).prob(x)
-      self.assertAllClose([1., 3. / 2], pdf.eval())
+      self.assertAllClose([1., 3. / 2], self.evaluate(pdf))
       self.assertEqual((2), pdf.get_shape())
 
   def testPdfXStretchedInBroadcastWhenLowerRank(self):
@@ -141,7 +143,7 @@ class DirichletTest(test.TestCase):
       alpha = [[1., 2], [2., 3]]
       x = [.5, .5]
       pdf = dirichlet_lib.Dirichlet(alpha).prob(x)
-      self.assertAllClose([1., 3. / 2], pdf.eval())
+      self.assertAllClose([1., 3. / 2], self.evaluate(pdf))
       self.assertEqual((2), pdf.get_shape())
 
   def testMean(self):
@@ -152,43 +154,44 @@ class DirichletTest(test.TestCase):
       if not stats:
         return
       expected_mean = stats.dirichlet.mean(alpha)
-      self.assertAllClose(dirichlet.mean().eval(), expected_mean)
+      self.assertAllClose(self.evaluate(dirichlet.mean()), expected_mean)
 
   def testCovarianceFromSampling(self):
     alpha = np.array([[1., 2, 3],
                       [2.5, 4, 0.01]], dtype=np.float32)
-    with self.test_session() as sess:
-      dist = dirichlet_lib.Dirichlet(alpha)  # batch_shape=[2], event_shape=[3]
-      x = dist.sample(int(250e3), seed=1)
-      sample_mean = math_ops.reduce_mean(x, 0)
-      x_centered = x - sample_mean[None, ...]
-      sample_cov = math_ops.reduce_mean(math_ops.matmul(
-          x_centered[..., None], x_centered[..., None, :]), 0)
-      sample_var = array_ops.matrix_diag_part(sample_cov)
-      sample_stddev = math_ops.sqrt(sample_var)
-      [
-          sample_mean_,
-          sample_cov_,
-          sample_var_,
-          sample_stddev_,
-          analytic_mean,
-          analytic_cov,
-          analytic_var,
-          analytic_stddev,
-      ] = sess.run([
-          sample_mean,
-          sample_cov,
-          sample_var,
-          sample_stddev,
-          dist.mean(),
-          dist.covariance(),
-          dist.variance(),
-          dist.stddev(),
-      ])
-      self.assertAllClose(sample_mean_, analytic_mean, atol=0., rtol=0.04)
-      self.assertAllClose(sample_cov_, analytic_cov, atol=0., rtol=0.06)
-      self.assertAllClose(sample_var_, analytic_var, atol=0., rtol=0.03)
-      self.assertAllClose(sample_stddev_, analytic_stddev, atol=0., rtol=0.02)
+    dist = dirichlet_lib.Dirichlet(alpha)  # batch_shape=[2], event_shape=[3]
+    x = dist.sample(int(250e3), seed=1)
+    sample_mean = math_ops.reduce_mean(x, 0)
+    x_centered = x - sample_mean[None, ...]
+    sample_cov = math_ops.reduce_mean(math_ops.matmul(
+        x_centered[..., None], x_centered[..., None, :]), 0)
+    sample_var = array_ops.matrix_diag_part(sample_cov)
+    sample_stddev = math_ops.sqrt(sample_var)
+
+    [
+        sample_mean_,
+        sample_cov_,
+        sample_var_,
+        sample_stddev_,
+        analytic_mean,
+        analytic_cov,
+        analytic_var,
+        analytic_stddev,
+    ] = self.evaluate([
+        sample_mean,
+        sample_cov,
+        sample_var,
+        sample_stddev,
+        dist.mean(),
+        dist.covariance(),
+        dist.variance(),
+        dist.stddev(),
+    ])
+
+    self.assertAllClose(sample_mean_, analytic_mean, atol=0., rtol=0.04)
+    self.assertAllClose(sample_cov_, analytic_cov, atol=0., rtol=0.06)
+    self.assertAllClose(sample_var_, analytic_var, atol=0., rtol=0.03)
+    self.assertAllClose(sample_stddev_, analytic_stddev, atol=0., rtol=0.02)
 
   def testVariance(self):
     with self.test_session():
@@ -201,7 +204,8 @@ class DirichletTest(test.TestCase):
       expected_covariance = np.diag(stats.dirichlet.var(alpha))
       expected_covariance += [[0., -2, -3], [-2, 0, -6],
                               [-3, -6, 0]] / denominator
-      self.assertAllClose(dirichlet.covariance().eval(), expected_covariance)
+      self.assertAllClose(
+          self.evaluate(dirichlet.covariance()), expected_covariance)
 
   def testMode(self):
     with self.test_session():
@@ -209,7 +213,7 @@ class DirichletTest(test.TestCase):
       expected_mode = (alpha - 1) / (np.sum(alpha) - 3)
       dirichlet = dirichlet_lib.Dirichlet(concentration=alpha)
       self.assertEqual(dirichlet.mode().get_shape(), [3])
-      self.assertAllClose(dirichlet.mode().eval(), expected_mode)
+      self.assertAllClose(self.evaluate(dirichlet.mode()), expected_mode)
 
   def testModeInvalid(self):
     with self.test_session():
@@ -217,7 +221,7 @@ class DirichletTest(test.TestCase):
       dirichlet = dirichlet_lib.Dirichlet(concentration=alpha,
                                           allow_nan_stats=False)
       with self.assertRaisesOpError("Condition x < y.*"):
-        dirichlet.mode().eval()
+        self.evaluate(dirichlet.mode())
 
   def testModeEnableAllowNanStats(self):
     with self.test_session():
@@ -227,7 +231,7 @@ class DirichletTest(test.TestCase):
       expected_mode = np.zeros_like(alpha) + np.nan
 
       self.assertEqual(dirichlet.mode().get_shape(), [3])
-      self.assertAllClose(dirichlet.mode().eval(), expected_mode)
+      self.assertAllClose(self.evaluate(dirichlet.mode()), expected_mode)
 
   def testEntropy(self):
     with self.test_session():
@@ -237,7 +241,7 @@ class DirichletTest(test.TestCase):
       if not stats:
         return
       expected_entropy = stats.dirichlet.entropy(alpha)
-      self.assertAllClose(dirichlet.entropy().eval(), expected_entropy)
+      self.assertAllClose(self.evaluate(dirichlet.entropy()), expected_entropy)
 
   def testSample(self):
     with self.test_session():
@@ -245,7 +249,7 @@ class DirichletTest(test.TestCase):
       dirichlet = dirichlet_lib.Dirichlet(alpha)
       n = constant_op.constant(100000)
       samples = dirichlet.sample(n)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertEqual(sample_values.shape, (100000, 2))
       self.assertTrue(np.all(sample_values > 0.0))
       if not stats:
diff --git a/tensorflow/python/kernel_tests/distributions/exponential_test.py b/tensorflow/python/kernel_tests/distributions/exponential_test.py
index 7afdf0f947605c6b982e8bf7defdd6224180e089..ebcd41b0e24ae8093752c84cf5077029f2ac9330 100644
--- a/tensorflow/python/kernel_tests/distributions/exponential_test.py
+++ b/tensorflow/python/kernel_tests/distributions/exponential_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops.distributions import exponential as exponential_lib
 from tensorflow.python.platform import test
@@ -42,6 +43,7 @@ def try_import(name):  # pylint: disable=invalid-name
 stats = try_import("scipy.stats")
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class ExponentialTest(test.TestCase):
 
   def testExponentialLogPDF(self):
@@ -61,8 +63,8 @@ class ExponentialTest(test.TestCase):
       if not stats:
         return
       expected_log_pdf = stats.expon.logpdf(x, scale=1 / lam_v)
-      self.assertAllClose(log_pdf.eval(), expected_log_pdf)
-      self.assertAllClose(pdf.eval(), np.exp(expected_log_pdf))
+      self.assertAllClose(self.evaluate(log_pdf), expected_log_pdf)
+      self.assertAllClose(self.evaluate(pdf), np.exp(expected_log_pdf))
 
   def testExponentialCDF(self):
     with session.Session():
@@ -79,7 +81,7 @@ class ExponentialTest(test.TestCase):
       if not stats:
         return
       expected_cdf = stats.expon.cdf(x, scale=1 / lam_v)
-      self.assertAllClose(cdf.eval(), expected_cdf)
+      self.assertAllClose(self.evaluate(cdf), expected_cdf)
 
   def testExponentialMean(self):
     with session.Session():
@@ -89,7 +91,7 @@ class ExponentialTest(test.TestCase):
       if not stats:
         return
       expected_mean = stats.expon.mean(scale=1 / lam_v)
-      self.assertAllClose(exponential.mean().eval(), expected_mean)
+      self.assertAllClose(self.evaluate(exponential.mean()), expected_mean)
 
   def testExponentialVariance(self):
     with session.Session():
@@ -99,7 +101,8 @@ class ExponentialTest(test.TestCase):
       if not stats:
         return
       expected_variance = stats.expon.var(scale=1 / lam_v)
-      self.assertAllClose(exponential.variance().eval(), expected_variance)
+      self.assertAllClose(
+          self.evaluate(exponential.variance()), expected_variance)
 
   def testExponentialEntropy(self):
     with session.Session():
@@ -109,7 +112,8 @@ class ExponentialTest(test.TestCase):
       if not stats:
         return
       expected_entropy = stats.expon.entropy(scale=1 / lam_v)
-      self.assertAllClose(exponential.entropy().eval(), expected_entropy)
+      self.assertAllClose(
+          self.evaluate(exponential.entropy()), expected_entropy)
 
   def testExponentialSample(self):
     with self.test_session():
@@ -119,7 +123,7 @@ class ExponentialTest(test.TestCase):
       exponential = exponential_lib.Exponential(rate=lam)
 
       samples = exponential.sample(n, seed=137)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertEqual(sample_values.shape, (100000, 2))
       self.assertFalse(np.any(sample_values < 0.0))
       if not stats:
@@ -142,7 +146,7 @@ class ExponentialTest(test.TestCase):
       samples = exponential.sample(n, seed=138)
       self.assertEqual(samples.get_shape(), (n, batch_size, 2))
 
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
 
       self.assertFalse(np.any(sample_values < 0.0))
       if not stats:
@@ -163,8 +167,8 @@ class ExponentialTest(test.TestCase):
     with self.test_session():
       lam = [-2.2, -3.4]
       exponential = exponential_lib.ExponentialWithSoftplusRate(rate=lam)
-      self.assertAllClose(nn_ops.softplus(lam).eval(),
-                          exponential.rate.eval())
+      self.assertAllClose(
+          self.evaluate(nn_ops.softplus(lam)), self.evaluate(exponential.rate))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/distributions/laplace_test.py b/tensorflow/python/kernel_tests/distributions/laplace_test.py
index 55577386c450c7ac63f62c8a6dfd277af50e2387..918c7f63f2065525338632ba68cb180c7c50dea6 100644
--- a/tensorflow/python/kernel_tests/distributions/laplace_test.py
+++ b/tensorflow/python/kernel_tests/distributions/laplace_test.py
@@ -24,6 +24,7 @@ import numpy as np
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops.distributions import laplace as laplace_lib
 from tensorflow.python.platform import test
@@ -43,6 +44,7 @@ def try_import(name):  # pylint: disable=invalid-name
 stats = try_import("scipy.stats")
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class LaplaceTest(test.TestCase):
 
   def testLaplaceShape(self):
@@ -51,9 +53,9 @@ class LaplaceTest(test.TestCase):
       scale = constant_op.constant(11.0)
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
 
-      self.assertEqual(laplace.batch_shape_tensor().eval(), (5,))
+      self.assertEqual(self.evaluate(laplace.batch_shape_tensor()), (5,))
       self.assertEqual(laplace.batch_shape, tensor_shape.TensorShape([5]))
-      self.assertAllEqual(laplace.event_shape_tensor().eval(), [])
+      self.assertAllEqual(self.evaluate(laplace.event_shape_tensor()), [])
       self.assertEqual(laplace.event_shape, tensor_shape.TensorShape([]))
 
   def testLaplaceLogPDF(self):
@@ -70,11 +72,11 @@ class LaplaceTest(test.TestCase):
       if not stats:
         return
       expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
-      self.assertAllClose(log_pdf.eval(), expected_log_pdf)
+      self.assertAllClose(self.evaluate(log_pdf), expected_log_pdf)
 
       pdf = laplace.prob(x)
       self.assertEqual(pdf.get_shape(), (6,))
-      self.assertAllClose(pdf.eval(), np.exp(expected_log_pdf))
+      self.assertAllClose(self.evaluate(pdf), np.exp(expected_log_pdf))
 
   def testLaplaceLogPDFMultidimensional(self):
     with self.test_session():
@@ -86,11 +88,11 @@ class LaplaceTest(test.TestCase):
       x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
       log_pdf = laplace.log_prob(x)
-      log_pdf_values = log_pdf.eval()
+      log_pdf_values = self.evaluate(log_pdf)
       self.assertEqual(log_pdf.get_shape(), (6, 2))
 
       pdf = laplace.prob(x)
-      pdf_values = pdf.eval()
+      pdf_values = self.evaluate(pdf)
       self.assertEqual(pdf.get_shape(), (6, 2))
       if not stats:
         return
@@ -108,11 +110,11 @@ class LaplaceTest(test.TestCase):
       x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
       log_pdf = laplace.log_prob(x)
-      log_pdf_values = log_pdf.eval()
+      log_pdf_values = self.evaluate(log_pdf)
       self.assertEqual(log_pdf.get_shape(), (6, 2))
 
       pdf = laplace.prob(x)
-      pdf_values = pdf.eval()
+      pdf_values = self.evaluate(pdf)
       self.assertEqual(pdf.get_shape(), (6, 2))
       if not stats:
         return
@@ -136,7 +138,7 @@ class LaplaceTest(test.TestCase):
       if not stats:
         return
       expected_cdf = stats.laplace.cdf(x, loc_v, scale=scale_v)
-      self.assertAllClose(cdf.eval(), expected_cdf)
+      self.assertAllClose(self.evaluate(cdf), expected_cdf)
 
   def testLaplaceLogCDF(self):
     with self.test_session():
@@ -154,7 +156,7 @@ class LaplaceTest(test.TestCase):
       if not stats:
         return
       expected_cdf = stats.laplace.logcdf(x, loc_v, scale=scale_v)
-      self.assertAllClose(cdf.eval(), expected_cdf)
+      self.assertAllClose(self.evaluate(cdf), expected_cdf)
 
   def testLaplaceLogSurvivalFunction(self):
     with self.test_session():
@@ -172,7 +174,7 @@ class LaplaceTest(test.TestCase):
       if not stats:
         return
       expected_sf = stats.laplace.logsf(x, loc_v, scale=scale_v)
-      self.assertAllClose(sf.eval(), expected_sf)
+      self.assertAllClose(self.evaluate(sf), expected_sf)
 
   def testLaplaceMean(self):
     with self.test_session():
@@ -183,7 +185,7 @@ class LaplaceTest(test.TestCase):
       if not stats:
         return
       expected_means = stats.laplace.mean(loc_v, scale=scale_v)
-      self.assertAllClose(laplace.mean().eval(), expected_means)
+      self.assertAllClose(self.evaluate(laplace.mean()), expected_means)
 
   def testLaplaceMode(self):
     with self.test_session():
@@ -191,7 +193,7 @@ class LaplaceTest(test.TestCase):
       scale_v = np.array([1.0, 4.0, 5.0])
       laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v)
       self.assertEqual(laplace.mode().get_shape(), (3,))
-      self.assertAllClose(laplace.mode().eval(), loc_v)
+      self.assertAllClose(self.evaluate(laplace.mode()), loc_v)
 
   def testLaplaceVariance(self):
     with self.test_session():
@@ -202,7 +204,7 @@ class LaplaceTest(test.TestCase):
       if not stats:
         return
       expected_variances = stats.laplace.var(loc_v, scale=scale_v)
-      self.assertAllClose(laplace.variance().eval(), expected_variances)
+      self.assertAllClose(self.evaluate(laplace.variance()), expected_variances)
 
   def testLaplaceStd(self):
     with self.test_session():
@@ -213,7 +215,7 @@ class LaplaceTest(test.TestCase):
       if not stats:
         return
       expected_stddev = stats.laplace.std(loc_v, scale=scale_v)
-      self.assertAllClose(laplace.stddev().eval(), expected_stddev)
+      self.assertAllClose(self.evaluate(laplace.stddev()), expected_stddev)
 
   def testLaplaceEntropy(self):
     with self.test_session():
@@ -224,7 +226,7 @@ class LaplaceTest(test.TestCase):
       if not stats:
         return
       expected_entropy = stats.laplace.entropy(loc_v, scale=scale_v)
-      self.assertAllClose(laplace.entropy().eval(), expected_entropy)
+      self.assertAllClose(self.evaluate(laplace.entropy()), expected_entropy)
 
   def testLaplaceSample(self):
     with session.Session():
@@ -235,7 +237,7 @@ class LaplaceTest(test.TestCase):
       n = 100000
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
       samples = laplace.sample(n, seed=137)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertEqual(samples.get_shape(), (n,))
       self.assertEqual(sample_values.shape, (n,))
       if not stats:
@@ -260,7 +262,7 @@ class LaplaceTest(test.TestCase):
       laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v)
       n = 10000
       samples = laplace.sample(n, seed=137)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertEqual(samples.get_shape(), (n, 10, 100))
       self.assertEqual(sample_values.shape, (n, 10, 100))
       zeros = np.zeros_like(loc_v + scale_v)  # 10 x 100
@@ -297,32 +299,31 @@ class LaplaceTest(test.TestCase):
     return ks < 0.02
 
   def testLaplacePdfOfSampleMultiDims(self):
-    with session.Session() as sess:
-      laplace = laplace_lib.Laplace(loc=[7., 11.], scale=[[5.], [6.]])
-      num = 50000
-      samples = laplace.sample(num, seed=137)
-      pdfs = laplace.prob(samples)
-      sample_vals, pdf_vals = sess.run([samples, pdfs])
-      self.assertEqual(samples.get_shape(), (num, 2, 2))
-      self.assertEqual(pdfs.get_shape(), (num, 2, 2))
-      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
-      if not stats:
-        return
-      self.assertAllClose(
-          stats.laplace.mean(
-              [[7., 11.], [7., 11.]], scale=np.array([[5., 5.], [6., 6.]])),
-          sample_vals.mean(axis=0),
-          rtol=0.05,
-          atol=0.)
-      self.assertAllClose(
-          stats.laplace.var([[7., 11.], [7., 11.]],
-                            scale=np.array([[5., 5.], [6., 6.]])),
-          sample_vals.var(axis=0),
-          rtol=0.05,
-          atol=0.)
+    laplace = laplace_lib.Laplace(loc=[7., 11.], scale=[[5.], [6.]])
+    num = 50000
+    samples = laplace.sample(num, seed=137)
+    pdfs = laplace.prob(samples)
+    sample_vals, pdf_vals = self.evaluate([samples, pdfs])
+    self.assertEqual(samples.get_shape(), (num, 2, 2))
+    self.assertEqual(pdfs.get_shape(), (num, 2, 2))
+    self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
+    self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
+    self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
+    self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
+    if not stats:
+      return
+    self.assertAllClose(
+        stats.laplace.mean(
+            [[7., 11.], [7., 11.]], scale=np.array([[5., 5.], [6., 6.]])),
+        sample_vals.mean(axis=0),
+        rtol=0.05,
+        atol=0.)
+    self.assertAllClose(
+        stats.laplace.var([[7., 11.], [7., 11.]],
+                          scale=np.array([[5., 5.], [6., 6.]])),
+        sample_vals.var(axis=0),
+        rtol=0.05,
+        atol=0.)
 
   def _assertIntegral(self, sample_vals, pdf_vals, err=1e-3):
     s_p = zip(sample_vals, pdf_vals)
@@ -338,24 +339,27 @@ class LaplaceTest(test.TestCase):
     with self.test_session():
       loc_v = constant_op.constant(0.0, name="loc")
       scale_v = constant_op.constant(-1.0, name="scale")
-      laplace = laplace_lib.Laplace(
-          loc=loc_v, scale=scale_v, validate_args=True)
-      with self.assertRaisesOpError("scale"):
-        laplace.mean().eval()
+      with self.assertRaisesOpError(
+          "Condition x > 0 did not hold element-wise"):
+        laplace = laplace_lib.Laplace(
+            loc=loc_v, scale=scale_v, validate_args=True)
+        self.evaluate(laplace.mean())
       loc_v = constant_op.constant(1.0, name="loc")
       scale_v = constant_op.constant(0.0, name="scale")
-      laplace = laplace_lib.Laplace(
-          loc=loc_v, scale=scale_v, validate_args=True)
-      with self.assertRaisesOpError("scale"):
-        laplace.mean().eval()
+      with self.assertRaisesOpError(
+          "Condition x > 0 did not hold element-wise"):
+        laplace = laplace_lib.Laplace(
+            loc=loc_v, scale=scale_v, validate_args=True)
+        self.evaluate(laplace.mean())
 
   def testLaplaceWithSoftplusScale(self):
     with self.test_session():
       loc_v = constant_op.constant([0.0, 1.0], name="loc")
       scale_v = constant_op.constant([-1.0, 2.0], name="scale")
       laplace = laplace_lib.LaplaceWithSoftplusScale(loc=loc_v, scale=scale_v)
-      self.assertAllClose(nn_ops.softplus(scale_v).eval(), laplace.scale.eval())
-      self.assertAllClose(loc_v.eval(), laplace.loc.eval())
+      self.assertAllClose(
+          self.evaluate(nn_ops.softplus(scale_v)), self.evaluate(laplace.scale))
+      self.assertAllClose(self.evaluate(loc_v), self.evaluate(laplace.loc))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/distributions/normal_test.py b/tensorflow/python/kernel_tests/distributions/normal_test.py
index 07c7d6d11d0f3bcecfd1029295d3249c3ea8584b..d793e03272909cc97543e313041b6ae7f487ae3f 100644
--- a/tensorflow/python/kernel_tests/distributions/normal_test.py
+++ b/tensorflow/python/kernel_tests/distributions/normal_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
@@ -54,7 +55,7 @@ class NormalTest(test.TestCase):
     self._rng = np.random.RandomState(123)
 
   def assertAllFinite(self, tensor):
-    is_finite = np.isfinite(tensor.eval())
+    is_finite = np.isfinite(self.evaluate(tensor))
     all_true = np.ones_like(is_finite, dtype=np.bool)
     self.assertAllEqual(all_true, is_finite)
 
@@ -62,13 +63,13 @@ class NormalTest(test.TestCase):
     with self.test_session():
       param_shapes = normal_lib.Normal.param_shapes(sample_shape)
       mu_shape, sigma_shape = param_shapes["loc"], param_shapes["scale"]
-      self.assertAllEqual(expected, mu_shape.eval())
-      self.assertAllEqual(expected, sigma_shape.eval())
+      self.assertAllEqual(expected, self.evaluate(mu_shape))
+      self.assertAllEqual(expected, self.evaluate(sigma_shape))
       mu = array_ops.zeros(mu_shape)
       sigma = array_ops.ones(sigma_shape)
       self.assertAllEqual(
           expected,
-          array_ops.shape(normal_lib.Normal(mu, sigma).sample()).eval())
+          self.evaluate(array_ops.shape(normal_lib.Normal(mu, sigma).sample())))
 
   def _testParamStaticShapes(self, sample_shape, expected):
     param_shapes = normal_lib.Normal.param_static_shapes(sample_shape)
@@ -76,25 +77,30 @@ class NormalTest(test.TestCase):
     self.assertEqual(expected, mu_shape)
     self.assertEqual(expected, sigma_shape)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testParamShapes(self):
     sample_shape = [10, 3, 4]
     self._testParamShapes(sample_shape, sample_shape)
     self._testParamShapes(constant_op.constant(sample_shape), sample_shape)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testParamStaticShapes(self):
     sample_shape = [10, 3, 4]
     self._testParamStaticShapes(sample_shape, sample_shape)
     self._testParamStaticShapes(
         tensor_shape.TensorShape(sample_shape), sample_shape)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalWithSoftplusScale(self):
     with self.test_session():
       mu = array_ops.zeros((10, 3))
       rho = array_ops.ones((10, 3)) * -2.
       normal = normal_lib.NormalWithSoftplusScale(loc=mu, scale=rho)
-      self.assertAllEqual(mu.eval(), normal.loc.eval())
-      self.assertAllEqual(nn_ops.softplus(rho).eval(), normal.scale.eval())
+      self.assertAllEqual(self.evaluate(mu), self.evaluate(normal.loc))
+      self.assertAllEqual(
+          self.evaluate(nn_ops.softplus(rho)), self.evaluate(normal.scale))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalLogPDF(self):
     with self.test_session():
       batch_size = 6
@@ -104,25 +110,31 @@ class NormalTest(test.TestCase):
       normal = normal_lib.Normal(loc=mu, scale=sigma)
 
       log_pdf = normal.log_prob(x)
-      self.assertAllEqual(normal.batch_shape_tensor().eval(),
-                          log_pdf.get_shape())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(),
-                          log_pdf.eval().shape)
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), log_pdf.get_shape())
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()),
+          self.evaluate(log_pdf).shape)
       self.assertAllEqual(normal.batch_shape, log_pdf.get_shape())
-      self.assertAllEqual(normal.batch_shape, log_pdf.eval().shape)
+      self.assertAllEqual(normal.batch_shape, self.evaluate(log_pdf).shape)
 
       pdf = normal.prob(x)
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf.get_shape())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf.eval().shape)
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), pdf.get_shape())
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()),
+          self.evaluate(pdf).shape)
       self.assertAllEqual(normal.batch_shape, pdf.get_shape())
-      self.assertAllEqual(normal.batch_shape, pdf.eval().shape)
+      self.assertAllEqual(normal.batch_shape, self.evaluate(pdf).shape)
 
       if not stats:
         return
-      expected_log_pdf = stats.norm(mu.eval(), sigma.eval()).logpdf(x)
-      self.assertAllClose(expected_log_pdf, log_pdf.eval())
-      self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
+      expected_log_pdf = stats.norm(self.evaluate(mu),
+                                    self.evaluate(sigma)).logpdf(x)
+      self.assertAllClose(expected_log_pdf, self.evaluate(log_pdf))
+      self.assertAllClose(np.exp(expected_log_pdf), self.evaluate(pdf))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalLogPDFMultidimensional(self):
     with self.test_session():
       batch_size = 6
@@ -133,29 +145,34 @@ class NormalTest(test.TestCase):
       normal = normal_lib.Normal(loc=mu, scale=sigma)
 
       log_pdf = normal.log_prob(x)
-      log_pdf_values = log_pdf.eval()
+      log_pdf_values = self.evaluate(log_pdf)
       self.assertEqual(log_pdf.get_shape(), (6, 2))
-      self.assertAllEqual(normal.batch_shape_tensor().eval(),
-                          log_pdf.get_shape())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(),
-                          log_pdf.eval().shape)
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), log_pdf.get_shape())
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()),
+          self.evaluate(log_pdf).shape)
       self.assertAllEqual(normal.batch_shape, log_pdf.get_shape())
-      self.assertAllEqual(normal.batch_shape, log_pdf.eval().shape)
+      self.assertAllEqual(normal.batch_shape, self.evaluate(log_pdf).shape)
 
       pdf = normal.prob(x)
-      pdf_values = pdf.eval()
+      pdf_values = self.evaluate(pdf)
       self.assertEqual(pdf.get_shape(), (6, 2))
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf.get_shape())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf_values.shape)
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), pdf.get_shape())
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), pdf_values.shape)
       self.assertAllEqual(normal.batch_shape, pdf.get_shape())
       self.assertAllEqual(normal.batch_shape, pdf_values.shape)
 
       if not stats:
         return
-      expected_log_pdf = stats.norm(mu.eval(), sigma.eval()).logpdf(x)
+      expected_log_pdf = stats.norm(self.evaluate(mu),
+                                    self.evaluate(sigma)).logpdf(x)
       self.assertAllClose(expected_log_pdf, log_pdf_values)
       self.assertAllClose(np.exp(expected_log_pdf), pdf_values)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalCDF(self):
     with self.test_session():
       batch_size = 50
@@ -165,15 +182,19 @@ class NormalTest(test.TestCase):
 
       normal = normal_lib.Normal(loc=mu, scale=sigma)
       cdf = normal.cdf(x)
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.get_shape())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.eval().shape)
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), cdf.get_shape())
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()),
+          self.evaluate(cdf).shape)
       self.assertAllEqual(normal.batch_shape, cdf.get_shape())
-      self.assertAllEqual(normal.batch_shape, cdf.eval().shape)
+      self.assertAllEqual(normal.batch_shape, self.evaluate(cdf).shape)
       if not stats:
         return
       expected_cdf = stats.norm(mu, sigma).cdf(x)
-      self.assertAllClose(expected_cdf, cdf.eval(), atol=0)
+      self.assertAllClose(expected_cdf, self.evaluate(cdf), atol=0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalSurvivalFunction(self):
     with self.test_session():
       batch_size = 50
@@ -184,15 +205,19 @@ class NormalTest(test.TestCase):
       normal = normal_lib.Normal(loc=mu, scale=sigma)
 
       sf = normal.survival_function(x)
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.get_shape())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.eval().shape)
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), sf.get_shape())
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()),
+          self.evaluate(sf).shape)
       self.assertAllEqual(normal.batch_shape, sf.get_shape())
-      self.assertAllEqual(normal.batch_shape, sf.eval().shape)
+      self.assertAllEqual(normal.batch_shape, self.evaluate(sf).shape)
       if not stats:
         return
       expected_sf = stats.norm(mu, sigma).sf(x)
-      self.assertAllClose(expected_sf, sf.eval(), atol=0)
+      self.assertAllClose(expected_sf, self.evaluate(sf), atol=0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalLogCDF(self):
     with self.test_session():
       batch_size = 50
@@ -203,15 +228,18 @@ class NormalTest(test.TestCase):
       normal = normal_lib.Normal(loc=mu, scale=sigma)
 
       cdf = normal.log_cdf(x)
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.get_shape())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.eval().shape)
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), cdf.get_shape())
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()),
+          self.evaluate(cdf).shape)
       self.assertAllEqual(normal.batch_shape, cdf.get_shape())
-      self.assertAllEqual(normal.batch_shape, cdf.eval().shape)
+      self.assertAllEqual(normal.batch_shape, self.evaluate(cdf).shape)
 
       if not stats:
         return
       expected_cdf = stats.norm(mu, sigma).logcdf(x)
-      self.assertAllClose(expected_cdf, cdf.eval(), atol=0, rtol=1e-5)
+      self.assertAllClose(expected_cdf, self.evaluate(cdf), atol=0, rtol=1e-5)
 
   def testFiniteGradientAtDifficultPoints(self):
     for dtype in [np.float32, np.float64]:
@@ -233,6 +261,7 @@ class NormalTest(test.TestCase):
             self.assertAllFinite(grads[0])
             self.assertAllFinite(grads[1])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalLogSurvivalFunction(self):
     with self.test_session():
       batch_size = 50
@@ -243,16 +272,20 @@ class NormalTest(test.TestCase):
       normal = normal_lib.Normal(loc=mu, scale=sigma)
 
       sf = normal.log_survival_function(x)
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.get_shape())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.eval().shape)
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), sf.get_shape())
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()),
+          self.evaluate(sf).shape)
       self.assertAllEqual(normal.batch_shape, sf.get_shape())
-      self.assertAllEqual(normal.batch_shape, sf.eval().shape)
+      self.assertAllEqual(normal.batch_shape, self.evaluate(sf).shape)
 
       if not stats:
         return
       expected_sf = stats.norm(mu, sigma).logsf(x)
-      self.assertAllClose(expected_sf, sf.eval(), atol=0, rtol=1e-5)
+      self.assertAllClose(expected_sf, self.evaluate(sf), atol=0, rtol=1e-5)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalEntropyWithScalarInputs(self):
     # Scipy.stats.norm cannot deal with the shapes in the other test.
     with self.test_session():
@@ -261,18 +294,20 @@ class NormalTest(test.TestCase):
       normal = normal_lib.Normal(loc=mu_v, scale=sigma_v)
 
       entropy = normal.entropy()
-      self.assertAllEqual(normal.batch_shape_tensor().eval(),
-                          entropy.get_shape())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(),
-                          entropy.eval().shape)
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), entropy.get_shape())
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()),
+          self.evaluate(entropy).shape)
       self.assertAllEqual(normal.batch_shape, entropy.get_shape())
-      self.assertAllEqual(normal.batch_shape, entropy.eval().shape)
+      self.assertAllEqual(normal.batch_shape, self.evaluate(entropy).shape)
       # scipy.stats.norm cannot deal with these shapes.
       if not stats:
         return
       expected_entropy = stats.norm(mu_v, sigma_v).entropy()
-      self.assertAllClose(expected_entropy, entropy.eval())
+      self.assertAllClose(expected_entropy, self.evaluate(entropy))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalEntropy(self):
     with self.test_session():
       mu_v = np.array([1.0, 1.0, 1.0])
@@ -284,14 +319,16 @@ class NormalTest(test.TestCase):
       expected_entropy = 0.5 * np.log(2 * np.pi * np.exp(1) * sigma_broadcast**
                                       2)
       entropy = normal.entropy()
-      np.testing.assert_allclose(expected_entropy, entropy.eval())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(),
-                          entropy.get_shape())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(),
-                          entropy.eval().shape)
+      np.testing.assert_allclose(expected_entropy, self.evaluate(entropy))
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), entropy.get_shape())
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()),
+          self.evaluate(entropy).shape)
       self.assertAllEqual(normal.batch_shape, entropy.get_shape())
-      self.assertAllEqual(normal.batch_shape, entropy.eval().shape)
+      self.assertAllEqual(normal.batch_shape, self.evaluate(entropy).shape)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalMeanAndMode(self):
     with self.test_session():
       # Mu will be broadcast to [7, 7, 7].
@@ -301,11 +338,12 @@ class NormalTest(test.TestCase):
       normal = normal_lib.Normal(loc=mu, scale=sigma)
 
       self.assertAllEqual((3,), normal.mean().get_shape())
-      self.assertAllEqual([7., 7, 7], normal.mean().eval())
+      self.assertAllEqual([7., 7, 7], self.evaluate(normal.mean()))
 
       self.assertAllEqual((3,), normal.mode().get_shape())
-      self.assertAllEqual([7., 7, 7], normal.mode().eval())
+      self.assertAllEqual([7., 7, 7], self.evaluate(normal.mode()))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalQuantile(self):
     with self.test_session():
       batch_size = 52
@@ -319,15 +357,18 @@ class NormalTest(test.TestCase):
       normal = normal_lib.Normal(loc=mu, scale=sigma)
       x = normal.quantile(p)
 
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), x.get_shape())
-      self.assertAllEqual(normal.batch_shape_tensor().eval(), x.eval().shape)
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()), x.get_shape())
+      self.assertAllEqual(
+          self.evaluate(normal.batch_shape_tensor()),
+          self.evaluate(x).shape)
       self.assertAllEqual(normal.batch_shape, x.get_shape())
-      self.assertAllEqual(normal.batch_shape, x.eval().shape)
+      self.assertAllEqual(normal.batch_shape, self.evaluate(x).shape)
 
       if not stats:
         return
       expected_x = stats.norm(mu, sigma).ppf(p)
-      self.assertAllClose(expected_x, x.eval(), atol=0.)
+      self.assertAllClose(expected_x, self.evaluate(x), atol=0.)
 
   def _baseQuantileFiniteGradientAtDifficultPoints(self, dtype):
     g = ops.Graph()
@@ -354,6 +395,7 @@ class NormalTest(test.TestCase):
   def testQuantileFiniteGradientAtDifficultPointsFloat64(self):
     self._baseQuantileFiniteGradientAtDifficultPoints(np.float64)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalVariance(self):
     with self.test_session():
       # sigma will be broadcast to [7, 7, 7]
@@ -363,8 +405,9 @@ class NormalTest(test.TestCase):
       normal = normal_lib.Normal(loc=mu, scale=sigma)
 
       self.assertAllEqual((3,), normal.variance().get_shape())
-      self.assertAllEqual([49., 49, 49], normal.variance().eval())
+      self.assertAllEqual([49., 49, 49], self.evaluate(normal.variance()))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalStandardDeviation(self):
     with self.test_session():
       # sigma will be broadcast to [7, 7, 7]
@@ -374,8 +417,9 @@ class NormalTest(test.TestCase):
       normal = normal_lib.Normal(loc=mu, scale=sigma)
 
       self.assertAllEqual((3,), normal.stddev().get_shape())
-      self.assertAllEqual([7., 7, 7], normal.stddev().eval())
+      self.assertAllEqual([7., 7, 7], self.evaluate(normal.stddev()))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalSample(self):
     with self.test_session():
       mu = constant_op.constant(3.0)
@@ -385,7 +429,7 @@ class NormalTest(test.TestCase):
       n = constant_op.constant(100000)
       normal = normal_lib.Normal(loc=mu, scale=sigma)
       samples = normal.sample(n)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       # Note that the standard error for the sample mean is ~ sigma / sqrt(n).
       # The sample variance similarly is dependent on sigma and n.
       # Thus, the tolerances below are very sensitive to number of samples
@@ -394,18 +438,22 @@ class NormalTest(test.TestCase):
       self.assertAllClose(sample_values.mean(), mu_v, atol=1e-1)
       self.assertAllClose(sample_values.std(), sigma_v, atol=1e-1)
 
-      expected_samples_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
-          tensor_shape.TensorShape(normal.batch_shape_tensor().eval()))
+      expected_samples_shape = tensor_shape.TensorShape(
+          [self.evaluate(n)]).concatenate(
+              tensor_shape.TensorShape(
+                  self.evaluate(normal.batch_shape_tensor())))
 
       self.assertAllEqual(expected_samples_shape, samples.get_shape())
       self.assertAllEqual(expected_samples_shape, sample_values.shape)
 
-      expected_samples_shape = (tensor_shape.TensorShape(
-          [n.eval()]).concatenate(normal.batch_shape))
+      expected_samples_shape = (
+          tensor_shape.TensorShape([self.evaluate(n)]).concatenate(
+              normal.batch_shape))
 
       self.assertAllEqual(expected_samples_shape, samples.get_shape())
       self.assertAllEqual(expected_samples_shape, sample_values.shape)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalSampleMultiDimensional(self):
     with self.test_session():
       batch_size = 2
@@ -417,7 +465,7 @@ class NormalTest(test.TestCase):
       n = constant_op.constant(100000)
       normal = normal_lib.Normal(loc=mu, scale=sigma)
       samples = normal.sample(n)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       # Note that the standard error for the sample mean is ~ sigma / sqrt(n).
       # The sample variance similarly is dependent on sigma and n.
       # Thus, the tolerances below are very sensitive to number of samples
@@ -428,32 +476,37 @@ class NormalTest(test.TestCase):
       self.assertAllClose(sample_values[:, 0, 1].mean(), mu_v[1], atol=1e-1)
       self.assertAllClose(sample_values[:, 0, 1].std(), sigma_v[1], atol=1e-1)
 
-      expected_samples_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
-          tensor_shape.TensorShape(normal.batch_shape_tensor().eval()))
+      expected_samples_shape = tensor_shape.TensorShape(
+          [self.evaluate(n)]).concatenate(
+              tensor_shape.TensorShape(
+                  self.evaluate(normal.batch_shape_tensor())))
       self.assertAllEqual(expected_samples_shape, samples.get_shape())
       self.assertAllEqual(expected_samples_shape, sample_values.shape)
 
-      expected_samples_shape = (tensor_shape.TensorShape(
-          [n.eval()]).concatenate(normal.batch_shape))
+      expected_samples_shape = (
+          tensor_shape.TensorShape([self.evaluate(n)]).concatenate(
+              normal.batch_shape))
       self.assertAllEqual(expected_samples_shape, samples.get_shape())
       self.assertAllEqual(expected_samples_shape, sample_values.shape)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNegativeSigmaFails(self):
     with self.test_session():
-      normal = normal_lib.Normal(
-          loc=[1.], scale=[-5.], validate_args=True, name="G")
       with self.assertRaisesOpError("Condition x > 0 did not hold"):
-        normal.mean().eval()
+        normal = normal_lib.Normal(
+            loc=[1.], scale=[-5.], validate_args=True, name="G")
+        self.evaluate(normal.mean())
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalShape(self):
     with self.test_session():
       mu = constant_op.constant([-3.0] * 5)
       sigma = constant_op.constant(11.0)
       normal = normal_lib.Normal(loc=mu, scale=sigma)
 
-      self.assertEqual(normal.batch_shape_tensor().eval(), [5])
+      self.assertEqual(self.evaluate(normal.batch_shape_tensor()), [5])
       self.assertEqual(normal.batch_shape, tensor_shape.TensorShape([5]))
-      self.assertAllEqual(normal.event_shape_tensor().eval(), [])
+      self.assertAllEqual(self.evaluate(normal.event_shape_tensor()), [])
       self.assertEqual(normal.event_shape, tensor_shape.TensorShape([]))
 
   def testNormalShapeWithPlaceholders(self):
@@ -465,31 +518,31 @@ class NormalTest(test.TestCase):
       # get_batch_shape should return an "<unknown>" tensor.
       self.assertEqual(normal.batch_shape, tensor_shape.TensorShape(None))
       self.assertEqual(normal.event_shape, ())
-      self.assertAllEqual(normal.event_shape_tensor().eval(), [])
+      self.assertAllEqual(self.evaluate(normal.event_shape_tensor()), [])
       self.assertAllEqual(
           sess.run(normal.batch_shape_tensor(),
                    feed_dict={mu: 5.0,
                               sigma: [1.0, 2.0]}), [2])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNormalNormalKL(self):
-    with self.test_session() as sess:
-      batch_size = 6
-      mu_a = np.array([3.0] * batch_size)
-      sigma_a = np.array([1.0, 2.0, 3.0, 1.5, 2.5, 3.5])
-      mu_b = np.array([-3.0] * batch_size)
-      sigma_b = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0])
+    batch_size = 6
+    mu_a = np.array([3.0] * batch_size)
+    sigma_a = np.array([1.0, 2.0, 3.0, 1.5, 2.5, 3.5])
+    mu_b = np.array([-3.0] * batch_size)
+    sigma_b = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0])
 
-      n_a = normal_lib.Normal(loc=mu_a, scale=sigma_a)
-      n_b = normal_lib.Normal(loc=mu_b, scale=sigma_b)
+    n_a = normal_lib.Normal(loc=mu_a, scale=sigma_a)
+    n_b = normal_lib.Normal(loc=mu_b, scale=sigma_b)
 
-      kl = kullback_leibler.kl_divergence(n_a, n_b)
-      kl_val = sess.run(kl)
+    kl = kullback_leibler.kl_divergence(n_a, n_b)
+    kl_val = self.evaluate(kl)
 
-      kl_expected = ((mu_a - mu_b)**2 / (2 * sigma_b**2) + 0.5 * (
-          (sigma_a**2 / sigma_b**2) - 1 - 2 * np.log(sigma_a / sigma_b)))
+    kl_expected = ((mu_a - mu_b)**2 / (2 * sigma_b**2) + 0.5 * (
+        (sigma_a**2 / sigma_b**2) - 1 - 2 * np.log(sigma_a / sigma_b)))
 
-      self.assertEqual(kl.get_shape(), (batch_size,))
-      self.assertAllClose(kl_val, kl_expected)
+    self.assertEqual(kl.get_shape(), (batch_size,))
+    self.assertAllClose(kl_val, kl_expected)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/distributions/special_math_test.py b/tensorflow/python/kernel_tests/distributions/special_math_test.py
index 2d434a39c2933832daebbe8f710de9553d0bf38d..4565bf5c4669b4d416049816046f6f8ed187270d 100644
--- a/tensorflow/python/kernel_tests/distributions/special_math_test.py
+++ b/tensorflow/python/kernel_tests/distributions/special_math_test.py
@@ -23,11 +23,14 @@ import importlib
 
 import numpy as np
 
+from tensorflow.python.eager import backprop as tfe_backprop
+from tensorflow.python.eager import context as tfe_context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import variables
 from tensorflow.python.ops.distributions import special_math
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
@@ -64,6 +67,16 @@ def _make_grid(dtype, grid_spec):
   return np.reshape(grid, grid_spec.shape)
 
 
+def _value_and_gradient(fn, *args):
+  """Calls `fn` and computes the gradient of the result wrt `arg`."""
+  if tfe_context.executing_eagerly():
+    v, g = tfe_backprop.val_and_grad_function(fn)(args)
+  else:
+    v = fn(*args)
+    g = gradients_impl.gradients(v, args)
+  return v, g
+
+
 GridSpec = collections.namedtuple("GridSpec", ["min", "max", "shape"])
 
 ErrorSpec = collections.namedtuple("ErrorSpec", ["rtol", "atol"])
@@ -71,11 +84,12 @@ ErrorSpec = collections.namedtuple("ErrorSpec", ["rtol", "atol"])
 
 class NdtriTest(test.TestCase):
 
-  def assertAllFinite(self, tensor):
-    is_finite = np.isfinite(tensor.eval())
+  def assertAllFinite(self, x):
+    is_finite = np.isfinite(x)
     all_true = np.ones_like(is_finite, dtype=np.bool)
     self.assertAllEqual(all_true, is_finite)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNdtri(self):
     """Verifies that ndtri computation is correct."""
     with self.test_session():
@@ -89,7 +103,7 @@ class NdtriTest(test.TestCase):
                      np.exp(-2), 1. - np.exp(-2)))
       expected_x = special.ndtri(p)
       x = special_math.ndtri(p)
-      self.assertAllClose(expected_x, x.eval(), atol=0.)
+      self.assertAllClose(expected_x, self.evaluate(x), atol=0.)
 
   def testNdtriDynamicShape(self):
     """Verifies that ndtri computation is correct."""
@@ -108,27 +122,32 @@ class NdtriTest(test.TestCase):
 
   def _baseNdtriFiniteGradientTest(self, dtype):
     """Verifies that ndtri has finite gradients at interesting points."""
-    g = ops.Graph()
-    with g.as_default():
-      # Tests gradients at 0, 1, and piece-wise boundaries.
-      p = variables.Variable(
-          np.array([0.,
-                    np.exp(-32.), np.exp(-2.),
-                    1. - np.exp(-2.), 1. - np.exp(-32.),
-                    1.]).astype(dtype))
-    value = special_math.ndtri(p)
-    grads = gradients_impl.gradients(value, p)
-    with self.test_session(graph=g):
-      variables.global_variables_initializer().run()
-      self.assertAllFinite(grads[0])
-
+    # Tests gradients at 0, 1, and piece-wise boundaries.
+    p = constant_op.constant(
+        np.array([
+            0.,
+            np.exp(-32.),
+            np.exp(-2.),
+            1. - np.exp(-2.),
+            1. - np.exp(-32.),
+            1.,
+        ]).astype(dtype))
+    # Not having the lambda sanitzer means we'd get an `IndexError` whenever
+    # the user supplied function has default args.
+    _, grads = _value_and_gradient(
+        lambda x: special_math.ndtri(x), p)  # pylint: disable=unnecessary-lambda
+    self.assertAllFinite(self.evaluate(grads[0]))
+
+  @test_util.run_in_graph_and_eager_modes()
   def testNdtriFiniteGradientFloat32(self):
     self._baseNdtriFiniteGradientTest(np.float32)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testNdtriFiniteGradientFloat64(self):
     self._baseNdtriFiniteGradientTest(np.float64)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class NdtrTest(test.TestCase):
   _use_log = False
   # Grid min/max chosen to ensure 0 < cdf(x) < 1.
@@ -147,55 +166,53 @@ class NdtrTest(test.TestCase):
     if not special:
       return
 
-    with self.test_session():
-      grid = _make_grid(dtype, grid_spec)
-      actual = sm.log_ndtr(grid).eval()
-
-      # Basic tests.
-      # isfinite checks for NaN and Inf.
-      self.assertTrue(np.isfinite(actual).all())
-      # On the grid, -inf < log_cdf(x) < 0.  In this case, we should be able
-      # to use a huge grid because we have used tricks to escape numerical
-      # difficulties.
-      self.assertTrue((actual < 0).all())
-      _check_strictly_increasing(actual)
-
-      # Versus scipy.
-      expected = special.log_ndtr(grid)
-      # Scipy prematurely goes to zero at some places that we don't.  So don't
-      # include these in the comparison.
-      self.assertAllClose(
-          expected.astype(np.float64)[expected < 0],
-          actual.astype(np.float64)[expected < 0],
-          rtol=error_spec.rtol,
-          atol=error_spec.atol)
+    grid = _make_grid(dtype, grid_spec)
+    actual = self.evaluate(sm.log_ndtr(grid))
+
+    # Basic tests.
+    # isfinite checks for NaN and Inf.
+    self.assertTrue(np.isfinite(actual).all())
+    # On the grid, -inf < log_cdf(x) < 0.  In this case, we should be able
+    # to use a huge grid because we have used tricks to escape numerical
+    # difficulties.
+    self.assertTrue((actual < 0).all())
+    _check_strictly_increasing(actual)
+
+    # Versus scipy.
+    expected = special.log_ndtr(grid)
+    # Scipy prematurely goes to zero at some places that we don't.  So don't
+    # include these in the comparison.
+    self.assertAllClose(
+        expected.astype(np.float64)[expected < 0],
+        actual.astype(np.float64)[expected < 0],
+        rtol=error_spec.rtol,
+        atol=error_spec.atol)
 
   def _test_grid_no_log(self, dtype, grid_spec, error_spec):
     if not special:
       return
 
-    with self.test_session():
-      grid = _make_grid(dtype, grid_spec)
-      actual = sm.ndtr(grid).eval()
-
-      # Basic tests.
-      # isfinite checks for NaN and Inf.
-      self.assertTrue(np.isfinite(actual).all())
-      # On the grid, 0 < cdf(x) < 1.  The grid cannot contain everything due
-      # to numerical limitations of cdf.
-      self.assertTrue((actual > 0).all())
-      self.assertTrue((actual < 1).all())
-      _check_strictly_increasing(actual)
-
-      # Versus scipy.
-      expected = special.ndtr(grid)
-      # Scipy prematurely goes to zero at some places that we don't.  So don't
-      # include these in the comparison.
-      self.assertAllClose(
-          expected.astype(np.float64)[expected < 0],
-          actual.astype(np.float64)[expected < 0],
-          rtol=error_spec.rtol,
-          atol=error_spec.atol)
+    grid = _make_grid(dtype, grid_spec)
+    actual = self.evaluate(sm.ndtr(grid))
+
+    # Basic tests.
+    # isfinite checks for NaN and Inf.
+    self.assertTrue(np.isfinite(actual).all())
+    # On the grid, 0 < cdf(x) < 1.  The grid cannot contain everything due
+    # to numerical limitations of cdf.
+    self.assertTrue((actual > 0).all())
+    self.assertTrue((actual < 1).all())
+    _check_strictly_increasing(actual)
+
+    # Versus scipy.
+    expected = special.ndtr(grid)
+    # Scipy prematurely goes to zero at some places that we don't.  So don't
+    # include these in the comparison.
+    self.assertAllClose(
+        expected.astype(np.float64)[expected < 0],
+        actual.astype(np.float64)[expected < 0],
+        rtol=error_spec.rtol,
+        atol=error_spec.atol)
 
   def test_float32(self):
     self._test_grid(np.float32, self._grid32, self._error32)
@@ -254,14 +271,17 @@ class NdtrGradientTest(test.TestCase):
     self.assertAllEqual(np.zeros_like(v, dtype=np.bool), v)
 
   def _test_grad_finite(self, dtype):
-    with self.test_session():
-      x = variables.Variable([-100., 0., 100.], dtype=dtype)
-      output = (sm.log_ndtr(x) if self._use_log else sm.ndtr(x))
-      grad_output = gradients_impl.gradients(output, x)
-      variables.global_variables_initializer().run()
-      # isfinite checks for NaN and Inf.
-      self.assert_all_true(np.isfinite(output.eval()))
-      self.assert_all_true(np.isfinite(grad_output[0].eval()))
+    x = constant_op.constant([-100., 0., 100.], dtype=dtype)
+    output = (sm.log_ndtr(x) if self._use_log else sm.ndtr(x))
+    fn = sm.log_ndtr if self._use_log else sm.ndtr
+    # Not having the lambda sanitzer means we'd get an `IndexError` whenever
+    # the user supplied function has default args.
+    output, grad_output = _value_and_gradient(
+        lambda x_: fn(x_), x)  # pylint: disable=unnecessary-lambda
+    # isfinite checks for NaN and Inf.
+    output_, grad_output_ = self.evaluate([output, grad_output])
+    self.assert_all_true(np.isfinite(output_))
+    self.assert_all_true(np.isfinite(grad_output_[0]))
 
   def _test_grad_accuracy(self, dtype, grid_spec, error_spec):
     raw_grid = _make_grid(dtype, grid_spec)
@@ -357,7 +377,6 @@ class ErfInvTest(test.TestCase):
         special_math.erfinv(x)
 
 
-
 class LogCDFLaplaceTest(test.TestCase):
   # Note that scipy.stats.laplace does not have a stable Log CDF, so we cannot
   # rely on scipy to cross check the extreme values.
diff --git a/tensorflow/python/kernel_tests/distributions/student_t_test.py b/tensorflow/python/kernel_tests/distributions/student_t_test.py
index f1150de58e0dae5da25f74f95fb391c340a01262..a4fdb658e857d832d5bf69485bbfb2517646a7b7 100644
--- a/tensorflow/python/kernel_tests/distributions/student_t_test.py
+++ b/tensorflow/python/kernel_tests/distributions/student_t_test.py
@@ -25,6 +25,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops.distributions import student_t
@@ -44,6 +45,7 @@ def try_import(name):  # pylint: disable=invalid-name
 stats = try_import("scipy.stats")
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class StudentTTest(test.TestCase):
 
   def testStudentPDFAndLogPDF(self):
@@ -60,10 +62,10 @@ class StudentTTest(test.TestCase):
 
       log_pdf = student.log_prob(t)
       self.assertEquals(log_pdf.get_shape(), (6,))
-      log_pdf_values = log_pdf.eval()
+      log_pdf_values = self.evaluate(log_pdf)
       pdf = student.prob(t)
       self.assertEquals(pdf.get_shape(), (6,))
-      pdf_values = pdf.eval()
+      pdf_values = self.evaluate(pdf)
 
       if not stats:
         return
@@ -88,10 +90,10 @@ class StudentTTest(test.TestCase):
       t = np.array([[-2.5, 2.5, 4., 0., -1., 2.]], dtype=np.float32).T
       student = student_t.StudentT(df, loc=mu, scale=sigma)
       log_pdf = student.log_prob(t)
-      log_pdf_values = log_pdf.eval()
+      log_pdf_values = self.evaluate(log_pdf)
       self.assertEqual(log_pdf.get_shape(), (6, 2))
       pdf = student.prob(t)
-      pdf_values = pdf.eval()
+      pdf_values = self.evaluate(pdf)
       self.assertEqual(pdf.get_shape(), (6, 2))
 
       if not stats:
@@ -117,10 +119,10 @@ class StudentTTest(test.TestCase):
 
       log_cdf = student.log_cdf(t)
       self.assertEquals(log_cdf.get_shape(), (6,))
-      log_cdf_values = log_cdf.eval()
+      log_cdf_values = self.evaluate(log_cdf)
       cdf = student.cdf(t)
       self.assertEquals(cdf.get_shape(), (6,))
-      cdf_values = cdf.eval()
+      cdf_values = self.evaluate(cdf)
 
       if not stats:
         return
@@ -140,7 +142,7 @@ class StudentTTest(test.TestCase):
     with self.test_session():
       student = student_t.StudentT(df=df_v, loc=mu_v, scale=sigma_v)
       ent = student.entropy()
-      ent_values = ent.eval()
+      ent_values = self.evaluate(ent)
 
     # Help scipy broadcast to 3x3
     ones = np.array([[1, 1, 1]])
@@ -167,7 +169,7 @@ class StudentTTest(test.TestCase):
       n = constant_op.constant(200000)
       student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       samples = student.sample(n, seed=123456)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       n_val = 200000
       self.assertEqual(sample_values.shape, (n_val,))
       self.assertAllClose(sample_values.mean(), mu_v, rtol=1e-2, atol=0)
@@ -189,12 +191,12 @@ class StudentTTest(test.TestCase):
       random_seed.set_random_seed(654321)
       student = student_t.StudentT(
           df=df, loc=mu, scale=sigma, name="student_t1")
-      samples1 = student.sample(n, seed=123456).eval()
+      samples1 = self.evaluate(student.sample(n, seed=123456))
 
       random_seed.set_random_seed(654321)
       student2 = student_t.StudentT(
           df=df, loc=mu, scale=sigma, name="student_t2")
-      samples2 = student2.sample(n, seed=123456).eval()
+      samples2 = self.evaluate(student2.sample(n, seed=123456))
 
       self.assertAllClose(samples1, samples2)
 
@@ -205,7 +207,7 @@ class StudentTTest(test.TestCase):
       n = constant_op.constant(200000)
       student = student_t.StudentT(df=df, loc=1., scale=1.)
       samples = student.sample(n, seed=123456)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       n_val = 200000
       self.assertEqual(sample_values.shape, (n_val, 4))
       self.assertTrue(np.all(np.logical_not(np.isnan(sample_values))))
@@ -223,7 +225,7 @@ class StudentTTest(test.TestCase):
       n = constant_op.constant(200000)
       student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       samples = student.sample(n, seed=123456)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertEqual(samples.get_shape(), (200000, batch_size, 2))
       self.assertAllClose(
           sample_values[:, 0, 0].mean(), mu_v[0], rtol=1e-2, atol=0)
@@ -325,7 +327,7 @@ class StudentTTest(test.TestCase):
     with self.test_session():
       mu = [1., 3.3, 4.4]
       student = student_t.StudentT(df=[3., 5., 7.], loc=mu, scale=[3., 2., 1.])
-      mean = student.mean().eval()
+      mean = self.evaluate(student.mean())
       self.assertAllClose([1., 3.3, 4.4], mean)
 
   def testMeanAllowNanStatsIsFalseRaisesWhenBatchMemberIsUndefined(self):
@@ -335,7 +337,7 @@ class StudentTTest(test.TestCase):
           df=[0.5, 5., 7.], loc=mu, scale=[3., 2., 1.],
           allow_nan_stats=False)
       with self.assertRaisesOpError("x < y"):
-        student.mean().eval()
+        self.evaluate(student.mean())
 
   def testMeanAllowNanStatsIsTrueReturnsNaNForUndefinedBatchMembers(self):
     with self.test_session():
@@ -344,7 +346,7 @@ class StudentTTest(test.TestCase):
       student = student_t.StudentT(
           df=[0.5, 1., 3., 5., 7.], loc=mu, scale=sigma,
           allow_nan_stats=True)
-      mean = student.mean().eval()
+      mean = self.evaluate(student.mean())
       self.assertAllClose([np.nan, np.nan, 1., 3.3, 4.4], mean)
 
   def testVarianceAllowNanStatsTrueReturnsNaNforUndefinedBatchMembers(self):
@@ -356,7 +358,7 @@ class StudentTTest(test.TestCase):
       sigma = [5., 4., 3., 2., 1.]
       student = student_t.StudentT(
           df=df, loc=mu, scale=sigma, allow_nan_stats=True)
-      var = student.variance().eval()
+      var = self.evaluate(student.variance())
       ## scipy uses inf for variance when the mean is undefined.  When mean is
       # undefined we say variance is undefined as well.  So test the first
       # member of var, making sure it is NaN, then replace with inf and compare
@@ -379,7 +381,7 @@ class StudentTTest(test.TestCase):
       mu = [0., 1., 3.3, 4.4]
       sigma = [4., 3., 2., 1.]
       student = student_t.StudentT(df=df, loc=mu, scale=sigma)
-      var = student.variance().eval()
+      var = self.evaluate(student.variance())
 
       if not stats:
         return
@@ -394,14 +396,14 @@ class StudentTTest(test.TestCase):
       student = student_t.StudentT(
           df=1., loc=0., scale=1., allow_nan_stats=False)
       with self.assertRaisesOpError("x < y"):
-        student.variance().eval()
+        self.evaluate(student.variance())
 
     with self.test_session():
       # df <= 1 ==> variance not defined
       student = student_t.StudentT(
           df=0.5, loc=0., scale=1., allow_nan_stats=False)
       with self.assertRaisesOpError("x < y"):
-        student.variance().eval()
+        self.evaluate(student.variance())
 
   def testStd(self):
     with self.test_session():
@@ -411,7 +413,7 @@ class StudentTTest(test.TestCase):
       sigma = [5., 4., 3., 2., 1.]
       student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       # Test broadcast of mu across shape of df/sigma
-      stddev = student.stddev().eval()
+      stddev = self.evaluate(student.stddev())
       mu *= len(df)
 
       if not stats:
@@ -428,59 +430,58 @@ class StudentTTest(test.TestCase):
       sigma = [5., 4., 3.]
       student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       # Test broadcast of mu across shape of df/sigma
-      mode = student.mode().eval()
+      mode = self.evaluate(student.mode())
       self.assertAllClose([-1., 0, 1], mode)
 
   def testPdfOfSample(self):
-    with self.test_session() as sess:
-      student = student_t.StudentT(df=3., loc=np.pi, scale=1.)
-      num = 20000
-      samples = student.sample(num, seed=123456)
-      pdfs = student.prob(samples)
-      mean = student.mean()
-      mean_pdf = student.prob(student.mean())
-      sample_vals, pdf_vals, mean_val, mean_pdf_val = sess.run(
-          [samples, pdfs, student.mean(), mean_pdf])
-      self.assertEqual(samples.get_shape(), (num,))
-      self.assertEqual(pdfs.get_shape(), (num,))
-      self.assertEqual(mean.get_shape(), ())
-      self.assertNear(np.pi, np.mean(sample_vals), err=0.02)
-      self.assertNear(np.pi, mean_val, err=1e-6)
-      # Verify integral over sample*pdf ~= 1.
-      self._assertIntegral(sample_vals, pdf_vals, err=2e-3)
-      if not stats:
-        return
-      self.assertNear(stats.t.pdf(np.pi, 3., loc=np.pi), mean_pdf_val, err=1e-6)
+    student = student_t.StudentT(df=3., loc=np.pi, scale=1.)
+    num = 20000
+    samples = student.sample(num, seed=123456)
+    pdfs = student.prob(samples)
+    mean = student.mean()
+    mean_pdf = student.prob(student.mean())
+    sample_vals, pdf_vals, mean_val, mean_pdf_val = self.evaluate(
+        [samples, pdfs, student.mean(), mean_pdf])
+    self.assertEqual(samples.get_shape(), (num,))
+    self.assertEqual(pdfs.get_shape(), (num,))
+    self.assertEqual(mean.get_shape(), ())
+    self.assertNear(np.pi, np.mean(sample_vals), err=0.02)
+    self.assertNear(np.pi, mean_val, err=1e-6)
+    # Verify integral over sample*pdf ~= 1.
+    # Tolerance increased since eager was getting a value of 1.002041.
+    self._assertIntegral(sample_vals, pdf_vals, err=3e-3)
+    if not stats:
+      return
+    self.assertNear(stats.t.pdf(np.pi, 3., loc=np.pi), mean_pdf_val, err=1e-6)
 
   def testPdfOfSampleMultiDims(self):
-    with self.test_session() as sess:
-      student = student_t.StudentT(df=[7., 11.], loc=[[5.], [6.]], scale=3.)
-      self.assertAllEqual([], student.event_shape)
-      self.assertAllEqual([], student.event_shape_tensor().eval())
-      self.assertAllEqual([2, 2], student.batch_shape)
-      self.assertAllEqual([2, 2], student.batch_shape_tensor().eval())
-      num = 50000
-      samples = student.sample(num, seed=123456)
-      pdfs = student.prob(samples)
-      sample_vals, pdf_vals = sess.run([samples, pdfs])
-      self.assertEqual(samples.get_shape(), (num, 2, 2))
-      self.assertEqual(pdfs.get_shape(), (num, 2, 2))
-      self.assertNear(5., np.mean(sample_vals[:, 0, :]), err=.03)
-      self.assertNear(6., np.mean(sample_vals[:, 1, :]), err=.03)
-      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
-      if not stats:
-        return
-      self.assertNear(
-          stats.t.var(7., loc=0., scale=3.),  # loc d.n. effect var
-          np.var(sample_vals[:, :, 0]),
-          err=.4)
-      self.assertNear(
-          stats.t.var(11., loc=0., scale=3.),  # loc d.n. effect var
-          np.var(sample_vals[:, :, 1]),
-          err=.4)
+    student = student_t.StudentT(df=[7., 11.], loc=[[5.], [6.]], scale=3.)
+    self.assertAllEqual([], student.event_shape)
+    self.assertAllEqual([], self.evaluate(student.event_shape_tensor()))
+    self.assertAllEqual([2, 2], student.batch_shape)
+    self.assertAllEqual([2, 2], self.evaluate(student.batch_shape_tensor()))
+    num = 50000
+    samples = student.sample(num, seed=123456)
+    pdfs = student.prob(samples)
+    sample_vals, pdf_vals = self.evaluate([samples, pdfs])
+    self.assertEqual(samples.get_shape(), (num, 2, 2))
+    self.assertEqual(pdfs.get_shape(), (num, 2, 2))
+    self.assertNear(5., np.mean(sample_vals[:, 0, :]), err=.03)
+    self.assertNear(6., np.mean(sample_vals[:, 1, :]), err=.03)
+    self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
+    self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
+    self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
+    self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
+    if not stats:
+      return
+    self.assertNear(
+        stats.t.var(7., loc=0., scale=3.),  # loc d.n. effect var
+        np.var(sample_vals[:, :, 0]),
+        err=.4)
+    self.assertNear(
+        stats.t.var(11., loc=0., scale=3.),  # loc d.n. effect var
+        np.var(sample_vals[:, :, 1]),
+        err=.4)
 
   def _assertIntegral(self, sample_vals, pdf_vals, err=1.5e-3):
     s_p = zip(sample_vals, pdf_vals)
@@ -494,10 +495,10 @@ class StudentTTest(test.TestCase):
 
   def testNegativeDofFails(self):
     with self.test_session():
-      student = student_t.StudentT(df=[2, -5.], loc=0., scale=1.,
-                                   validate_args=True, name="S")
       with self.assertRaisesOpError(r"Condition x > 0 did not hold"):
-        student.mean().eval()
+        student = student_t.StudentT(
+            df=[2, -5.], loc=0., scale=1., validate_args=True, name="S")
+        self.evaluate(student.mean())
 
   def testStudentTWithAbsDfSoftplusScale(self):
     with self.test_session():
@@ -507,9 +508,11 @@ class StudentTTest(test.TestCase):
       student = student_t.StudentTWithAbsDfSoftplusScale(
           df=df, loc=mu, scale=sigma)
       self.assertAllClose(
-          math_ops.floor(math_ops.abs(df)).eval(), student.df.eval())
-      self.assertAllClose(mu.eval(), student.loc.eval())
-      self.assertAllClose(nn_ops.softplus(sigma).eval(), student.scale.eval())
+          math_ops.floor(self.evaluate(math_ops.abs(df))),
+          self.evaluate(student.df))
+      self.assertAllClose(self.evaluate(mu), self.evaluate(student.loc))
+      self.assertAllClose(
+          self.evaluate(nn_ops.softplus(sigma)), self.evaluate(student.scale))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/distributions/uniform_test.py b/tensorflow/python/kernel_tests/distributions/uniform_test.py
index a8def95b147b6dd4825675769187733b8493b374..e74051c9013b7d51914868e66022546ae8862b60 100644
--- a/tensorflow/python/kernel_tests/distributions/uniform_test.py
+++ b/tensorflow/python/kernel_tests/distributions/uniform_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import uniform as uniform_lib
@@ -46,15 +47,17 @@ stats = try_import("scipy.stats")
 
 class UniformTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformRange(self):
     with self.test_session():
       a = 3.0
       b = 10.0
       uniform = uniform_lib.Uniform(low=a, high=b)
-      self.assertAllClose(a, uniform.low.eval())
-      self.assertAllClose(b, uniform.high.eval())
-      self.assertAllClose(b - a, uniform.range().eval())
+      self.assertAllClose(a, self.evaluate(uniform.low))
+      self.assertAllClose(b, self.evaluate(uniform.high))
+      self.assertAllClose(b - a, self.evaluate(uniform.range()))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformPDF(self):
     with self.test_session():
       a = constant_op.constant([-3.0] * 5 + [15.0])
@@ -75,22 +78,24 @@ class UniformTest(test.TestCase):
       expected_pdf = _expected_pdf()
 
       pdf = uniform.prob(x)
-      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertAllClose(expected_pdf, self.evaluate(pdf))
 
       log_pdf = uniform.log_prob(x)
-      self.assertAllClose(np.log(expected_pdf), log_pdf.eval())
+      self.assertAllClose(np.log(expected_pdf), self.evaluate(log_pdf))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformShape(self):
     with self.test_session():
       a = constant_op.constant([-3.0] * 5)
       b = constant_op.constant(11.0)
       uniform = uniform_lib.Uniform(low=a, high=b)
 
-      self.assertEqual(uniform.batch_shape_tensor().eval(), (5,))
+      self.assertEqual(self.evaluate(uniform.batch_shape_tensor()), (5,))
       self.assertEqual(uniform.batch_shape, tensor_shape.TensorShape([5]))
-      self.assertAllEqual(uniform.event_shape_tensor().eval(), [])
+      self.assertAllEqual(self.evaluate(uniform.event_shape_tensor()), [])
       self.assertEqual(uniform.event_shape, tensor_shape.TensorShape([]))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformPDFWithScalarEndpoint(self):
     with self.test_session():
       a = constant_op.constant([0.0, 5.0])
@@ -101,8 +106,9 @@ class UniformTest(test.TestCase):
       expected_pdf = np.array([1.0 / (10.0 - 0.0), 1.0 / (10.0 - 5.0)])
 
       pdf = uniform.prob(x)
-      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertAllClose(expected_pdf, self.evaluate(pdf))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformCDF(self):
     with self.test_session():
       batch_size = 6
@@ -121,11 +127,12 @@ class UniformTest(test.TestCase):
         return cdf
 
       cdf = uniform.cdf(x)
-      self.assertAllClose(_expected_cdf(), cdf.eval())
+      self.assertAllClose(_expected_cdf(), self.evaluate(cdf))
 
       log_cdf = uniform.log_cdf(x)
-      self.assertAllClose(np.log(_expected_cdf()), log_cdf.eval())
+      self.assertAllClose(np.log(_expected_cdf()), self.evaluate(log_cdf))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformEntropy(self):
     with self.test_session():
       a_v = np.array([1.0, 1.0, 1.0])
@@ -133,18 +140,20 @@ class UniformTest(test.TestCase):
       uniform = uniform_lib.Uniform(low=a_v, high=b_v)
 
       expected_entropy = np.log(b_v - a_v)
-      self.assertAllClose(expected_entropy, uniform.entropy().eval())
+      self.assertAllClose(expected_entropy, self.evaluate(uniform.entropy()))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformAssertMaxGtMin(self):
     with self.test_session():
       a_v = np.array([1.0, 1.0, 1.0], dtype=np.float32)
       b_v = np.array([1.0, 2.0, 3.0], dtype=np.float32)
-      uniform = uniform_lib.Uniform(low=a_v, high=b_v, validate_args=True)
 
       with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
                                                "x < y"):
-        uniform.low.eval()
+        uniform = uniform_lib.Uniform(low=a_v, high=b_v, validate_args=True)
+        self.evaluate(uniform.low)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformSample(self):
     with self.test_session():
       a = constant_op.constant([3.0, 4.0])
@@ -156,7 +165,7 @@ class UniformTest(test.TestCase):
       uniform = uniform_lib.Uniform(low=a, high=b)
 
       samples = uniform.sample(n, seed=137)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertEqual(sample_values.shape, (100000, 2))
       self.assertAllClose(
           sample_values[::, 0].mean(), (b_v + a1_v) / 2, atol=1e-2)
@@ -167,6 +176,7 @@ class UniformTest(test.TestCase):
       self.assertFalse(
           np.any(sample_values[::, 1] < a2_v) or np.any(sample_values >= b_v))
 
+  @test_util.run_in_graph_and_eager_modes()
   def _testUniformSampleMultiDimensional(self):
     # DISABLED: Please enable this test once b/issues/30149644 is resolved.
     with self.test_session():
@@ -183,7 +193,7 @@ class UniformTest(test.TestCase):
       samples = uniform.sample(n)
       self.assertEqual(samples.get_shape(), (n_v, batch_size, 2))
 
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
 
       self.assertFalse(
           np.any(sample_values[:, 0, 0] < a_v[0]) or
@@ -197,6 +207,7 @@ class UniformTest(test.TestCase):
       self.assertAllClose(
           sample_values[:, 0, 1].mean(), (a_v[1] + b_v[1]) / 2, atol=1e-2)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformMean(self):
     with self.test_session():
       a = 10.0
@@ -205,8 +216,9 @@ class UniformTest(test.TestCase):
       if not stats:
         return
       s_uniform = stats.uniform(loc=a, scale=b - a)
-      self.assertAllClose(uniform.mean().eval(), s_uniform.mean())
+      self.assertAllClose(self.evaluate(uniform.mean()), s_uniform.mean())
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformVariance(self):
     with self.test_session():
       a = 10.0
@@ -215,8 +227,9 @@ class UniformTest(test.TestCase):
       if not stats:
         return
       s_uniform = stats.uniform(loc=a, scale=b - a)
-      self.assertAllClose(uniform.variance().eval(), s_uniform.var())
+      self.assertAllClose(self.evaluate(uniform.variance()), s_uniform.var())
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformStd(self):
     with self.test_session():
       a = 10.0
@@ -225,8 +238,9 @@ class UniformTest(test.TestCase):
       if not stats:
         return
       s_uniform = stats.uniform(loc=a, scale=b - a)
-      self.assertAllClose(uniform.stddev().eval(), s_uniform.std())
+      self.assertAllClose(self.evaluate(uniform.stddev()), s_uniform.std())
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformNans(self):
     with self.test_session():
       a = 10.0
@@ -235,23 +249,26 @@ class UniformTest(test.TestCase):
 
       no_nans = constant_op.constant(1.0)
       nans = constant_op.constant(0.0) / constant_op.constant(0.0)
-      self.assertTrue(math_ops.is_nan(nans).eval())
+      self.assertTrue(self.evaluate(math_ops.is_nan(nans)))
       with_nans = array_ops.stack([no_nans, nans])
 
       pdf = uniform.prob(with_nans)
 
-      is_nan = math_ops.is_nan(pdf).eval()
+      is_nan = self.evaluate(math_ops.is_nan(pdf))
       self.assertFalse(is_nan[0])
       self.assertTrue(is_nan[1])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformSamplePdf(self):
     with self.test_session():
       a = 10.0
       b = [11.0, 100.0]
       uniform = uniform_lib.Uniform(a, b)
       self.assertTrue(
-          math_ops.reduce_all(uniform.prob(uniform.sample(10)) > 0).eval())
+          self.evaluate(
+              math_ops.reduce_all(uniform.prob(uniform.sample(10)) > 0)))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformBroadcasting(self):
     with self.test_session():
       a = 10.0
@@ -260,8 +277,9 @@ class UniformTest(test.TestCase):
 
       pdf = uniform.prob([[10.5, 11.5], [9.0, 19.0], [10.5, 21.0]])
       expected_pdf = np.array([[1.0, 0.1], [0.0, 0.1], [1.0, 0.0]])
-      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertAllClose(expected_pdf, self.evaluate(pdf))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUniformSampleWithShape(self):
     with self.test_session():
       a = 10.0
@@ -275,12 +293,13 @@ class UniformTest(test.TestCase):
           [[1.0, 0.1], [1.0, 0.1], [1.0, 0.1]],
       ]
       # pylint: enable=bad-continuation
-      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertAllClose(expected_pdf, self.evaluate(pdf))
 
       pdf = uniform.prob(uniform.sample())
       expected_pdf = [1.0, 0.1]
-      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertAllClose(expected_pdf, self.evaluate(pdf))
 
+  # Eager doesn't pass due to a type mismatch in one of the ops.
   def testUniformFloat64(self):
     uniform = uniform_lib.Uniform(
         low=np.float64(0.), high=np.float64(1.))
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index 5ec80b95eefabee60fd4de1cbec11b7743ffff1b..63d19c15cfdc8c8a29b98d3cc76df5aaa22ee36d 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -22,6 +22,7 @@ import importlib
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -56,7 +57,6 @@ def _logit(x):
   return np.log(x) - np.log1p(-x)
 
 
-@test_util.with_c_api
 class AssertCloseTest(test.TestCase):
 
   def testAssertCloseIntegerDtype(self):
@@ -99,6 +99,7 @@ class AssertCloseTest(test.TestCase):
         with ops.control_dependencies([du.assert_close(y, z)]):
           array_ops.identity(y).eval(feed_dict=feed_dict)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testAssertCloseEpsilon(self):
     x = [0., 5, 10, 15, 20]
     # x != y
@@ -107,15 +108,15 @@ class AssertCloseTest(test.TestCase):
     z = [1e-8, 5, 10, 15, 20]
     with self.test_session():
       with ops.control_dependencies([du.assert_close(x, z)]):
-        array_ops.identity(x).eval()
+        self.evaluate(array_ops.identity(x))
 
       with self.assertRaisesOpError("Condition x ~= y"):
         with ops.control_dependencies([du.assert_close(x, y)]):
-          array_ops.identity(x).eval()
+          self.evaluate(array_ops.identity(x))
 
       with self.assertRaisesOpError("Condition x ~= y"):
         with ops.control_dependencies([du.assert_close(y, z)]):
-          array_ops.identity(y).eval()
+          self.evaluate(array_ops.identity(y))
 
   def testAssertIntegerForm(self):
     # This should only be detected as an integer.
@@ -147,9 +148,38 @@ class AssertCloseTest(test.TestCase):
           array_ops.identity(w).eval(feed_dict=feed_dict)
 
 
-@test_util.with_c_api
+class MaybeGetStaticTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGetStaticInt(self):
+    x = 2
+    self.assertEqual(x, du.maybe_get_static_value(x))
+    self.assertAllClose(
+        np.array(2.), du.maybe_get_static_value(x, dtype=np.float64))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGetStaticNumpyArray(self):
+    x = np.array(2, dtype=np.int32)
+    self.assertEqual(x, du.maybe_get_static_value(x))
+    self.assertAllClose(
+        np.array(2.), du.maybe_get_static_value(x, dtype=np.float64))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGetStaticConstant(self):
+    x = constant_op.constant(2, dtype=dtypes.int32)
+    self.assertEqual(np.array(2, dtype=np.int32), du.maybe_get_static_value(x))
+    self.assertAllClose(
+        np.array(2.), du.maybe_get_static_value(x, dtype=np.float64))
+
+  def testGetStaticPlaceholder(self):
+    x = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+    self.assertEqual(None, du.maybe_get_static_value(x))
+    self.assertEqual(None, du.maybe_get_static_value(x, dtype=np.float64))
+
+
 class GetLogitsAndProbsTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testImproperArguments(self):
     with self.test_session():
       with self.assertRaises(ValueError):
@@ -158,6 +188,7 @@ class GetLogitsAndProbsTest(test.TestCase):
       with self.assertRaises(ValueError):
         du.get_logits_and_probs(logits=[0.1], probs=[0.1])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testLogits(self):
     p = np.array([0.01, 0.2, 0.5, 0.7, .99], dtype=np.float32)
     logits = _logit(p)
@@ -166,9 +197,10 @@ class GetLogitsAndProbsTest(test.TestCase):
       new_logits, new_p = du.get_logits_and_probs(
           logits=logits, validate_args=True)
 
-      self.assertAllClose(p, new_p.eval(), rtol=1e-5, atol=0.)
-      self.assertAllClose(logits, new_logits.eval(), rtol=1e-5, atol=0.)
+      self.assertAllClose(p, self.evaluate(new_p), rtol=1e-5, atol=0.)
+      self.assertAllClose(logits, self.evaluate(new_logits), rtol=1e-5, atol=0.)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testLogitsMultidimensional(self):
     p = np.array([0.2, 0.3, 0.5], dtype=np.float32)
     logits = np.log(p)
@@ -177,9 +209,10 @@ class GetLogitsAndProbsTest(test.TestCase):
       new_logits, new_p = du.get_logits_and_probs(
           logits=logits, multidimensional=True, validate_args=True)
 
-      self.assertAllClose(new_p.eval(), p)
-      self.assertAllClose(new_logits.eval(), logits)
+      self.assertAllClose(self.evaluate(new_p), p)
+      self.assertAllClose(self.evaluate(new_logits), logits)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testProbability(self):
     p = np.array([0.01, 0.2, 0.5, 0.7, .99], dtype=np.float32)
 
@@ -187,9 +220,10 @@ class GetLogitsAndProbsTest(test.TestCase):
       new_logits, new_p = du.get_logits_and_probs(
           probs=p, validate_args=True)
 
-      self.assertAllClose(_logit(p), new_logits.eval())
-      self.assertAllClose(p, new_p.eval())
+      self.assertAllClose(_logit(p), self.evaluate(new_logits))
+      self.assertAllClose(p, self.evaluate(new_p))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testProbabilityMultidimensional(self):
     p = np.array([[0.3, 0.4, 0.3], [0.1, 0.5, 0.4]], dtype=np.float32)
 
@@ -197,9 +231,10 @@ class GetLogitsAndProbsTest(test.TestCase):
       new_logits, new_p = du.get_logits_and_probs(
           probs=p, multidimensional=True, validate_args=True)
 
-      self.assertAllClose(np.log(p), new_logits.eval())
-      self.assertAllClose(p, new_p.eval())
+      self.assertAllClose(np.log(p), self.evaluate(new_logits))
+      self.assertAllClose(p, self.evaluate(new_p))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testProbabilityValidateArgs(self):
     p = [0.01, 0.2, 0.5, 0.7, .99]
     # Component less than 0.
@@ -210,26 +245,27 @@ class GetLogitsAndProbsTest(test.TestCase):
     with self.test_session():
       _, prob = du.get_logits_and_probs(
           probs=p, validate_args=True)
-      prob.eval()
+      self.evaluate(prob)
 
       with self.assertRaisesOpError("Condition x >= 0"):
         _, prob = du.get_logits_and_probs(
             probs=p2, validate_args=True)
-        prob.eval()
+        self.evaluate(prob)
 
       _, prob = du.get_logits_and_probs(
           probs=p2, validate_args=False)
-      prob.eval()
+      self.evaluate(prob)
 
       with self.assertRaisesOpError("probs has components greater than 1"):
         _, prob = du.get_logits_and_probs(
             probs=p3, validate_args=True)
-        prob.eval()
+        self.evaluate(prob)
 
       _, prob = du.get_logits_and_probs(
           probs=p3, validate_args=False)
-      prob.eval()
+      self.evaluate(prob)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testProbabilityValidateArgsMultidimensional(self):
     p = np.array([[0.3, 0.4, 0.3], [0.1, 0.5, 0.4]], dtype=np.float32)
     # Component less than 0. Still sums to 1.
@@ -242,35 +278,35 @@ class GetLogitsAndProbsTest(test.TestCase):
     with self.test_session():
       _, prob = du.get_logits_and_probs(
           probs=p, multidimensional=True)
-      prob.eval()
+      self.evaluate(prob)
 
       with self.assertRaisesOpError("Condition x >= 0"):
         _, prob = du.get_logits_and_probs(
             probs=p2, multidimensional=True, validate_args=True)
-        prob.eval()
+        self.evaluate(prob)
 
       _, prob = du.get_logits_and_probs(
           probs=p2, multidimensional=True, validate_args=False)
-      prob.eval()
+      self.evaluate(prob)
 
       with self.assertRaisesOpError(
           "(probs has components greater than 1|probs does not sum to 1)"):
         _, prob = du.get_logits_and_probs(
             probs=p3, multidimensional=True, validate_args=True)
-        prob.eval()
+        self.evaluate(prob)
 
       _, prob = du.get_logits_and_probs(
           probs=p3, multidimensional=True, validate_args=False)
-      prob.eval()
+      self.evaluate(prob)
 
       with self.assertRaisesOpError("probs does not sum to 1"):
         _, prob = du.get_logits_and_probs(
             probs=p4, multidimensional=True, validate_args=True)
-        prob.eval()
+        self.evaluate(prob)
 
       _, prob = du.get_logits_and_probs(
           probs=p4, multidimensional=True, validate_args=False)
-      prob.eval()
+      self.evaluate(prob)
 
   def testProbsMultidimShape(self):
     with self.test_session():
@@ -301,7 +337,6 @@ class GetLogitsAndProbsTest(test.TestCase):
         logit.eval(feed_dict={l: np.ones([int(2**11+1)])})
 
 
-@test_util.with_c_api
 class EmbedCheckCategoricalEventShapeTest(test.TestCase):
 
   def testTooSmall(self):
@@ -332,6 +367,7 @@ class EmbedCheckCategoricalEventShapeTest(test.TestCase):
             param)
         checked_param.eval(feed_dict={param: np.ones([int(2**11+1)])})
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUnsupportedDtype(self):
     with self.test_session():
       with self.assertRaises(TypeError):
@@ -339,7 +375,6 @@ class EmbedCheckCategoricalEventShapeTest(test.TestCase):
         du.embed_check_categorical_event_shape(param)
 
 
-@test_util.with_c_api
 class EmbedCheckIntegerCastingClosedTest(test.TestCase):
 
   def testCorrectlyAssertsNonnegative(self):
@@ -375,7 +410,7 @@ class EmbedCheckIntegerCastingClosedTest(test.TestCase):
         x_checked.eval(feed_dict={x: np.array([1, -1], dtype=np.int32)})
 
 
-@test_util.with_c_api
+@test_util.run_all_in_graph_and_eager_modes
 class LogCombinationsTest(test.TestCase):
 
   def testLogCombinationsBinomial(self):
@@ -392,7 +427,7 @@ class LogCombinationsTest(test.TestCase):
       counts = [[1., 1], [2., 3], [4., 8], [11, 4]]
       log_binom = du.log_combinations(n, counts)
       self.assertEqual([4], log_binom.get_shape())
-      self.assertAllClose(log_combs, log_binom.eval())
+      self.assertAllClose(log_combs, self.evaluate(log_binom))
 
   def testLogCombinationsShape(self):
     # Shape [2, 2]
@@ -406,7 +441,6 @@ class LogCombinationsTest(test.TestCase):
       self.assertEqual([2, 2], log_binom.get_shape())
 
 
-@test_util.with_c_api
 class DynamicShapeTest(test.TestCase):
 
   def testSameDynamicShape(self):
@@ -511,7 +545,6 @@ class DynamicShapeTest(test.TestCase):
               }))
 
 
-@test_util.with_c_api
 class RotateTransposeTest(test.TestCase):
 
   def _np_rotate_transpose(self, x, shift):
@@ -519,14 +552,20 @@ class RotateTransposeTest(test.TestCase):
       x = np.array(x)
     return np.transpose(x, np.roll(np.arange(len(x.shape)), shift))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testRollStatic(self):
     with self.test_session():
-      with self.assertRaisesRegexp(ValueError, "None values not supported."):
+      if context.executing_eagerly():
+        error_message = r"Attempt to convert a value \(None\)"
+      else:
+        error_message = "None values not supported."
+      with self.assertRaisesRegexp(ValueError, error_message):
         du.rotate_transpose(None, 1)
       for x in (np.ones(1), np.ones((2, 1)), np.ones((3, 2, 1))):
         for shift in np.arange(-5, 5):
           y = du.rotate_transpose(x, shift)
-          self.assertAllEqual(self._np_rotate_transpose(x, shift), y.eval())
+          self.assertAllEqual(
+              self._np_rotate_transpose(x, shift), self.evaluate(y))
           self.assertAllEqual(np.roll(x.shape, shift), y.get_shape().as_list())
 
   def testRollDynamic(self):
@@ -545,19 +584,16 @@ class RotateTransposeTest(test.TestCase):
                                   shift: shift_value}))
 
 
-@test_util.with_c_api
 class PickVectorTest(test.TestCase):
 
   def testCorrectlyPicksVector(self):
     with self.test_session():
       x = np.arange(10, 12)
       y = np.arange(15, 18)
-      self.assertAllEqual(x,
-                          du.pick_vector(
-                              math_ops.less(0, 5), x, y).eval())
-      self.assertAllEqual(y,
-                          du.pick_vector(
-                              math_ops.less(5, 0), x, y).eval())
+      self.assertAllEqual(
+          x, self.evaluate(du.pick_vector(math_ops.less(0, 5), x, y)))
+      self.assertAllEqual(
+          y, self.evaluate(du.pick_vector(math_ops.less(5, 0), x, y)))
       self.assertAllEqual(x,
                           du.pick_vector(
                               constant_op.constant(True), x, y))  # No eval.
@@ -566,7 +602,6 @@ class PickVectorTest(test.TestCase):
                               constant_op.constant(False), x, y))  # No eval.
 
 
-@test_util.with_c_api
 class PreferStaticRankTest(test.TestCase):
 
   def testNonEmptyConstantTensor(self):
@@ -606,7 +641,6 @@ class PreferStaticRankTest(test.TestCase):
       self.assertAllEqual(0, rank.eval(feed_dict={x: 1}))
 
 
-@test_util.with_c_api
 class PreferStaticShapeTest(test.TestCase):
 
   def testNonEmptyConstantTensor(self):
@@ -646,7 +680,6 @@ class PreferStaticShapeTest(test.TestCase):
       self.assertAllEqual(np.array([]), shape.eval(feed_dict={x: 1}))
 
 
-@test_util.with_c_api
 class PreferStaticValueTest(test.TestCase):
 
   def testNonEmptyConstantTensor(self):
@@ -687,7 +720,6 @@ class PreferStaticValueTest(test.TestCase):
       self.assertAllEqual(np.array(1), value.eval(feed_dict={x: 1}))
 
 
-@test_util.with_c_api
 class FillTriangularTest(test.TestCase):
 
   def setUp(self):
@@ -782,7 +814,6 @@ class FillTriangularTest(test.TestCase):
     self._run_test(self._rng.randn(2, 3, int(7*8/2)), upper=True)
 
 
-@test_util.with_c_api
 class ReduceWeightedLogSumExp(test.TestCase):
 
   def _reduce_weighted_logsumexp(self, logx, w, axis, keep_dims=False):
@@ -858,28 +889,27 @@ class ReduceWeightedLogSumExp(test.TestCase):
                                 [1, 1, 1]])
 
       self.assertAllClose(
-          np.log(4),
-          du.reduce_weighted_logsumexp(x, w).eval())
+          np.log(4), self.evaluate(du.reduce_weighted_logsumexp(x, w)))
 
       with np.errstate(divide="ignore"):
         self.assertAllClose(
             np.log([0, 2, 2]),
-            du.reduce_weighted_logsumexp(x, w, axis=0).eval())
+            self.evaluate(du.reduce_weighted_logsumexp(x, w, axis=0)))
 
       self.assertAllClose(
           np.log([1, 3]),
-          du.reduce_weighted_logsumexp(x, w, axis=1).eval())
+          self.evaluate(du.reduce_weighted_logsumexp(x, w, axis=1)))
 
       self.assertAllClose(
           np.log([[1], [3]]),
-          du.reduce_weighted_logsumexp(x, w, axis=1, keep_dims=True).eval())
+          self.evaluate(
+              du.reduce_weighted_logsumexp(x, w, axis=1, keep_dims=True)))
 
       self.assertAllClose(
           np.log(4),
-          du.reduce_weighted_logsumexp(x, w, axis=[0, 1]).eval())
+          self.evaluate(du.reduce_weighted_logsumexp(x, w, axis=[0, 1])))
 
 
-@test_util.with_c_api
 class GenNewSeedTest(test.TestCase):
 
   def testOnlyNoneReturnsNone(self):
@@ -890,7 +920,6 @@ class GenNewSeedTest(test.TestCase):
 # TODO(jvdillon): Merge this test back into:
 # tensorflow/python/kernel_tests/softplus_op_test.py
 # once TF core is accepting new ops.
-@test_util.with_c_api
 class SoftplusTest(test.TestCase):
 
   def _npSoftplus(self, np_features):
@@ -976,7 +1005,7 @@ class SoftplusTest(test.TestCase):
       # Note that this range contains both zero and inf.
       x = constant_op.constant(np.logspace(-8, 6).astype(np.float16))
       y = du.softplus_inverse(x)
-      grads = gradients_impl.gradients(y, x)[0].eval()
+      grads = self.evaluate(gradients_impl.gradients(y, x)[0])
       # Equivalent to `assertAllFalse` (if it existed).
       self.assertAllEqual(np.zeros_like(grads).astype(np.bool), np.isnan(grads))
 
@@ -986,11 +1015,69 @@ class SoftplusTest(test.TestCase):
       # gradient and its approximations should be finite as well.
       x = constant_op.constant(np.logspace(-4.8, 4.5).astype(np.float16))
       y = du.softplus_inverse(x)
-      grads = gradients_impl.gradients(y, x)[0].eval()
+      grads = self.evaluate(gradients_impl.gradients(y, x)[0])
       # Equivalent to `assertAllTrue` (if it existed).
       self.assertAllEqual(
           np.ones_like(grads).astype(np.bool), np.isfinite(grads))
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class ArgumentsTest(test.TestCase):
+
+  def testNoArguments(self):
+    def foo():
+      return du.parent_frame_arguments()
+
+    self.assertEqual({}, foo())
+
+  def testPositionalArguments(self):
+    def foo(a, b, c, d):  # pylint: disable=unused-argument
+      return du.parent_frame_arguments()
+
+    self.assertEqual({"a": 1, "b": 2, "c": 3, "d": 4}, foo(1, 2, 3, 4))
+
+    # Tests that it does not matter where this function is called, and
+    # no other local variables are returned back.
+    def bar(a, b, c):
+      unused_x = a * b
+      unused_y = c * 3
+      return du.parent_frame_arguments()
+
+    self.assertEqual({"a": 1, "b": 2, "c": 3}, bar(1, 2, 3))
+
+  def testOverloadedArgumentValues(self):
+    def foo(a, b, c):  # pylint: disable=unused-argument
+      a = 42
+      b = 31
+      c = 42
+      return du.parent_frame_arguments()
+    self.assertEqual({"a": 42, "b": 31, "c": 42}, foo(1, 2, 3))
+
+  def testKeywordArguments(self):
+    def foo(**kwargs):  # pylint: disable=unused-argument
+      return du.parent_frame_arguments()
+
+    self.assertEqual({"a": 1, "b": 2, "c": 3, "d": 4}, foo(a=1, b=2, c=3, d=4))
+
+  def testPositionalKeywordArgs(self):
+    def foo(a, b, c, **kwargs):  # pylint: disable=unused-argument
+      return du.parent_frame_arguments()
+
+    self.assertEqual({"a": 1, "b": 2, "c": 3}, foo(a=1, b=2, c=3))
+    self.assertEqual({"a": 1, "b": 2, "c": 3, "unicorn": None},
+                     foo(a=1, b=2, c=3, unicorn=None))
+
+  def testNoVarargs(self):
+    def foo(a, b, c, *varargs, **kwargs):  # pylint: disable=unused-argument
+      return du.parent_frame_arguments()
+
+    self.assertEqual({"a": 1, "b": 2, "c": 3}, foo(a=1, b=2, c=3))
+    self.assertEqual({"a": 1, "b": 2, "c": 3}, foo(1, 2, 3, *[1, 2, 3]))
+    self.assertEqual({"a": 1, "b": 2, "c": 3, "unicorn": None},
+                     foo(1, 2, 3, unicorn=None))
+    self.assertEqual({"a": 1, "b": 2, "c": 3, "unicorn": None},
+                     foo(1, 2, 3, *[1, 2, 3], unicorn=None))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 35a274e75f51b795cb4e2041e4dfa8da012f6a58..facadc971ff516e4f9edea0c4f52ab0953ec5fce 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tensorflow.kernels.bcast_ops."""
+"""Tests for tensorflow.kernels.functional_ops."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -328,6 +328,23 @@ class FunctionalOpsTest(test.TestCase):
       self.assertAllEqual([2., 4., 12., 48., 240., 1440.], self.evaluate(r))
       # pylint: enable=unnecessary-lambda
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testScan_Reverse(self):
+    with self.test_session():
+      elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
+      v = constant_op.constant(2.0, name="v")
+
+      # pylint: disable=unnecessary-lambda
+      r = functional_ops.scan(lambda a, x: math_ops.multiply(a, x), elems,
+                              reverse=True)
+      self.assertAllEqual([720., 720., 360., 120., 30., 6.], self.evaluate(r))
+      r = functional_ops.scan(
+          lambda a, x: math_ops.multiply(a, x), elems, initializer=v,
+          reverse=True)
+      self.assertAllEqual([1440., 1440., 720., 240., 60., 12.],
+                          self.evaluate(r))
+      # pylint: enable=unnecessary-lambda
+
   @test_util.run_in_graph_and_eager_modes()
   def testScan_SingleInputMultiOutput(self):
     with self.test_session():
@@ -670,117 +687,117 @@ class FunctionalOpsTest(test.TestCase):
 
     with self.test_session(use_gpu=False) as sess:
 
-      def Run(x):
-        return sess.run(
-            functional_ops.If(math_ops.greater(x, 0), [x], Twice, Thrice))[0]
+      x = array_ops.placeholder(dtypes.float32)
+      ret = functional_ops.If(math_ops.greater(x, 0), [x], Twice, Thrice)[0]
 
-      self.assertAllEqual(Run(9.), 18.)
-      self.assertAllEqual(Run(-8.), -23.)
-      self.assertAllEqual(Run(0.), 1.)
+      self.assertAllEqual(sess.run(ret, feed_dict={x: 9.}), 18.)
+      self.assertAllEqual(sess.run(ret, feed_dict={x: -8.}), -23.)
+      self.assertAllEqual(sess.run(ret, feed_dict={x: 0.}), 1.)
 
   def testWhile(self):
 
-    @function.Defun(*[dtypes.float32] * 2)
-    def Cond(n, unused_x):
-      return n > 0
+    for use_gpu in (True, False):
+      with ops.Graph().as_default() as g:
 
-    @function.Defun(*[dtypes.float32] * 2)
-    def Body(n, x):
-      return n - 1, x + n
+        @function.Defun(*[dtypes.float32] * 2)
+        def Cond(n, unused_x):
+          return n > 0
 
-    # TODO(b/65752372): Set `use_gpu=False` because
-    # `functional_ops.While()` does not reliably work on GPU (apparently
-    # because the result of evaluating the condition may be in device
-    # memory, but it is read on the host).
-    with self.test_session(use_gpu=False) as sess:
+        @function.Defun(*[dtypes.float32] * 2)
+        def Body(n, x):
+          return n - 1, x + n
 
-      def Run(n):
-        return sess.run(functional_ops.While([n, 0.], Cond, Body))[1]
+        def Run(sess, n):
+          return sess.run(functional_ops.While([n, 0.], Cond, Body))[1]
 
-      self.assertAllEqual(Run(20.), 210.)
-      self.assertAllEqual(Run(100.), 5050.)
+        with self.test_session(graph=g, use_gpu=use_gpu) as sess:
+          self.assertAllEqual(Run(sess, 20.), 210.)
+          self.assertAllEqual(Run(sess, 100.), 5050.)
 
   def testWhileError(self):
-
-    @function.Defun(*[dtypes.float32] * 2)
-    def Cond(n, unused_x):
-      return n > 0
-
-    @function.Defun(*[dtypes.float32] * 2)
-    def CondReturnsTooManyArgs(n, x):
-      return n > 0, x
-
-    @function.Defun(*[dtypes.float32] * 2)
-    def Body(n, x):
-      return n - 1, x + n
-
-    @function.Defun(*[dtypes.float32] * 2)
-    def BodyReturnsTooManyArgs(n, x):
-      return n - 1, x + n, x
-
-    # TODO(b/65752372): Set `use_gpu=False` because
-    # `functional_ops.While()` does not reliably work on GPU (apparently
-    # because the result of evaluating the condition may be in device
-    # memory, but it is read on the host).
-    with self.test_session(use_gpu=False):
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "Expected a single scalar.*got 2 tensors."):
-        functional_ops.While([5., 0.], CondReturnsTooManyArgs, Body)[0].eval()
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "While loop body returned 3 arguments. Expected: 2"):
-        functional_ops.While([5., 0.], Cond, BodyReturnsTooManyArgs)[0].eval()
+    for use_gpu in (True, False):
+      with ops.Graph().as_default() as g:
+
+        @function.Defun(*[dtypes.float32] * 2)
+        def Cond(n, unused_x):
+          return n > 0
+
+        @function.Defun(*[dtypes.float32] * 2)
+        def CondReturnsTooManyArgs(n, x):
+          return n > 0, x
+
+        @function.Defun(*[dtypes.float32] * 2)
+        def Body(n, x):
+          return n - 1, x + n
+
+        @function.Defun(*[dtypes.float32] * 2)
+        def BodyReturnsTooManyArgs(n, x):
+          return n - 1, x + n, x
+
+        with self.test_session(graph=g, use_gpu=use_gpu):
+          with self.assertRaisesRegexp(
+              errors.InvalidArgumentError,
+              "Expected a single scalar.*got 2 tensors."):
+            functional_ops.While([5., 0.], CondReturnsTooManyArgs,
+                                 Body)[0].eval()
+          with self.assertRaisesRegexp(
+              errors.InvalidArgumentError,
+              "While loop body returned 3 arguments. Expected: 2"):
+            functional_ops.While([5., 0.], Cond,
+                                 BodyReturnsTooManyArgs)[0].eval()
 
   def testWhileInMultipleSubgraphs(self):
 
-    @function.Defun(* [dtypes.float32] * 2)
-    def Cond(n, x):  # pylint: disable=unused-argument
-      return n > 0
-
-    @function.Defun(* [dtypes.float32] * 2)
-    def Body(n, x):
-      return n - 1, x + n
-
-    # TODO(b/65752372): Set `use_gpu=False` because
-    # `functional_ops.While()` does not reliably work on GPU (apparently
-    # because the result of evaluating the condition may be in device
-    # memory, but it is read on the host).
-    with self.test_session(use_gpu=False) as sess:
-      n = array_ops.placeholder(dtypes.float32)
-      _, result = functional_ops.While([n, 0.], Cond, Body)
-      c = constant_op.constant(37.)
-
-      self.assertAllEqual(210., sess.run(result, feed_dict={n: 20.}))
-      self.assertAllEqual(5050., sess.run(result, feed_dict={n: 100.}))
-      # Test that the result is the same when we run a different subgraph.
-      self.assertAllEqual(5050., sess.run([result, c], feed_dict={n: 100.})[0])
-
-  def _tfSum(self, rewrite_with_while):
-    # On GPU, don't rewrite using a while loop.
-    use_gpu = not rewrite_with_while
-    with self.test_session(use_gpu=use_gpu) as sess:
-
-      @function.Defun(dtypes.int32, dtypes.float32)
-      def Body(n, x):
-        return x + math_ops.to_float(n)
-
-      xs = [
-          # 1 + 2  + ... + 20
-          functional_ops.For(
-              1, 21, 1, [0.], Body, rewrite_with_while=rewrite_with_while)[0],
-          # 100 + 99 + ... + 1
-          functional_ops.For(
-              100, 0, -1, [0.], Body, rewrite_with_while=rewrite_with_while)[0],
-      ]
-      xvals = sess.run(xs)
-    self.assertAllEqual(210, xvals[0])
-    self.assertAllEqual(5050, xvals[1])
+    for use_gpu in (True, False):
+      with ops.Graph().as_default() as g:
+
+        @function.Defun(*[dtypes.float32] * 2)
+        def Cond(n, x):  # pylint: disable=unused-argument
+          return n > 0
+
+        @function.Defun(*[dtypes.float32] * 2)
+        def Body(n, x):
+          return n - 1, x + n
+
+        with self.test_session(graph=g, use_gpu=use_gpu) as sess:
+          n = array_ops.placeholder(dtypes.float32)
+          _, result = functional_ops.While([n, 0.], Cond, Body)
+          c = constant_op.constant(37.)
+
+          self.assertAllEqual(210., sess.run(result, feed_dict={n: 20.}))
+          self.assertAllEqual(5050., sess.run(result, feed_dict={n: 100.}))
+          # Test that the result is the same when we run a different subgraph.
+          self.assertAllEqual(5050.,
+                              sess.run([result, c], feed_dict={n: 100.})[0])
+
+  def _tfSum(self, use_gpu, rewrite_with_while):
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g, use_gpu=use_gpu) as sess:
+
+        @function.Defun(dtypes.int32, dtypes.float32)
+        def Body(n, x):
+          return x + math_ops.to_float(n)
+
+        xs = [
+            # 1 + 2  + ... + 20
+            functional_ops.For(
+                1, 21, 1, [0.], Body, rewrite_with_while=rewrite_with_while)[0],
+            # 100 + 99 + ... + 1
+            functional_ops.For(
+                100, 0, -1, [0.], Body, rewrite_with_while=rewrite_with_while)
+            [0],
+        ]
+        xvals = sess.run(xs)
+      self.assertAllEqual(210, xvals[0])
+      self.assertAllEqual(5050, xvals[1])
 
   def testFor(self):
-    self._tfSum(False)
+    for use_gpu in (True, False):
+      self._tfSum(use_gpu, False)
 
   def testForWithWhile(self):
-    self._tfSum(True)
+    for use_gpu in (True, False):
+      self._tfSum(use_gpu, True)
 
   def testForWithWhileNaming(self):
     g = ops.Graph()
@@ -816,10 +833,6 @@ class FunctionalOpsTest(test.TestCase):
       return x + math_ops.to_float(n) + v, x2 + v
 
     for rewrite_with_while in (True, False):
-      # TODO(b/65752372): Set `use_gpu=False` because
-      # `functional_ops.While()` does not reliably work on GPU (apparently
-      # because the result of evaluating the condition may be in device
-      # memory, but it is read on the host).
       use_gpu = not rewrite_with_while
       with self.test_session(use_gpu=use_gpu) as sess:
         result_nullary = functional_ops.For(
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index 91ebe8de9921268b2a3c5ad645585e1fe83c7419..58e2a8ac2a3b827647b1b1176f4b69e6a88b76c6 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -197,7 +197,21 @@ class GatherNdTest(test.TestCase):
     self.assertEqual(None, shape.ndims)
     self.assertEqual(None, shape[0].value)
 
-  def testBadIndices(self):
+  def testBadIndicesCPU(self):
+    with self.test_session(use_gpu=False):
+      params = [0, 1, 2]
+      indices = [[[0], [7]]]  # Make this one higher rank
+      gather_nd = array_ops.gather_nd(params, indices)
+      with self.assertRaisesOpError(
+          r"flat indices\[1, :\] = \[7\] does not index into param "
+          r"\(shape: \[3\]\)"):
+        gather_nd.eval()
+
+  def _disabledTestBadIndicesGPU(self):
+    # TODO disabled due to different behavior on GPU and CPU
+    # On GPU the bad indices do not raise error but fetch 0 values
+    if not test.is_gpu_available():
+      return
     with self.test_session(use_gpu=True):
       params = [0, 1, 2]
       indices = [[[0], [7]]]  # Make this one higher rank
@@ -207,7 +221,21 @@ class GatherNdTest(test.TestCase):
           r"\(shape: \[3\]\)"):
         gather_nd.eval()
 
-  def testBadIndicesWithSlices(self):
+  def testBadIndicesWithSlicesCPU(self):
+    with self.test_session(use_gpu=False):
+      params = [[0, 1, 2]]
+      indices = [[[0], [0], [1]]]  # Make this one higher rank
+      gather_nd = array_ops.gather_nd(params, indices)
+      with self.assertRaisesOpError(
+          r"flat indices\[2, :\] = \[1\] does not index into param "
+          r"\(shape: \[1,3\]\)"):
+        gather_nd.eval()
+
+  def _disabledTestBadIndicesWithSlicesGPU(self):
+    # TODO disabled due to different behavior on GPU and CPU
+    # On GPU the bad indices do not raise error but fetch 0 values
+    if not test.is_gpu_available():
+      return
     with self.test_session(use_gpu=True):
       params = [[0, 1, 2]]
       indices = [[[0], [0], [1]]]  # Make this one higher rank
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index a2fcd751dfa94605d271587640815fae6ac1c360..033fa959359e20894c376341d2f9ad79d30a5878 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -27,7 +27,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.platform import test
 
-_TEST_TYPES = (dtypes.float32, dtypes.complex64, dtypes.complex128)
+_TEST_TYPES = (dtypes.int64, dtypes.float32,
+               dtypes.complex64, dtypes.complex128)
 
 
 class GatherTest(test.TestCase):
@@ -122,6 +123,9 @@ class GatherTest(test.TestCase):
                 gather, [tf_params, tf_indices, tf_axis], gather_grad)
             self.assertEqual(indices_grad, None)
             self.assertEqual(axis_grad, None)
+            if dtype.is_integer:
+              self.assertEqual(params_grad, None)
+              continue
             # For axis 0, we are able to create an efficient IndexedSlices for
             # the gradient.
             if axis == 0:
@@ -177,7 +181,19 @@ class GatherTest(test.TestCase):
     gather_t = array_ops.gather(params, indices, axis=axis)
     self.assertEqual(None, gather_t.shape)
 
-  def testBadIndices(self):
+  def testBadIndicesCPU(self):
+    with self.test_session(use_gpu=False):
+      params = [[0, 1, 2], [3, 4, 5]]
+      with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"):
+        array_ops.gather(params, [[7]], axis=0).eval()
+      with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"):
+        array_ops.gather(params, [[7]], axis=1).eval()
+
+  def _disabledTestBadIndicesGPU(self):
+    # TODO disabled due to different behavior on GPU and CPU
+    # On GPU the bad indices do not raise error but fetch 0 values
+    if not test.is_gpu_available():
+      return
     with self.test_session(use_gpu=True):
       params = [[0, 1, 2], [3, 4, 5]]
       with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"):
diff --git a/tensorflow/python/kernel_tests/large_concat_op_test.py b/tensorflow/python/kernel_tests/large_concat_op_test.py
index 184d1dde2aa5e8d786cb85141f8dfb90c0bdad63..66afb6ec014991ca32efd5b0895ff695d3d1015f 100644
--- a/tensorflow/python/kernel_tests/large_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/large_concat_op_test.py
@@ -19,12 +19,10 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-@test_util.with_c_api
 class LargeConcatOpTest(test.TestCase):
   """Tests that belong in concat_op_test.py, but run over large tensors."""
 
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index 052f11f92e90b0e703362832529318c6d9b3bd0d..91be80322c37792be02d1b625df6757c9d80b060 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -85,7 +85,10 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 5,
-    tags = ["noasan"],  # times out b/63678675
+    tags = [
+        "noasan",  # times out, b/63678675
+        "optonly",  # times out, b/79171797
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
index cce1ecd45e543e3a40f0c90dac65db83a028c42c..784c730bbc8179dd1302294b2d558e8a0c532c0c 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
@@ -97,6 +97,10 @@ class SquareLinearOperatorKroneckerTest(
         build_info((3, 6, 6), factors=[(3, 1, 1), (1, 2, 2), (1, 3, 3)]),
     ]
 
+  @property
+  def _tests_to_skip(self):
+    return ["det", "solve", "solve_with_broadcast"]
+
   def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
     shape = list(build_info.shape)
     expected_factors = build_info.__dict__["factors"]
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 098f9724a2a65f544005e1799f6329a29bb02abe..49855200c2427a88a4bd582c2ef786c38a6fa76a 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -43,6 +43,7 @@ def scalar_shape():
   return ops.convert_to_tensor([], dtype=dtypes.int32)
 
 
+@test_util.with_c_shapes
 class ListOpsTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py
index 361853448ce2c8477af6920257c58c1eba0fa952..944de217a175764aa2e43b4fc488d912041e279a 100644
--- a/tensorflow/python/kernel_tests/pad_op_test.py
+++ b/tensorflow/python/kernel_tests/pad_op_test.py
@@ -317,6 +317,11 @@ class PadOpTest(test.TestCase):
                            [constant_op.constant(1, shape=[2]), [0, unknown]])
     self.assertEqual([6, None], padded.get_shape().as_list())
 
+    # Zero padding on a known dimension.
+    inp = array_ops.placeholder(dtypes.int32, [None, None, 20])
+    padded = array_ops.pad(inp, [[0, 0], [0, unknown], [0, 0]])
+    self.assertEqual([None, None, 20], padded.get_shape().as_list())
+
   def testScalars(self):
     paddings = np.zeros((0, 2), dtype=np.int32)
     inp = np.asarray(7)
diff --git a/tensorflow/python/kernel_tests/reduce_join_op_test.py b/tensorflow/python/kernel_tests/reduce_join_op_test.py
index fb9e5cc2a3727b70f731ca8ae946b6d5ef325833..663561ced781614e56902c3ef73da303dbdc6b5b 100644
--- a/tensorflow/python/kernel_tests/reduce_join_op_test.py
+++ b/tensorflow/python/kernel_tests/reduce_join_op_test.py
@@ -100,7 +100,7 @@ class ReduceJoinTest(UnicodeTestCase):
                       input_array,
                       truth,
                       truth_shape,
-                      reduction_indices,
+                      axis,
                       keep_dims=False,
                       separator=""):
     """Compares the output of reduce_join to an expected result.
@@ -109,14 +109,14 @@ class ReduceJoinTest(UnicodeTestCase):
       input_array: The string input to be joined.
       truth: An array or np.array of the expected result.
       truth_shape: An array or np.array of the expected shape.
-      reduction_indices: The indices to reduce over.
+      axis: The indices to reduce over.
       keep_dims: Whether or not to retain reduced dimensions.
       separator: The separator to use for joining.
     """
     with self.test_session():
       output = string_ops.reduce_join(
           inputs=input_array,
-          reduction_indices=reduction_indices,
+          axis=axis,
           keep_dims=keep_dims,
           separator=separator)
       output_array = output.eval()
@@ -124,11 +124,8 @@ class ReduceJoinTest(UnicodeTestCase):
     self.assertAllEqualUnicode(truth, output_array)
     self.assertAllEqual(truth_shape, output.get_shape())
 
-  def _testMultipleReduceJoin(self,
-                              input_array,
-                              reduction_indices,
-                              separator=" "):
-    """Tests reduce_join for one input and multiple reduction_indices.
+  def _testMultipleReduceJoin(self, input_array, axis, separator=" "):
+    """Tests reduce_join for one input and multiple axes.
 
     Does so by comparing the output to that from nested reduce_string_joins.
     The correctness of single-dimension reduce_join is verified by other
@@ -136,31 +133,22 @@ class ReduceJoinTest(UnicodeTestCase):
 
     Args:
       input_array: The input to test.
-      reduction_indices: The indices to reduce.
+      axis: The indices to reduce.
       separator: The separator to use when joining.
     """
     with self.test_session():
       output = string_ops.reduce_join(
-          inputs=input_array,
-          reduction_indices=reduction_indices,
-          keep_dims=False,
-          separator=separator)
+          inputs=input_array, axis=axis, keep_dims=False, separator=separator)
       output_keep_dims = string_ops.reduce_join(
-          inputs=input_array,
-          reduction_indices=reduction_indices,
-          keep_dims=True,
-          separator=separator)
+          inputs=input_array, axis=axis, keep_dims=True, separator=separator)
 
       truth = input_array
-      for index in reduction_indices:
+      for index in axis:
         truth = string_ops.reduce_join(
-            inputs=truth,
-            reduction_indices=index,
-            keep_dims=True,
-            separator=separator)
-      if not reduction_indices:
+            inputs=truth, axis=index, keep_dims=True, separator=separator)
+      if not axis:
         truth = constant_op.constant(truth)
-      truth_squeezed = array_ops.squeeze(truth, axis=reduction_indices)
+      truth_squeezed = array_ops.squeeze(truth, axis=axis)
       output_array = output.eval()
       output_keep_dims_array = output_keep_dims.eval()
       truth_array = truth.eval()
@@ -174,7 +162,7 @@ class ReduceJoinTest(UnicodeTestCase):
     input_array = ["this", "is", "a", "test"]
     truth = "thisisatest"
     truth_shape = []
-    self._testReduceJoin(input_array, truth, truth_shape, reduction_indices=0)
+    self._testReduceJoin(input_array, truth, truth_shape, axis=0)
 
   def testRankTwo(self):
     input_array = [["this", "is", "a", "test"],
@@ -184,43 +172,32 @@ class ReduceJoinTest(UnicodeTestCase):
     truth_dim_one = ["thisisatest", "pleasedonotpanic"]
     truth_shape_dim_one = [2]
     self._testReduceJoin(
-        input_array, truth_dim_zero, truth_shape_dim_zero, reduction_indices=0)
+        input_array, truth_dim_zero, truth_shape_dim_zero, axis=0)
     self._testReduceJoin(
-        input_array, truth_dim_one, truth_shape_dim_one, reduction_indices=1)
+        input_array, truth_dim_one, truth_shape_dim_one, axis=1)
 
-    expected_val = "thisisatestpleasedonotpanic"
-    expected_shape = None
-    self._testReduceJoin(
-        input_array, expected_val, expected_shape, reduction_indices=None)
-
-    # When using Tensor for input with reduction_indices=None, shape is known.
     expected_val = "thisisatestpleasedonotpanic"
     expected_shape = []
-    self._testReduceJoin(
-        constant_op.constant(input_array), expected_val,
-        expected_shape, reduction_indices=None)
+    self._testReduceJoin(input_array, expected_val, expected_shape, axis=None)
 
-    # Using [] reduction_indices is a no-op.
+    # Using axis=[] is a no-op.
     expected_val = input_array
     expected_shape = [2, 4]
-    self._testReduceJoin(
-        input_array, expected_val, expected_shape, reduction_indices=[])
+    self._testReduceJoin(input_array, expected_val, expected_shape, axis=[])
 
   def testRankFive(self):
     input_array = _input_array(num_dims=5)
     truths = [_joined_array(num_dims=5, reduce_dim=i) for i in xrange(5)]
     truth_shape = [2] * 4
     for i in xrange(5):
-      self._testReduceJoin(
-          input_array, truths[i], truth_shape, reduction_indices=i)
+      self._testReduceJoin(input_array, truths[i], truth_shape, axis=i)
 
   def testNegative(self):
     input_array = _input_array(num_dims=5)
     truths = [_joined_array(num_dims=5, reduce_dim=i) for i in xrange(5)]
     truth_shape = [2] * 4
     for i in xrange(5):
-      self._testReduceJoin(
-          input_array, truths[i], truth_shape, reduction_indices=i - 5)
+      self._testReduceJoin(input_array, truths[i], truth_shape, axis=i - 5)
 
   def testSingletonDimension(self):
     input_arrays = [
@@ -230,8 +207,7 @@ class ReduceJoinTest(UnicodeTestCase):
     truth = _input_array(num_dims=5)
     truth_shape = [2] * 5
     for i in xrange(6):
-      self._testReduceJoin(
-          input_arrays[i], truth, truth_shape, reduction_indices=i)
+      self._testReduceJoin(input_arrays[i], truth, truth_shape, axis=i)
 
   def testSeparator(self):
     input_array = [["this", "is", "a", "test"],
@@ -245,13 +221,13 @@ class ReduceJoinTest(UnicodeTestCase):
         input_array,
         truth_dim_zero,
         truth_shape_dim_zero,
-        reduction_indices=0,
+        axis=0,
         separator="  ")
     self._testReduceJoin(
         input_array,
         truth_dim_one,
         truth_shape_dim_one,
-        reduction_indices=1,
+        axis=1,
         separator="  ")
 
   def testUnknownShape(self):
@@ -260,7 +236,7 @@ class ReduceJoinTest(UnicodeTestCase):
     truth_shape = None
     with self.test_session():
       placeholder = array_ops.placeholder(dtypes.string, name="placeholder")
-      reduced = string_ops.reduce_join(placeholder, reduction_indices=0)
+      reduced = string_ops.reduce_join(placeholder, axis=0)
       output_array = reduced.eval(feed_dict={placeholder.name: input_array})
       self.assertAllEqualUnicode(truth, output_array)
       self.assertAllEqual(truth_shape, reduced.get_shape())
@@ -273,8 +249,7 @@ class ReduceJoinTest(UnicodeTestCase):
     truth_shape = None
     with self.test_session():
       placeholder = array_ops.placeholder(dtypes.int32, name="placeholder")
-      reduced = string_ops.reduce_join(
-          input_array, reduction_indices=placeholder)
+      reduced = string_ops.reduce_join(input_array, axis=placeholder)
       output_array_dim_zero = reduced.eval(feed_dict={placeholder.name: [0]})
       output_array_dim_one = reduced.eval(feed_dict={placeholder.name: [1]})
       self.assertAllEqualUnicode(truth_dim_zero, output_array_dim_zero)
@@ -293,27 +268,26 @@ class ReduceJoinTest(UnicodeTestCase):
         input_array,
         truth_dim_zero,
         truth_shape_dim_zero,
-        reduction_indices=0,
+        axis=0,
         keep_dims=True)
     self._testReduceJoin(
         input_array,
         truth_dim_one,
         truth_shape_dim_one,
-        reduction_indices=1,
+        axis=1,
         keep_dims=True)
 
     expected_val = [["thisisatestpleasedonotpanic"]]
     expected_shape = [1, 1]
     self._testReduceJoin(
         constant_op.constant(input_array), expected_val, expected_shape,
-        keep_dims=True, reduction_indices=None)
+        keep_dims=True, axis=None)
 
-    # Using [] reduction_indices is a no-op.
+    # Using axis=[] is a no-op.
     expected_val = input_array
     expected_shape = [2, 4]
     self._testReduceJoin(
-        input_array, expected_val, expected_shape,
-        keep_dims=True, reduction_indices=[])
+        input_array, expected_val, expected_shape, keep_dims=True, axis=[])
 
   def testMultiIndex(self):
     num_dims = 3
@@ -321,42 +295,41 @@ class ReduceJoinTest(UnicodeTestCase):
     # Also tests [].
     for i in xrange(num_dims + 1):
       for permutation in itertools.permutations(xrange(num_dims), i):
-        self._testMultipleReduceJoin(input_array, reduction_indices=permutation)
+        self._testMultipleReduceJoin(input_array, axis=permutation)
 
   def testInvalidReductionIndices(self):
     with self.test_session():
       with self.assertRaisesRegexp(ValueError, "Invalid reduction dim"):
-        string_ops.reduce_join(inputs="", reduction_indices=0)
+        string_ops.reduce_join(inputs="", axis=0)
       with self.assertRaisesRegexp(ValueError,
                                    "Invalid reduction dimension -3"):
-        string_ops.reduce_join(inputs=[[""]], reduction_indices=-3)
+        string_ops.reduce_join(inputs=[[""]], axis=-3)
       with self.assertRaisesRegexp(ValueError, "Invalid reduction dimension 2"):
-        string_ops.reduce_join(inputs=[[""]], reduction_indices=2)
+        string_ops.reduce_join(inputs=[[""]], axis=2)
       with self.assertRaisesRegexp(ValueError,
                                    "Invalid reduction dimension -3"):
-        string_ops.reduce_join(inputs=[[""]], reduction_indices=[0, -3])
+        string_ops.reduce_join(inputs=[[""]], axis=[0, -3])
       with self.assertRaisesRegexp(ValueError, "Invalid reduction dimension 2"):
-        string_ops.reduce_join(inputs=[[""]], reduction_indices=[0, 2])
+        string_ops.reduce_join(inputs=[[""]], axis=[0, 2])
 
   def testZeroDims(self):
     with self.test_session():
       inputs = np.zeros([0, 1], dtype=str)
 
       # Reduction that drops the dim of size 0.
-      output = string_ops.reduce_join(inputs=inputs, reduction_indices=0)
+      output = string_ops.reduce_join(inputs=inputs, axis=0)
       self.assertAllEqualUnicode([""], output.eval())
 
       # Reduction that keeps the dim of size 0.
-      output = string_ops.reduce_join(inputs=inputs, reduction_indices=1)
+      output = string_ops.reduce_join(inputs=inputs, axis=1)
       output_shape = output.eval().shape
       self.assertAllEqual([0], output_shape)
 
   def testInvalidArgsUnknownShape(self):
     with self.test_session():
       placeholder = array_ops.placeholder(dtypes.string, name="placeholder")
-      index_too_high = string_ops.reduce_join(placeholder, reduction_indices=1)
-      duplicate_index = string_ops.reduce_join(
-          placeholder, reduction_indices=[-1, 1])
+      index_too_high = string_ops.reduce_join(placeholder, axis=1)
+      duplicate_index = string_ops.reduce_join(placeholder, axis=[-1, 1])
       with self.assertRaisesOpError("Invalid reduction dimension 1"):
         index_too_high.eval(feed_dict={placeholder.name: [""]})
       with self.assertRaisesOpError("Duplicate reduction dimension 1"):
@@ -365,8 +338,7 @@ class ReduceJoinTest(UnicodeTestCase):
   def testInvalidArgsUnknownIndices(self):
     with self.test_session():
       placeholder = array_ops.placeholder(dtypes.int32, name="placeholder")
-      reduced = string_ops.reduce_join(
-          ["test", "test2"], reduction_indices=placeholder)
+      reduced = string_ops.reduce_join(["test", "test2"], axis=placeholder)
 
       with self.assertRaisesOpError("reduction dimension -2"):
         reduced.eval(feed_dict={placeholder.name: -2})
diff --git a/tensorflow/python/kernel_tests/regex_full_match_op_test.py b/tensorflow/python/kernel_tests/regex_full_match_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5daae1b79bf493bfcbc54037eea49ea7dff27647
--- /dev/null
+++ b/tensorflow/python/kernel_tests/regex_full_match_op_test.py
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for RegexFullMatch op from string_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class RegexFullMatchOpTest(test.TestCase):
+
+  def testRegexFullMatch(self):
+    values = ["abaaba", "abcdabcde"]
+    with self.test_session():
+      input_vector = constant_op.constant(values, dtypes.string)
+      matched = string_ops.regex_full_match(input_vector, "a.*a").eval()
+      self.assertAllEqual([True, False], matched)
+
+  def testEmptyMatch(self):
+    values = ["abc", "1"]
+    with self.test_session():
+      input_vector = constant_op.constant(values, dtypes.string)
+      matched = string_ops.regex_full_match(input_vector, "").eval()
+      self.assertAllEqual([False, False], matched)
+
+  def testInvalidPattern(self):
+    values = ["abc", "1"]
+    with self.test_session():
+      input_vector = constant_op.constant(values, dtypes.string)
+      invalid_pattern = "A["
+      matched = string_ops.regex_full_match(input_vector, invalid_pattern)
+      with self.assertRaisesOpError("Invalid pattern"):
+        matched.eval()
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 984192258c9724dd9d73105c65177786def98e83..846231fe8197461d42f4a23aeda64ffa370cd086 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -42,7 +42,6 @@ from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
 
 
-@test_util.with_c_api
 class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
 
   def tearDown(self):
@@ -107,6 +106,13 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v = resource_variable_ops.ResourceVariable(False, name="bool_test")
       self.assertAllEqual(bool(v), False)
 
+  def testDifferentAssignGraph(self):
+    with ops.Graph().as_default():
+      v = resource_variable_ops.ResourceVariable(1.0)
+    ops.reset_default_graph()
+    v.assign(2.0)  # Note: this fails if we run convert_to_tensor on not the
+                   # variable graph.
+
   def testFetchHandle(self):
     with self.test_session():
       handle = resource_variable_ops.var_handle_op(
@@ -400,6 +406,15 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
               resource_variable_ops.var_is_initialized_op(abc.handle)),
           True)
 
+  def testScatterBool(self):
+    with context.eager_mode():
+      ref = resource_variable_ops.ResourceVariable(
+          [False, True, False], trainable=False)
+      indices = math_ops.range(3)
+      updates = constant_op.constant([True, True, True])
+      state_ops.scatter_update(ref, indices, updates)
+      self.assertAllEqual(ref.read_value(), [True, True, True])
+
   @test_util.run_in_graph_and_eager_modes()
   def testConstraintArg(self):
     constraint = lambda x: x
diff --git a/tensorflow/python/kernel_tests/scalar_test.py b/tensorflow/python/kernel_tests/scalar_test.py
index 0d8fd232946883ac1d95c4c2d9744af69175ab90..287919bab72b425cf174efbcff7d885b5d02b57e 100644
--- a/tensorflow/python/kernel_tests/scalar_test.py
+++ b/tensorflow/python/kernel_tests/scalar_test.py
@@ -31,7 +31,6 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
-@test_util.with_c_api
 class ScalarTest(test.TestCase):
 
   def check(self, op, args, error, correct=None):
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index b7477a768ab718c5a57293cc6f774298e1c9f891..faa4b49a8d7d8b0169f10592845d3d30a3996c41 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -23,8 +23,11 @@ import functools
 import numpy as np
 
 from tensorflow.python.client import session
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
@@ -141,7 +144,9 @@ class StatefulScatterNdTest(test.TestCase):
         self.assertAllClose(new, ref_var.eval())
 
   def _VariableRankTests(self, np_scatter, tf_scatter):
-    for vtype in (np.float32, np.float64, np.complex64, np.complex128):
+    for vtype in (np.int32,
+                  np.float32, np.float64,
+                  np.complex64, np.complex128):
       for itype in (np.int32, np.int64):
         self._VariableRankTest(np_scatter, tf_scatter, vtype, itype)
 
@@ -218,7 +223,7 @@ class StatefulScatterNdTest(test.TestCase):
   #   self._VariableRankTests(_NumpyDiv, state_ops.scatter_nd_div)
 
   def _ScatterRepeatIndicesTest(self, np_scatter, tf_scatter):
-    for vtype in (np.float32, np.float64):
+    for vtype in (np.int32, np.float32, np.float64):
       for itype in (np.int32, np.int64):
         self._VariableRankTest(
             np_scatter, tf_scatter, vtype, itype, repeat_indices=True)
@@ -364,6 +369,15 @@ class ScatterNdTest(test.TestCase):
     del input_  # input_ is not used in scatter_nd
     return array_ops.scatter_nd(indices, updates, shape)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testInvalidShape(self):
+    # TODO(apassos) figure out how to unify these errors
+    with self.assertRaises(errors.InvalidArgumentError
+                           if context.executing_eagerly() else ValueError):
+      array_ops.scatter_nd(indices=[0],  # this should be indices=[[0]]
+                           updates=[0.0],
+                           shape=[1])
+
   def testString(self):
     indices = constant_op.constant([[4], [3], [1], [7]],
                                    dtype=dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py
index c70a4ffce7be71effe3ea10faa9754ab2b3842ce..1a0fa744aeec7b6df281835c266ebbd901f22fea 100644
--- a/tensorflow/python/kernel_tests/scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_ops_test.py
@@ -159,7 +159,13 @@ class ScatterTest(test.TestCase):
 
           # Clips small values to avoid division by zero.
           def clip_small_values(x):
-            return 1e-4 * np.sign(x) if np.abs(x) < 1e-4 else x
+            threshold = 1e-4
+            sign = np.sign(x)
+
+            if isinstance(x, np.int32):
+              threshold = 1
+              sign = np.random.choice([-1, 1])
+            return threshold * sign if np.abs(x) < threshold else x
 
           updates = np.vectorize(clip_small_values)(updates)
           old = _AsType(np.random.randn(*((first_dim,) + extra_shape)), vtype)
@@ -181,7 +187,11 @@ class ScatterTest(test.TestCase):
                          tf_scatter,
                          repeat_indices=False,
                          updates_are_scalar=False):
-    for vtype in (np.float32, np.float64):
+    vtypes = [np.float32, np.float64]
+    if tf_scatter != state_ops.scatter_div:
+      vtypes.append(np.int32)
+
+    for vtype in vtypes:
       for itype in (np.int32, np.int64):
         self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices,
                                updates_are_scalar)
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 3bca5fadc42693f514911c7ffa8f078de8ef9bcd..794be096b7309a18f9fe225642bcaafb5058df78 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -91,16 +91,18 @@ class SegmentReductionOpTest(SegmentReductionHelper):
     ]
 
     # Each item is np_op1, np_op2, tf_op
-    ops_list = [(np.add, None, math_ops.segment_sum), (self._mean_cum_op,
-                                                       self._mean_reduce_op,
-                                                       math_ops.segment_mean),
+    ops_list = [(np.add, None, math_ops.segment_sum),
+                (self._mean_cum_op, self._mean_reduce_op,
+                 math_ops.segment_mean),
                 (np.ndarray.__mul__, None, math_ops.segment_prod),
                 (np.minimum, None, math_ops.segment_min),
                 (np.maximum, None, math_ops.segment_max)]
 
     # A subset of ops has been enabled for complex numbers
     complex_ops_list = [(np.add, None, math_ops.segment_sum),
-                        (np.ndarray.__mul__, None, math_ops.segment_prod)]
+                        (np.ndarray.__mul__, None, math_ops.segment_prod),
+                        (self._mean_cum_op, self._mean_reduce_op,
+                         math_ops.segment_mean)]
 
     n = 10
     shape = [n, 2]
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index dc4d4dbeabf3c53be6e30cb357f4a1d2d7e69064..427c07cfb8e47d65ac013dec2ecc0753ce4f5c05 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -24,14 +24,12 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 
 
-@test_util.with_c_api
 class SoftmaxTest(test.TestCase):
 
   def _npSoftmax(self, features, dim=-1, log=False):
diff --git a/tensorflow/python/kernel_tests/sparse_cross_op_test.py b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
index 3d09badf27e621ec244730eb6c1f6b637546219f..ca7898d466972c71f4abca973693363919becf11 100644
--- a/tensorflow/python/kernel_tests/sparse_cross_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
@@ -32,7 +32,7 @@ class SparseCrossOpTest(test.TestCase):
 
   def test_simple(self):
     """Tests a simple scenario."""
-    op = sparse_ops._sparse_cross([
+    op = sparse_ops.sparse_cross([
         self._sparse_tensor([['batch1-FC1-F1'],
                              ['batch2-FC1-F1', 'batch2-FC1-F2']]),
         self._sparse_tensor([['batch1-FC2-F1'],
@@ -47,7 +47,7 @@ class SparseCrossOpTest(test.TestCase):
 
   def test_dense(self):
     """Tests only dense inputs."""
-    op = sparse_ops._sparse_cross([
+    op = sparse_ops.sparse_cross([
         constant_op.constant([['batch1-FC1-F1', 'batch1-FC1-F2'],
                               ['batch2-FC1-F1', 'batch2-FC1-F2']],
                              dtypes.string),
@@ -67,7 +67,7 @@ class SparseCrossOpTest(test.TestCase):
 
   def test_integer_mixed_string_sparse(self):
     """Tests mixed type."""
-    op = sparse_ops._sparse_cross([
+    op = sparse_ops.sparse_cross([
         self._sparse_tensor([[11], [333, 55555]]),
         self._sparse_tensor([['batch1-FC2-F1'],
                              ['batch2-FC2-F1', 'batch2-FC2-F2']])
@@ -81,7 +81,7 @@ class SparseCrossOpTest(test.TestCase):
 
   def test_integer_mixed_string_dense(self):
     """Tests mixed dense inputs."""
-    op = sparse_ops._sparse_cross([
+    op = sparse_ops.sparse_cross([
         constant_op.constant([[11, 333], [55555, 999999]], dtypes.int64),
         constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
                               ['batch2-FC2-F1', 'batch2-FC2-F2']],
@@ -99,7 +99,7 @@ class SparseCrossOpTest(test.TestCase):
 
   def test_sparse_cross_dense(self):
     """Tests sparse and dense inputs."""
-    op = sparse_ops._sparse_cross([
+    op = sparse_ops.sparse_cross([
         self._sparse_tensor([['batch1-FC1-F1'],
                              ['batch2-FC1-F1', 'batch2-FC1-F2']]),
         constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
@@ -116,7 +116,7 @@ class SparseCrossOpTest(test.TestCase):
 
   def test_integer_sparse_input(self):
     """Tests mixed type sparse and dense inputs."""
-    op = sparse_ops._sparse_cross([
+    op = sparse_ops.sparse_cross([
         self._sparse_tensor([[11], [333, 5555]]),
         constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
                               ['batch2-FC2-F1', 'batch2-FC2-F2']],
@@ -132,7 +132,7 @@ class SparseCrossOpTest(test.TestCase):
 
   def test_permutation_3x3x3(self):
     """Tests 3x3x3 permutation."""
-    op = sparse_ops._sparse_cross([
+    op = sparse_ops.sparse_cross([
         self._sparse_tensor(
             [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
         self._sparse_tensor(
@@ -174,7 +174,7 @@ class SparseCrossOpTest(test.TestCase):
 
   def test_permutation_3x1x2(self):
     """Tests 3x1x2 permutation."""
-    op = sparse_ops._sparse_cross([
+    op = sparse_ops.sparse_cross([
         self._sparse_tensor(
             [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
         self._sparse_tensor([['batch1-FC2-F1']]),
@@ -203,8 +203,9 @@ class SparseCrossOpTest(test.TestCase):
       col2.append(['batch%d-FC2-F1' % b])
       col3.append(['batch%d-FC3-F1' % b, 'batch%d-FC3-F2' % b])
 
-    op = sparse_ops._sparse_cross([
-        self._sparse_tensor(col1), self._sparse_tensor(col2),
+    op = sparse_ops.sparse_cross([
+        self._sparse_tensor(col1),
+        self._sparse_tensor(col2),
         self._sparse_tensor(col3)
     ])
 
@@ -228,7 +229,7 @@ class SparseCrossOpTest(test.TestCase):
 
     The crossed tensor should be empty.
     """
-    op = sparse_ops._sparse_cross([
+    op = sparse_ops.sparse_cross([
         self._sparse_tensor([['batch1-FC1-F1', 'batch1-FC1-F2']]),
         self._sparse_tensor([], 1),
         self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
@@ -241,7 +242,7 @@ class SparseCrossOpTest(test.TestCase):
 
     Cross for the corresponding batch should be empty.
     """
-    op = sparse_ops._sparse_cross([
+    op = sparse_ops.sparse_cross([
         self._sparse_tensor([['batch1-FC1-F1', 'batch1-FC1-F2']], 2),
         self._sparse_tensor([['batch1-FC2-F1'], ['batch2-FC2-F1']], 2),
         self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']], 2)
@@ -260,27 +261,27 @@ class SparseCrossOpTest(test.TestCase):
 
     The crossed tensor should be empty.
     """
-    op = sparse_ops._sparse_cross([
-        self._sparse_tensor([]), self._sparse_tensor([]),
+    op = sparse_ops.sparse_cross([
+        self._sparse_tensor([]),
+        self._sparse_tensor([]),
         self._sparse_tensor([])
     ])
     with self.test_session() as sess:
       self._assert_sparse_tensor_empty(sess.run(op))
 
   def test_hashed_zero_bucket_no_hash_key(self):
-    op = sparse_ops._sparse_cross_hashed(
-        [
-            self._sparse_tensor([['batch1-FC1-F1']]),
-            self._sparse_tensor([['batch1-FC2-F1']]),
-            self._sparse_tensor([['batch1-FC3-F1']])
-        ])
+    op = sparse_ops.sparse_cross_hashed([
+        self._sparse_tensor([['batch1-FC1-F1']]),
+        self._sparse_tensor([['batch1-FC2-F1']]),
+        self._sparse_tensor([['batch1-FC3-F1']])
+    ])
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[1971693436396284976]])
     with self.test_session() as sess:
       self._assert_sparse_tensor_equals(expected_out, sess.run(op))
 
   def test_hashed_zero_bucket(self):
-    op = sparse_ops._sparse_cross_hashed(
+    op = sparse_ops.sparse_cross_hashed(
         [
             self._sparse_tensor([['batch1-FC1-F1']]),
             self._sparse_tensor([['batch1-FC2-F1']]),
@@ -294,7 +295,7 @@ class SparseCrossOpTest(test.TestCase):
 
   # TODO(sibyl-Aix6ihai): Add benchmark to compare Hashed vs Non-hashed.
   def test_hashed_no_hash_key(self):
-    op = sparse_ops._sparse_cross_hashed(
+    op = sparse_ops.sparse_cross_hashed(
         [
             self._sparse_tensor([['batch1-FC1-F1']]),
             self._sparse_tensor([['batch1-FC2-F1']]),
@@ -307,7 +308,7 @@ class SparseCrossOpTest(test.TestCase):
       self._assert_sparse_tensor_equals(expected_out, sess.run(op))
 
   def test_hashed_output(self):
-    op = sparse_ops._sparse_cross_hashed(
+    op = sparse_ops.sparse_cross_hashed(
         [
             self._sparse_tensor([['batch1-FC1-F1']]),
             self._sparse_tensor([['batch1-FC2-F1']]),
@@ -326,10 +327,8 @@ class SparseCrossOpTest(test.TestCase):
     # As a result, all the crosses shouldn't collide.
     t1 = constant_op.constant([[359], [359 + 1024]])
     t2 = constant_op.constant([list(range(10)), list(range(10))])
-    cross = sparse_ops._sparse_cross_hashed(
-        [t2, t1],
-        num_buckets=1024,
-        hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
+    cross = sparse_ops.sparse_cross_hashed(
+        [t2, t1], num_buckets=1024, hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
     cross_dense = sparse_ops.sparse_tensor_to_dense(cross)
     with session.Session():
       values = cross_dense.eval()
@@ -337,7 +336,7 @@ class SparseCrossOpTest(test.TestCase):
 
   def test_hashed_3x1x2(self):
     """Tests 3x1x2 permutation with hashed output."""
-    op = sparse_ops._sparse_cross_hashed(
+    op = sparse_ops.sparse_cross_hashed(
         [
             self._sparse_tensor(
                 [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 918bbd38edfd18bf2b65aa84ef7279e1448badd3..c0b36f143d109eb28e2784b49e8fd4099b5799a6 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -438,7 +438,6 @@ class TensorArrayTest(test.TestCase):
           "Tried to read from index 3 but array size is: 3"):
         self.evaluate(ta.read(3))
 
-  @test_util.run_in_graph_and_eager_modes()
   def testTensorArrayWriteMultipleFails(self):
     with self.test_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 51aa671098905e840b7c96cd5a984887d347adf9..9dc4ec0f9625ccf399807316c9c46309432bb2e7 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -40,6 +40,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat
 
 
 class VariableScopeTest(test.TestCase):
@@ -110,6 +111,12 @@ class VariableScopeTest(test.TestCase):
         w = variable_scope.get_variable("w", [])
         self.assertEqual(w.constraint, constraint)
 
+  def testStringDefaultInitializer(self):
+    with self.test_session():
+      v = variable_scope.get_variable("string", shape=[], dtype=dtypes.string)
+      variables_lib.global_variables_initializer().run()
+      self.assertAllEqual(compat.as_bytes(v.eval()), b"")
+
   @test_util.run_in_graph_and_eager_modes()
   def testVarScopeDType(self):
     with variable_scope.variable_scope("tower2") as tower:
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 64db49c900c21d60ba2337f920d6fa2cb9ab7b5f..eda036ece4a7d74e5752e80a4a2f4e4ada1b0a38 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -20,12 +20,12 @@ from __future__ import print_function
 import copy
 
 from tensorflow.python.eager import context
-from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.keras._impl.keras.engine import base_layer
+from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
+from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -191,6 +191,16 @@ class Layer(base_layer.Layer):
       RuntimeError: If called with partioned variable regularization and
         eager execution is enabled.
     """
+    
+    def _should_add_regularizer(variable, existing_variable_set):
+      if isinstance(variable, tf_variables.PartitionedVariable):
+        for var in variable:
+          if var in existing_variable_set:
+            return False
+        return True
+      else:
+        return variable not in existing_variable_set
+
     init_graph = None
     if not context.executing_eagerly():
       default_graph = ops.get_default_graph()
@@ -233,7 +243,8 @@ class Layer(base_layer.Layer):
             getter=vs.get_variable)
 
         if regularizer:
-          if context.executing_eagerly() or variable not in existing_variables:
+          if context.executing_eagerly() or _should_add_regularizer(
+              variable, existing_variables):
             self._handle_weight_regularization(name, variable, regularizer)
 
         if init_graph is not None:
@@ -308,7 +319,7 @@ class Layer(base_layer.Layer):
       try:
         call_has_scope_arg = self._call_has_scope_arg
       except AttributeError:
-        self._call_fn_args = estimator_util.fn_args(self.call)
+        self._call_fn_args = function_utils.fn_args(self.call)
         self._call_has_scope_arg = 'scope' in self._call_fn_args
         call_has_scope_arg = self._call_has_scope_arg
       if call_has_scope_arg:
@@ -353,4 +364,3 @@ def _add_elements_to_collection(elements, collection_list):
     for element in elements:
       if element not in collection_set:
         collection.append(element)
-
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index f08b552840f5ff9144edae1cb0f90a1bc3db0f8c..ab49e37b90e183034ae7ab720fa92b06f39b2aed 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.layers import core as core_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
@@ -95,6 +96,21 @@ class BaseLayerTest(test.TestCase):
           regularizer=regularizer)
       self.assertEqual(len(layer.losses), 1)
 
+  def testReusePartitionedVaraiblesAndRegularizers(self):
+    regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
+    partitioner = partitioned_variables.fixed_size_partitioner(3)
+    for reuse in [False, True]:
+      with variable_scope.variable_scope(variable_scope.get_variable_scope(),
+                                         partitioner=partitioner,
+                                         reuse=reuse):
+        layer = base_layers.Layer(name='my_layer')
+        variable = layer.add_variable(
+            'reg_part_var', [4, 4],
+            initializer=init_ops.zeros_initializer(),
+            regularizer=regularizer)
+    self.assertEqual(
+        len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 3)
+
   def testNoEagerActivityRegularizer(self):
     with context.eager_mode():
       with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 34a1487e748e41eebae8b87b17c34d0deda8597f..267d78dbcb27392a528bf09414b857d9b1a7c2f9 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -23,7 +23,7 @@ from __future__ import print_function
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import layers as keras_layers
+from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index cdb42f5bd18292cad9d8536e88ea1c58c1d7d777..625320b48bc801d8220bff9ea50a97b2156680d4 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -22,7 +22,6 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.layers import convolutional as conv_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -34,7 +33,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-@test_util.with_c_api
 class ConvTest(test.TestCase):
 
   def testInvalidDataFormat(self):
@@ -332,7 +330,6 @@ class ConvTest(test.TestCase):
     conv_layers.conv3d(images, 32, 9, data_format='channels_first')
 
 
-@test_util.with_c_api
 class SeparableConv1DTest(test.TestCase):
 
   def testInvalidDataFormat(self):
@@ -494,7 +491,6 @@ class SeparableConv1DTest(test.TestCase):
     self.assertEqual(layer.bias_constraint, b_constraint)
 
 
-@test_util.with_c_api
 class SeparableConv2DTest(test.TestCase):
 
   def testInvalidDataFormat(self):
@@ -738,7 +734,6 @@ class SeparableConv2DTest(test.TestCase):
     self.assertEqual(layer.bias_constraint, b_constraint)
 
 
-@test_util.with_c_api
 class Conv2DTransposeTest(test.TestCase):
 
   def testInvalidDataFormat(self):
@@ -924,7 +919,6 @@ class Conv2DTransposeTest(test.TestCase):
     self.assertEqual(layer.bias_constraint, b_constraint)
 
 
-@test_util.with_c_api
 class Conv3DTransposeTest(test.TestCase):
 
   def testInvalidDataFormat(self):
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 6d8e9eac878bb2eb65bfa29e872a0576a39af662..abbacac442c5bb20feeb255d4ad3f90626c75327 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -27,7 +27,7 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
 
-from tensorflow.python.keras._impl.keras import layers as keras_layers
+from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.ops import init_ops
 from tensorflow.python.util.tf_export import tf_export
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 33284b0d695272db5a4e0d757d6f24b1930068de..d082e312e9a3750726235a0360ef466fb8915208 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -24,7 +24,7 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
 
-from tensorflow.python.keras._impl.keras import layers as keras_layers
+from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.ops import init_ops
 from tensorflow.python.util.tf_export import tf_export
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 75abe56f51f2a206ea3e5a5dad032446c150293a..c53cca3d312470c6fc22b4cca0bb9c76ed0865af 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -19,7 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras import layers as keras_layers
+from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc
index a07e305ffbe8b4c4736c3231f6d1d7872d91e04e..9df38d464ca6ad40f22b720902e1c6f127cf846d 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor.cc
@@ -145,7 +145,7 @@ Status PyBytesArrayMap(PyArrayObject* array, F f) {
   while (PyArray_ITER_NOTDONE(iter.get())) {
     auto item = tensorflow::make_safe(PyArray_GETITEM(
         array, static_cast<char*>(PyArray_ITER_DATA(iter.get()))));
-    if (!item.get()) {
+    if (!item) {
       return errors::Internal("Unable to get element from the feed - no item.");
     }
     char* ptr;
diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
index 65e2178cda498294ffc4a5066b5692132e86180f..0d5838505f2553b842168784dd98338b4992f763 100644
--- a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
@@ -72,10 +72,11 @@ struct TensorReleaser {
 
 extern PyTypeObject TensorReleaserType;
 
-static void TensorReleaser_dealloc(TensorReleaser* self) {
+static void TensorReleaser_dealloc(PyObject* pself) {
+  TensorReleaser* self = reinterpret_cast<TensorReleaser*>(pself);
   (*self->destructor)();
   delete self->destructor;
-  TensorReleaserType.tp_free(self);
+  TensorReleaserType.tp_free(pself);
 }
 
 PyTypeObject TensorReleaserType = {
@@ -84,26 +85,26 @@ PyTypeObject TensorReleaserType = {
     sizeof(TensorReleaser),           /* tp_basicsize */
     0,                                /* tp_itemsize */
     /* methods */
-    (destructor)TensorReleaser_dealloc, /* tp_dealloc */
-    nullptr,                            /* tp_print */
-    nullptr,                            /* tp_getattr */
-    nullptr,                            /* tp_setattr */
-    nullptr,                            /* tp_compare */
-    nullptr,                            /* tp_repr */
-    nullptr,                            /* tp_as_number */
-    nullptr,                            /* tp_as_sequence */
-    nullptr,                            /* tp_as_mapping */
-    nullptr,                            /* tp_hash */
-    nullptr,                            /* tp_call */
-    nullptr,                            /* tp_str */
-    nullptr,                            /* tp_getattro */
-    nullptr,                            /* tp_setattro */
-    nullptr,                            /* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
-    "Wrapped TensorFlow Tensor",        /* tp_doc */
-    nullptr,                            /* tp_traverse */
-    nullptr,                            /* tp_clear */
-    nullptr,                            /* tp_richcompare */
+    TensorReleaser_dealloc,      /* tp_dealloc */
+    nullptr,                     /* tp_print */
+    nullptr,                     /* tp_getattr */
+    nullptr,                     /* tp_setattr */
+    nullptr,                     /* tp_compare */
+    nullptr,                     /* tp_repr */
+    nullptr,                     /* tp_as_number */
+    nullptr,                     /* tp_as_sequence */
+    nullptr,                     /* tp_as_mapping */
+    nullptr,                     /* tp_hash */
+    nullptr,                     /* tp_call */
+    nullptr,                     /* tp_str */
+    nullptr,                     /* tp_getattro */
+    nullptr,                     /* tp_setattro */
+    nullptr,                     /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,          /* tp_flags */
+    "Wrapped TensorFlow Tensor", /* tp_doc */
+    nullptr,                     /* tp_traverse */
+    nullptr,                     /* tp_clear */
+    nullptr,                     /* tp_richcompare */
 };
 
 Status TF_DataType_to_PyArray_TYPE(TF_DataType tf_datatype,
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 96df15684b857166d4ecdefcc5e73a4359b65bb3..fb81798602cb900da2fca83f4547a3776f63eddb 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1897,7 +1897,7 @@ def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0):  # pyl
         and paddings_constant is not None):
       new_shape = []
       for padding, dim in zip(paddings_constant, input_shape.as_list()):
-        if padding is None or dim is None or not all(padding):
+        if padding is None or dim is None or any((x is None for x in padding)):
           new_shape.append(None)
         else:
           new_shape.append(sum(padding) + dim)
@@ -2619,6 +2619,10 @@ reverse.__doc__ = gen_array_ops.reverse_v2.__doc__
 
 # pylint: disable=redefined-builtin
 @tf_export("reverse_sequence")
+@deprecation.deprecated_args(
+    None, "seq_dim is deprecated, use seq_axis instead", "seq_dim")
+@deprecation.deprecated_args(
+    None, "batch_dim is deprecated, use batch_axis instead", "batch_dim")
 def reverse_sequence(input,
                      seq_lengths,
                      seq_axis=None,
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 306055d2025f17904e3d4b2a342659a389f167a8..cabc1e724cdb667f4d0c5059ff1d78854a45b30c 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -1169,19 +1169,35 @@ def _assert_same_base_type(items, expected_type=None):
   Raises:
     ValueError: If any types do not match.
   """
-  original_item_str = None
+  original_expected_type = expected_type
+  mismatch = False
   for item in items:
     if item is not None:
       item_type = item.dtype.base_dtype
       if not expected_type:
         expected_type = item_type
-        original_item_str = item.name if hasattr(item, 'name') else str(item)
       elif expected_type != item_type:
-        raise ValueError('%s, type=%s, must be of the same type (%s)%s.' % (
-            item.name if hasattr(item, 'name') else str(item),
-            item_type, expected_type,
-            (' as %s' % original_item_str) if original_item_str else ''))
-  return expected_type
+        mismatch = True
+        break
+  if mismatch:
+    # Loop back through and build up an informative error message (this is very
+    # slow, so we don't do it unless we found an error above).
+    expected_type = original_expected_type
+    original_item_str = None
+    for item in items:
+      if item is not None:
+        item_type = item.dtype.base_dtype
+        if not expected_type:
+          expected_type = item_type
+          original_item_str = item.name if hasattr(item, 'name') else str(item)
+        elif expected_type != item_type:
+          raise ValueError('%s, type=%s, must be of the same type (%s)%s.' % (
+              item.name if hasattr(item, 'name') else str(item),
+              item_type, expected_type,
+              (' as %s' % original_item_str) if original_item_str else ''))
+    return expected_type  # Should be unreachable
+  else:
+    return expected_type
 
 
 @tf_export('assert_same_float_dtype')
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 07d4ff7b02c70ee01fdd21f6725623b2a73705ff..ee024ce64a79de3aa326ce710b3f9daba25fb260 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -43,6 +43,7 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_logging_ops
+from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 # go/tf-wildcard-import
@@ -1191,20 +1192,18 @@ class ControlFlowState(object):
       to backprop.
     """
     loop_exits = []
-    for _, grad_state in self._map.items():
-      # pylint: disable=protected-access
+    for grad_state in self._map.values():
       for y in grad_state.forward_loop_exits:
-        if pending_count[y.op._id] == 0:
+        if pending_count[y.op] == 0:
           grad_state.pending_exits_count -= 1
-          if y.op._id not in to_ops_set:
+          if y.op not in to_ops_set:
             grad_state.unused_exits.append(y)
           if grad_state.pending_exits_count == 0:
             loop_exits.extend(grad_state.unused_exits)
       # Need to include Enters in backprop for higher-order gradients.
       for y in grad_state.forward_context.loop_enters:
-        if pending_count[y.op._id] == 0:
-          pending_count[y.op._id] = 1
-      # pylint: enable=protected-access
+        if pending_count[y.op] == 0:
+          pending_count[y.op] = 1
     return loop_exits
 
   def EnterGradWhileContext(self, op, before):
@@ -1242,8 +1241,8 @@ class ControlFlowState(object):
 
       # We need to include all exits of a loop for backprop.
       for loop_exit in grad_state.forward_loop_exits:
-        if not between_ops[loop_exit.op._id]:
-          between_ops[loop_exit.op._id] = True
+        if loop_exit.op not in between_ops:
+          between_ops.add(loop_exit.op)
           between_op_list.append(loop_exit.op)
 
   def ZerosLikeForExit(self, val):
@@ -1433,6 +1432,8 @@ def ZerosLikeOutsideLoop(op, index):
   """Create zeros_like for the specified output of an op."""
   val = op.outputs[index]
   if not util.IsSwitch(op):
+    if val.dtype == dtypes.resource:
+      return array_ops.zeros(gen_resource_variable_ops.variable_shape(val))
     return array_ops.zeros_like(val, optimize=False)
   else:
     op_ctxt = op._get_control_flow_context()
@@ -1441,6 +1442,10 @@ def ZerosLikeOutsideLoop(op, index):
       pred = op_ctxt.pred
       branch = op_ctxt.branch
       switch_val = switch(op.inputs[0], pred)[1 - branch]
+      if val.dtype == dtypes.resource:
+        with ops.control_dependencies([switch_val]):
+          return array_ops.zeros(
+              gen_resource_variable_ops.variable_shape(switch_val))
       zeros_shape = array_ops.shape_internal(switch_val, optimize=False)
       # Ensure ops created within array_ops.zeros are dominated by switch in
       # cond context.
@@ -1678,12 +1683,12 @@ class CondContext(ControlFlowContext):
       self._pivot = pivot  # The predicate tensor in this branch
       self._branch = branch  # 0 or 1 representing this branch
 
-      # Values considered to have been already seen in this context. They are
-      # not included in this context.
+      # Values considered to have been already seen in this context. pred is not
+      # included in this context.
       self._values.add(pred.name)
       self._external_values[pred.name] = pred
       self._values.add(pivot.name)
-      self._external_values[pivot.name] = pivot
+      pivot.op._set_control_flow_context(self)  # pylint: disable=protected-access
 
   def _init_from_proto(self, context_def, import_scope=None):
     """Creates a new `CondContext` from protocol buffer.
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 289df6f3016e9df6a42d694ae854b4f22fdf84f9..59bb925df0f25b3bf88112bc3eb1b13b21ace414 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -51,7 +51,6 @@ TestTuple = collections.namedtuple("TestTuple", "a b")
 SingletonTestTuple = collections.namedtuple("SingletonTestTuple", "a")
 
 
-@test_util.with_c_api
 class GroupTestCase(test_util.TensorFlowTestCase):
 
   def _StripNode(self, nd):
@@ -133,7 +132,6 @@ class GroupTestCase(test_util.TensorFlowTestCase):
         control_flow_ops.group(1, 2)
 
 
-@test_util.with_c_api
 class ShapeTestCase(test_util.TensorFlowTestCase):
 
   def testShape(self):
@@ -145,7 +143,6 @@ class ShapeTestCase(test_util.TensorFlowTestCase):
                             [constant_op.constant(1.0)], tensor).get_shape())
 
 
-@test_util.with_c_api
 class WithDependenciesTestCase(test_util.TensorFlowTestCase):
 
   def testTupleDependencies(self):
@@ -177,7 +174,6 @@ class WithDependenciesTestCase(test_util.TensorFlowTestCase):
         self.assertEquals(1, counter.eval())
 
 
-@test_util.with_c_api
 class SwitchTestCase(test_util.TensorFlowTestCase):
 
   def testIndexedSlicesWithDenseShape(self):
@@ -349,12 +345,9 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       self.assertEquals(grad_x_false.eval(), 0.)
 
 
-@test_util.with_c_api
 class CondTest(test_util.TensorFlowTestCase):
 
   def testCondTrue(self):
-    # Create new Graph and Session for each test so we pick up _USE_C_API
-    # correctly.
     with ops.Graph().as_default():
       with session.Session():
         x = constant_op.constant(2)
@@ -438,7 +431,6 @@ class CondTest(test_util.TensorFlowTestCase):
           control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
 
 
-@test_util.with_c_api
 class ContextTest(test_util.TensorFlowTestCase):
 
   def testCondContext(self):
@@ -535,7 +527,6 @@ def _raw_nested_shape(nested_shape):
 
 
 # TODO(yori): Add tests for indexed slices.
-@test_util.with_c_api
 class DataTypesTest(test_util.TensorFlowTestCase):
 
   def assertAllEqualNested(self, a, b):
@@ -885,7 +876,6 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self.assertEqual(matrix.get_shape(), tensor_shape.TensorShape([2, 2]))
 
 
-@test_util.with_c_api
 class CaseTest(test_util.TensorFlowTestCase):
 
   def testCase_withDefault(self):
@@ -947,7 +937,6 @@ class CaseTest(test_util.TensorFlowTestCase):
         sess.run(output, feed_dict={x: 4})
 
 
-@test_util.with_c_api
 class WhileLoopTestCase(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
index eee31102db57b44ee29cb04ea79aabf003603f2f..7a18986c5b03446d2b0e0a2ecd161dccdc3d70e1 100644
--- a/tensorflow/python/ops/control_flow_util.py
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -53,6 +53,11 @@ def IsSwitch(op):
   return op.type == "Switch" or op.type == "RefSwitch"
 
 
+def IsMerge(op):
+  """Return true if `op` is a Merge."""
+  return op.type == "Merge" or op.type == "RefMerge"
+
+
 def IsLoopEnter(op):
   """Returns true if `op` is an Enter."""
   return op.type == "Enter" or op.type == "RefEnter"
@@ -63,11 +68,57 @@ def IsLoopExit(op):
   return op.type == "Exit" or op.type == "RefExit"
 
 
+def IsCondSwitch(op):
+  """Return true if `op` is the Switch for a conditional."""
+  if not IsSwitch(op):
+    return False
+  if not op.outputs:
+    return False
+  # Switch nodes are not part of the cond control flow context that they
+  # represent, so consider the consumers of its outputs to determine if it is
+  # cond switch or not. A switch is a cond switch iff all its consumers are in
+  # cond contexts.
+  is_cond_switch = True
+  for o in op.outputs:
+    for c in o.consumers():
+      ctxt = c._get_control_flow_context()  # pylint: disable=protected-access
+      if IsLoopEnter(c):
+        ctxt = ctxt.outer_context
+      is_cond_switch = is_cond_switch and (ctxt is not None and
+                                           ctxt.IsCondContext())
+  return is_cond_switch
+
+
+def IsCondMerge(op):
+  """Return true if `op` is the Merge for a conditional."""
+  if not IsMerge(op):
+    return False
+  if not op.inputs:
+    return False
+  # Merge nodes are not part of the cond control flow context that they
+  # represent, so consider the inputs to the merge of to determine if it is
+  # cond merge or not: A merge is a cond merge iff all its inputs are in
+  # cond contexts.
+  is_cond_merge = True
+  for i in op.inputs:
+    ctxt = GetOutputContext(i.op)
+    is_cond_merge = is_cond_merge and ctxt is not None and ctxt.IsCondContext()
+  return is_cond_merge
+
+
 def IsLoopSwitch(op):
   """Return true if `op` is the Switch for a while loop."""
   if IsSwitch(op):
     ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
-    return ctxt and ctxt.IsWhileContext()
+    return ctxt is not None and ctxt.IsWhileContext() and not IsCondSwitch(op)
+  return False
+
+
+def IsLoopMerge(op):
+  """Return true if `op` is the Merge for a while loop."""
+  if IsMerge(op):
+    ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+    return ctxt is not None and ctxt.IsWhileContext() and not IsCondMerge(op)
   return False
 
 
diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
index 2c9f0e9a32dd3f2caa81befebc06dcc740c832cb..d7fb3f1f783cceef280e07b6110098a80011b19f 100644
--- a/tensorflow/python/ops/distributions/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -71,7 +71,7 @@ class Bernoulli(distribution.Distribution):
     Raises:
       ValueError: If p and logits are passed, or if neither are passed.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits=logits,
diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py
index 8beab99bf868cdb16dc842eae113cf55dd565131..b6978486004affbf97ee0e5da15fee7ab092eb32 100644
--- a/tensorflow/python/ops/distributions/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -150,7 +150,7 @@ class Beta(distribution.Distribution):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[concentration1, concentration0]) as name:
       self._concentration1 = self._maybe_assert_valid_concentration(
           ops.convert_to_tensor(concentration1, name="concentration1"),
@@ -321,7 +321,7 @@ class BetaWithSoftplusConcentration(Beta):
                validate_args=False,
                allow_nan_stats=True,
                name="BetaWithSoftplusConcentration"):
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[concentration1,
                                       concentration0]) as name:
       super(BetaWithSoftplusConcentration, self).__init__(
diff --git a/tensorflow/python/ops/distributions/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
index 36eee5ce78f010b072f1a71d2fbc0d8a296e3970..caceadf53a0a0816379b0d75808ec756b558e861 100644
--- a/tensorflow/python/ops/distributions/bijector_impl.py
+++ b/tensorflow/python/ops/distributions/bijector_impl.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
@@ -527,8 +528,6 @@ class Bijector(object):
       ValueError:  If a member of `graph_parents` is not a `Tensor`.
     """
     self._graph_parents = graph_parents or []
-    forward_min_event_ndims = get_static_value(forward_min_event_ndims)
-    inverse_min_event_ndims = get_static_value(inverse_min_event_ndims)
 
     if forward_min_event_ndims is None and inverse_min_event_ndims is None:
       raise ValueError("Must specify at least one of `forward_min_event_ndims` "
@@ -538,12 +537,23 @@ class Bijector(object):
     elif forward_min_event_ndims is None:
       forward_min_event_ndims = inverse_min_event_ndims
 
+    if not isinstance(forward_min_event_ndims, int):
+      raise TypeError("Expected forward_min_event_ndims to be of "
+                      "type int, got {}".format(
+                          type(forward_min_event_ndims).__name__))
+
+    if not isinstance(inverse_min_event_ndims, int):
+      raise TypeError("Expected inverse_min_event_ndims to be of "
+                      "type int, got {}".format(
+                          type(inverse_min_event_ndims).__name__))
+
     if forward_min_event_ndims < 0:
       raise ValueError("forward_min_event_ndims must be a non-negative "
                        "integer.")
     if inverse_min_event_ndims < 0:
       raise ValueError("inverse_min_event_ndims must be a non-negative "
                        "integer.")
+
     self._forward_min_event_ndims = forward_min_event_ndims
     self._inverse_min_event_ndims = inverse_min_event_ndims
     self._is_constant_jacobian = is_constant_jacobian
@@ -994,7 +1004,6 @@ class Bijector(object):
   def _reduce_jacobian_det_over_event(
       self, y, ildj, min_event_ndims, event_ndims):
     """Reduce jacobian over event_ndims - min_event_ndims."""
-    assert_static(min_event_ndims)
 
     if not self.is_constant_jacobian:
       return math_ops.reduce_sum(
@@ -1012,7 +1021,7 @@ class Bijector(object):
         axis=self._get_event_reduce_dims(min_event_ndims, event_ndims))
     # The multiplication by ones can change the inferred static shape so we try
     # to recover as much as possible.
-    event_ndims_ = get_static_value(event_ndims)
+    event_ndims_ = self._maybe_get_event_ndims_statically(event_ndims)
     if (event_ndims_ is not None and
         y.shape.ndims is not None and
         ildj.shape.ndims is not None):
@@ -1027,8 +1036,7 @@ class Bijector(object):
 
   def _get_event_reduce_dims(self, min_event_ndims, event_ndims):
     """Compute the reduction dimensions given event_ndims."""
-    assert_static(min_event_ndims)
-    event_ndims_ = get_static_value(event_ndims, np.int32)
+    event_ndims_ = self._maybe_get_event_ndims_statically(event_ndims)
 
     if event_ndims_ is not None:
       return [-index for index in range(1, event_ndims_ - min_event_ndims + 1)]
@@ -1038,8 +1046,7 @@ class Bijector(object):
 
   def _check_valid_event_ndims(self, min_event_ndims, event_ndims):
     """Check whether event_ndims is atleast min_event_ndims."""
-    assert_static(min_event_ndims)
-    event_ndims_ = get_static_value(event_ndims, np.int32)
+    event_ndims_ = self._maybe_get_event_ndims_statically(event_ndims)
     assertions = []
     if event_ndims_ is not None:
       if min_event_ndims > event_ndims_:
@@ -1051,21 +1058,15 @@ class Bijector(object):
           check_ops.assert_greater_equal(event_ndims, min_event_ndims)]
     return assertions
 
+  def _maybe_get_event_ndims_statically(self, event_ndims):
+    """Helper which returns tries to return an integer static value."""
+    event_ndims_ = distribution_util.maybe_get_static_value(event_ndims)
 
-def get_static_value(x, dtype=None):
-  """Helper which returns static value; casting when dtype is preferred."""
-  if x is None:
-    return x
-  try:
-    x_ = tensor_util.constant_value(x)
-  except TypeError:
-    x_ = x
-  if x_ is None or dtype is None:
-    return x_
-  return np.array(x_, dtype)
-
+    if isinstance(event_ndims_, np.ndarray):
+      if (event_ndims_.dtype not in (np.int32, np.int64) or
+          len(event_ndims_.shape)):
+        raise ValueError("Expected a scalar integer, got {}".format(
+            event_ndims_))
+      event_ndims_ = event_ndims_.tolist()
 
-def assert_static(x):
-  """Helper which asserts that input arg is known statically."""
-  if x is None or type(x) != type(get_static_value(x)):  # pylint: disable=unidiomatic-typecheck
-    raise TypeError("Input must be known statically.")
+    return event_ndims_
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index 8f25b1149c3c8b220fe2bea95c33854ee2f6275a..bbdc8c455af66c7c6ad1866302e84f50cf221f9b 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -182,7 +182,7 @@ class Categorical(distribution.Distribution):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits=logits,
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index eafcd5c78f7752dc3f9f5be8f8dbe7ac368fd3be..8d0d1d860bf4a7efadd8b6bf101708974b87e5d1 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -154,7 +154,7 @@ class Dirichlet(distribution.Distribution):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[concentration]) as name:
       self._concentration = self._maybe_assert_valid_concentration(
           ops.convert_to_tensor(concentration, name="concentration"),
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
index fe0ed7e07d59654b8a4c7ac989212913697c6832..3a35e0caa0f411dbb413aa5e8fe68143e0914db9 100644
--- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -191,7 +191,7 @@ class DirichletMultinomial(distribution.Distribution):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[total_count, concentration]) as name:
       # Broadcasting works because:
       # * The broadcasting convention is to prepend dimensions of size [1], and
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 3815abf72de1e2b5278706315a10899dccca4182..a6579e3246d4c2c6bf266754868cba43515e3256 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -357,7 +357,7 @@ class Distribution(_BaseDistribution):
 
   For detailed usage examples of TensorFlow Distributions shapes, see
   [this tutorial](
-  https://github.com/tensorflow/probability/blob/master/tensorflow_probability/examples/jupyter_notebooks/Understanding%20TensorFlow%20Distributions%20Shapes.ipynb)
+  https://github.com/tensorflow/probability/blob/master/tensorflow_probability/examples/jupyter_notebooks/Understanding_TensorFlow_Distributions_Shapes.ipynb)
 
   #### Parameter values leading to undefined statistics or distributions.
 
@@ -524,7 +524,8 @@ class Distribution(_BaseDistribution):
   def parameters(self):
     """Dictionary of parameters used to instantiate this `Distribution`."""
     # Remove "self", "__class__", or other special variables. These can appear
-    # if the subclass used `parameters = locals()`.
+    # if the subclass used:
+    # `parameters = distribution_util.parent_frame_arguments()`.
     return dict((k, v) for k, v in self._parameters.items()
                 if not k.startswith("__") and k != "self")
 
diff --git a/tensorflow/python/ops/distributions/exponential.py b/tensorflow/python/ops/distributions/exponential.py
index cf0e729e1a189d94381d56aedb5a753862a20962..1e08f48d529b164ddbb77d4d36ba0bd3390475b7 100644
--- a/tensorflow/python/ops/distributions/exponential.py
+++ b/tensorflow/python/ops/distributions/exponential.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import gamma
+from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -90,7 +91,7 @@ class Exponential(gamma.Gamma):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     # Even though all statistics of are defined for valid inputs, this is not
     # true in the parent class "Gamma."  Therefore, passing
     # allow_nan_stats=True
@@ -143,7 +144,7 @@ class ExponentialWithSoftplusRate(Exponential):
                validate_args=False,
                allow_nan_stats=True,
                name="ExponentialWithSoftplusRate"):
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[rate]) as name:
       super(ExponentialWithSoftplusRate, self).__init__(
           rate=nn.softplus(rate, name="softplus_rate"),
diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py
index d39f7c56d39ae1681c32bacd9771f4f2372d3f98..7ca690d9d2f8348a103adb57abe57f2ce058f17c 100644
--- a/tensorflow/python/ops/distributions/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -126,7 +126,7 @@ class Gamma(distribution.Distribution):
     Raises:
       TypeError: if `concentration` and `rate` are different dtypes.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[concentration, rate]) as name:
       with ops.control_dependencies([
           check_ops.assert_positive(concentration),
@@ -261,7 +261,7 @@ class GammaWithSoftplusConcentrationRate(Gamma):
                validate_args=False,
                allow_nan_stats=True,
                name="GammaWithSoftplusConcentrationRate"):
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[concentration, rate]) as name:
       super(GammaWithSoftplusConcentrationRate, self).__init__(
           concentration=nn.softplus(concentration,
diff --git a/tensorflow/python/ops/distributions/laplace.py b/tensorflow/python/ops/distributions/laplace.py
index 3ccfc618d1157722bbc4ce2d5f88fbcd4fb1ba10..ee3a6a40ff78fb95545c3af4eff85e84a1abd515 100644
--- a/tensorflow/python/ops/distributions/laplace.py
+++ b/tensorflow/python/ops/distributions/laplace.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import special_math
+from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -100,7 +101,7 @@ class Laplace(distribution.Distribution):
     Raises:
       TypeError: if `loc` and `scale` are of different dtype.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
@@ -217,7 +218,7 @@ class LaplaceWithSoftplusScale(Laplace):
                validate_args=False,
                allow_nan_stats=True,
                name="LaplaceWithSoftplusScale"):
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[loc, scale]) as name:
       super(LaplaceWithSoftplusScale, self).__init__(
           loc=loc,
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index ab77f5c1f815f36edb989c5b315ef90fb198b001..036ba45cccf49913c339d305e110960c988f8ccf 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -182,7 +182,7 @@ class Multinomial(distribution.Distribution):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._total_count = ops.convert_to_tensor(total_count, name="total_count")
       if validate_args:
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py
index 20d4420e91886cf84855bd585cfc4869fc674178..0620aae10d0d3bb1f902171ed5c033630520dd7a 100644
--- a/tensorflow/python/ops/distributions/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import special_math
+from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -131,7 +132,7 @@ class Normal(distribution.Distribution):
     Raises:
       TypeError: if `loc` and `scale` have different `dtype`.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
@@ -243,7 +244,7 @@ class NormalWithSoftplusScale(Normal):
                validate_args=False,
                allow_nan_stats=True,
                name="NormalWithSoftplusScale"):
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[scale]) as name:
       super(NormalWithSoftplusScale, self).__init__(
           loc=loc,
diff --git a/tensorflow/python/ops/distributions/special_math.py b/tensorflow/python/ops/distributions/special_math.py
index 1d605c5dfcca9b709a9178ccbe56619f6a92f869..31b7a36fd3ae40a106ba9f45e0cacef737ba5cb7 100644
--- a/tensorflow/python/ops/distributions/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
 import numpy as np
 
 from tensorflow.python.framework import constant_op
@@ -42,15 +41,15 @@ __all__ = [
 # then made more conservative just to be safe. (Conservative means use the
 # expansion more than we probably need to.) See `NdtrTest` in
 # special_math_test.py.
-LOGNDTR_FLOAT64_LOWER = -20
-LOGNDTR_FLOAT32_LOWER = -10
+LOGNDTR_FLOAT64_LOWER = np.array(-20, np.float64)
+LOGNDTR_FLOAT32_LOWER = np.array(-10, np.float32)
 
 # Upper bound values were chosen by examining for which values of 'x'
 # Log[cdf(x)] is 0, after which point we need to use the approximation
 # Log[cdf(x)] = Log[1 - cdf(-x)] approx -cdf(-x). We chose a value slightly
 # conservative, meaning we use the approximation earlier than needed.
-LOGNDTR_FLOAT64_UPPER = 8
-LOGNDTR_FLOAT32_UPPER = 5
+LOGNDTR_FLOAT64_UPPER = np.array(8, np.float64)
+LOGNDTR_FLOAT32_UPPER = np.array(5, np.float32)
 
 
 def ndtr(x, name="ndtr"):
@@ -91,7 +90,7 @@ def ndtr(x, name="ndtr"):
 def _ndtr(x):
   """Implements ndtr core logic."""
   half_sqrt_2 = constant_op.constant(
-      0.5 * math.sqrt(2.), dtype=x.dtype, name="half_sqrt_2")
+      0.5 * np.sqrt(2.), dtype=x.dtype, name="half_sqrt_2")
   w = x * half_sqrt_2
   z = math_ops.abs(w)
   y = array_ops.where(math_ops.less(z, half_sqrt_2),
@@ -190,18 +189,18 @@ def _ndtri(p):
 
   def _create_polynomial(var, coeffs):
     """Compute n_th order polynomial via Horner's method."""
-    if not coeffs:
-      return 0.
+    coeffs = np.array(coeffs, var.dtype.as_numpy_dtype)
+    if not coeffs.size:
+      return array_ops.zeros_like(var)
     return coeffs[0] + _create_polynomial(var, coeffs[1:]) * var
 
-  maybe_complement_p = array_ops.where(p > 1. - np.exp(-2.), 1. - p, p)
+  maybe_complement_p = array_ops.where(p > -np.expm1(-2.), 1. - p, p)
   # Write in an arbitrary value in place of 0 for p since 0 will cause NaNs
   # later on. The result from the computation when p == 0 is not used so any
   # number that doesn't result in NaNs is fine.
-  one_half = constant_op.constant(0.5, dtype=p.dtype)
   sanitized_mcp = array_ops.where(
       maybe_complement_p <= 0.,
-      array_ops.fill(array_ops.shape(p), one_half),
+      array_ops.fill(array_ops.shape(p), np.array(0.5, p.dtype.as_numpy_dtype)),
       maybe_complement_p)
 
   # Compute x for p > exp(-2): x/sqrt(2pi) = w + w**3 P0(w**2)/Q0(w**2).
@@ -216,10 +215,12 @@ def _ndtri(p):
   # arrays based on whether p < exp(-32).
   z = math_ops.sqrt(-2. * math_ops.log(sanitized_mcp))
   first_term = z - math_ops.log(z) / z
-  second_term_small_p = (_create_polynomial(1. / z, p2)
-                         / _create_polynomial(1. / z, q2)) / z
-  second_term_otherwise = (_create_polynomial(1. / z, p1)
-                           / _create_polynomial(1. / z, q1)) / z
+  second_term_small_p = (
+      _create_polynomial(1. / z, p2) /
+      _create_polynomial(1. / z, q2) / z)
+  second_term_otherwise = (
+      _create_polynomial(1. / z, p1) /
+      _create_polynomial(1. / z, q1) / z)
   x_for_small_p = first_term - second_term_small_p
   x_otherwise = first_term - second_term_otherwise
 
@@ -330,23 +331,25 @@ def _log_ndtr_lower(x, series_order):
   """Asymptotic expansion version of `Log[cdf(x)]`, appropriate for `x<<-1`."""
   x_2 = math_ops.square(x)
   # Log of the term multiplying (1 + sum)
-  log_scale = -0.5 * x_2 - math_ops.log(-x) - 0.5 * math.log(2. * math.pi)
+  log_scale = -0.5 * x_2 - math_ops.log(-x) - 0.5 * np.log(2. * np.pi)
   return log_scale + math_ops.log(_log_ndtr_asymptotic_series(x, series_order))
 
 
 def _log_ndtr_asymptotic_series(x, series_order):
   """Calculates the asymptotic series used in log_ndtr."""
+  dtype = x.dtype.as_numpy_dtype
   if series_order <= 0:
-    return 1.
+    return np.array(1, dtype)
   x_2 = math_ops.square(x)
-  even_sum = 0.
-  odd_sum = 0.
+  even_sum = array_ops.zeros_like(x)
+  odd_sum = array_ops.zeros_like(x)
   x_2n = x_2  # Start with x^{2*1} = x^{2*n} with n = 1.
   for n in range(1, series_order + 1):
+    y = np.array(_double_factorial(2 * n - 1), dtype) / x_2n
     if n % 2:
-      odd_sum += _double_factorial(2 * n - 1) / x_2n
+      odd_sum += y
     else:
-      even_sum += _double_factorial(2 * n - 1) / x_2n
+      even_sum += y
     x_2n *= x_2
   return 1. + even_sum - odd_sum
 
diff --git a/tensorflow/python/ops/distributions/student_t.py b/tensorflow/python/ops/distributions/student_t.py
index 961b07a7bdac34b76ca1a594fa3df5e97951c76b..9330b930b5140b50e2b8f060c70702bd47b2c4ae 100644
--- a/tensorflow/python/ops/distributions/student_t.py
+++ b/tensorflow/python/ops/distributions/student_t.py
@@ -157,7 +157,7 @@ class StudentT(distribution.Distribution):
     Raises:
       TypeError: if loc and scale are different dtypes.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[df, loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(df)]
                                     if validate_args else []):
@@ -349,7 +349,7 @@ class StudentTWithAbsDfSoftplusScale(StudentT):
                validate_args=False,
                allow_nan_stats=True,
                name="StudentTWithAbsDfSoftplusScale"):
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[df, scale]) as name:
       super(StudentTWithAbsDfSoftplusScale, self).__init__(
           df=math_ops.floor(math_ops.abs(df)),
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index bc321900dcbcfe16a9d4e9ee6a60c2ca3ecf396b..9392464ec11613cf318a9a86124bdba6f46ba595 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -252,7 +252,7 @@ class TransformedDistribution(distribution_lib.Distribution):
       name: Python `str` name prefixed to Ops created by this class. Default:
         `bijector.name + distribution.name`.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     name = name or (("" if bijector is None else bijector.name) +
                     distribution.name)
     with ops.name_scope(name, values=[event_shape, batch_shape]) as name:
diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py
index 087797c653bb3ee294b747dd7da8ff2896e79fc6..dfa10331e3e9d64c5d3cae2c52373edfbcda723a 100644
--- a/tensorflow/python/ops/distributions/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -102,7 +103,7 @@ class Uniform(distribution.Distribution):
     Raises:
       InvalidArgumentError: if `low >= high` and `validate_args=False`.
     """
-    parameters = locals()
+    parameters = distribution_util.parent_frame_arguments()
     with ops.name_scope(name, values=[low, high]) as name:
       with ops.control_dependencies([
           check_ops.assert_less(
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 2e067eab459050e30d220bdb7ff0d65cb9c552f7..59c89d21f9142fbbaf52b1d273873194a30a0f7f 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.util import tf_inspect
 
 
 def assert_close(
@@ -162,6 +163,30 @@ def same_dynamic_shape(a, b):
       lambda: constant_op.constant(False))
 
 
+def maybe_get_static_value(x, dtype=None):
+  """Helper which tries to return a static value.
+
+  Given `x`, extract it's value statically, optionally casting to a specific
+  dtype. If this is not possible, None is returned.
+
+  Args:
+    x: `Tensor` for which to extract a value statically.
+    dtype: Optional dtype to cast to.
+
+  Returns:
+    Statically inferred value if possible, otherwise None.
+  """
+  if x is None:
+    return x
+  try:
+    x_ = tensor_util.constant_value(x)
+  except TypeError:
+    x_ = x
+  if x_ is None or dtype is None:
+    return x_
+  return np.array(x_, dtype)
+
+
 def get_logits_and_probs(logits=None,
                          probs=None,
                          multidimensional=False,
@@ -1273,6 +1298,43 @@ def pad(x, axis, front=False, back=False, value=0, count=1, name=None):
     return x
 
 
+def parent_frame_arguments():
+  """Returns parent frame arguments.
+
+  When called inside a function, returns a dictionary with the caller's function
+  arguments. These are positional arguments and keyword arguments (**kwargs),
+  while variable arguments (*varargs) are excluded.
+
+  When called at global scope, this will return an empty dictionary, since there
+  are no arguments.
+
+  WARNING: If caller function argument names are overloaded before invoking
+  this method, then values will reflect the overloaded value. For this reason,
+  we recommend calling `parent_frame_arguments` at the beginning of the
+  function.
+  """
+  # All arguments and the names used for *varargs, and **kwargs
+  arg_names, variable_arg_name, keyword_arg_name, local_vars = (
+      tf_inspect._inspect.getargvalues(  # pylint: disable=protected-access
+          # Get the first frame of the caller of this method.
+          tf_inspect._inspect.stack()[1][0]))  # pylint: disable=protected-access
+
+  # Remove the *varargs, and flatten the **kwargs. Both are
+  # nested lists.
+  local_vars.pop(variable_arg_name, {})
+  keyword_args = local_vars.pop(keyword_arg_name, {})
+
+  final_args = {}
+  # Copy over arguments and their values. In general, local_vars
+  # may contain more than just the arguments, since this method
+  # can be called anywhere in a function.
+  for arg_name in arg_names:
+    final_args[arg_name] = local_vars.pop(arg_name)
+  final_args.update(keyword_args)
+
+  return final_args
+
+
 class AppendDocstring(object):
   """Helper class to promote private subclass docstring to public counterpart.
 
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index c8a1500e76958579f09625d98fb857a0c777c128..394ad0b1a2284ac147a09f165fb1f50d24f4cedc 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -475,7 +475,7 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
 
 @tf_export("scan")
 def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
-         swap_memory=False, infer_shape=True, name=None):
+         swap_memory=False, infer_shape=True, reverse=False, name=None):
   """scan on the list of tensors unpacked from `elems` on dimension 0.
 
   The simplest version of `scan` repeatedly applies the callable `fn` to a
@@ -487,6 +487,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
 
   Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
   of the result tensor is `[len(values)] + fn(initializer, values[0]).shape`.
+  If reverse=True, it's fn(initializer, values[-1]).shape.
 
   This method also allows multi-arity `elems` and accumulator.  If `elems`
   is a (possibly nested) list or tuple of tensors, then each of these tensors
@@ -525,12 +526,15 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     back_prop: (optional) True enables support for back propagation.
     swap_memory: (optional) True enables GPU-CPU memory swapping.
     infer_shape: (optional) False disables tests for consistent output shapes.
+    reverse: (optional) True scans the tensor last to first (instead of first
+      to last).
     name: (optional) Name prefix for the returned tensors.
 
   Returns:
     A tensor or (possibly nested) sequence of tensors.  Each tensor packs the
     results of applying `fn` to tensors unpacked from `elems` along the first
-    dimension, and the previous accumulator value(s), from first to last.
+    dimension, and the previous accumulator value(s), from first to last (or
+    last to first, if `reverse=True`).
 
   Raises:
     TypeError: if `fn` is not callable or the structure of the output of
@@ -543,6 +547,8 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     elems = np.array([1, 2, 3, 4, 5, 6])
     sum = scan(lambda a, x: a + x, elems)
     # sum == [1, 3, 6, 10, 15, 21]
+    sum = scan(lambda a, x: a + x, elems, reverse=True)
+    # sum == [22, 21, 18, 15, 11, 6]
     ```
 
     ```python
@@ -614,7 +620,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         elem_ta.unstack(elem) for elem_ta, elem in zip(elems_ta, elems_flat)]
 
     if initializer is None:
-      a_flat = [elem.read(0) for elem in elems_ta]
+      a_flat = [elem.read(n - 1 if reverse else 0) for elem in elems_ta]
       i = constant_op.constant(1)
     else:
       initializer_flat = output_flatten(initializer)
@@ -631,7 +637,8 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         for init in a_flat]
 
     if initializer is None:
-      accs_ta = [acc_ta.write(0, a) for (acc_ta, a) in zip(accs_ta, a_flat)]
+      accs_ta = [acc_ta.write(n - 1 if reverse else 0, a)
+                 for (acc_ta, a) in zip(accs_ta, a_flat)]
 
     def compute(i, a_flat, tas):
       """The loop body of scan.
@@ -656,10 +663,20 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
           elems if initializer is None else initializer, a_out)
       flat_a_out = output_flatten(a_out)
       tas = [ta.write(i, value) for (ta, value) in zip(tas, flat_a_out)]
-      return (i + 1, flat_a_out, tas)
+      if reverse:
+        next_i = i - 1
+      else:
+        next_i = i + 1
+      return (next_i, flat_a_out, tas)
 
+    if reverse:
+      initial_i = n - 1 - i
+      condition = lambda i, _1, _2: i >= 0
+    else:
+      initial_i = i
+      condition = lambda i, _1, _2: i < n
     _, _, r_a = control_flow_ops.while_loop(
-        lambda i, _1, _2: i < n, compute, (i, a_flat, accs_ta),
+        condition, compute, (initial_i, a_flat, accs_ta),
         parallel_iterations=parallel_iterations,
         back_prop=back_prop, swap_memory=swap_memory,
         maximum_iterations=n)
@@ -843,7 +860,9 @@ def _ForUsingWhile(start,
     return (i + 1, n, start, delta) + tuple(for_result) + extra_args
 
   if hostmem is not None:
-    hostmem = [(4 + _) for _ in hostmem]
+    hostmem = [0, 1, 2, 3] + [(4 + _) for _ in hostmem]
+  else:
+    hostmem = [0, 1, 2, 3]
 
   results = While(
       input_=[0, n, start, delta] + inputs + WhileBody.captured_inputs,
diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py
index 12afcd0b517d5e85112c067ccaca5693e5a4e231..94c8d7933523a315523cf7b2d34d8263785b6eeb 100644
--- a/tensorflow/python/ops/gradient_checker.py
+++ b/tensorflow/python/ops/gradient_checker.py
@@ -283,10 +283,10 @@ def compute_gradient(x,
   numbers.  For example, if `x` is complex with shape `[m]` and `y` is complex
   with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with
 
-      J[:m, :n] = d(Re y)/d(Re x)
-      J[:m, n:] = d(Im y)/d(Re x)
-      J[m:, :n] = d(Re y)/d(Im x)
-      J[m:, n:] = d(Im y)/d(Im x)
+      J[::2, ::2] = d(Re y)/d(Re x)
+      J[::2, 1::2] = d(Im y)/d(Re x)
+      J[1::2, ::2] = d(Re y)/d(Im x)
+      J[1::2, 1::2] = d(Im y)/d(Im x)
 
   Args:
     x: a tensor or list of tensors
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 1448151fef4aabee709d1a8306d820b2d7f5ff76..716b54f07cd61b3062ce7f488f5dd806915ac254 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -112,14 +112,14 @@ def _MarkReachedOps(from_ops, reached_ops):
 
   Args:
     from_ops: list of Operations.
-    reached_ops: list of booleans, indexed by operation id.
+    reached_ops: set of Operations.
   """
   queue = collections.deque()
   queue.extend(from_ops)
   while queue:
     op = queue.popleft()
-    if not reached_ops[op._id]:
-      reached_ops[op._id] = True
+    if op not in reached_ops:
+      reached_ops.add(op)
       for output in op.outputs:
         if _IsBackpropagatable(output):
           queue.extend(output.consumers())
@@ -130,7 +130,7 @@ def _GatherInputs(to_ops, reached_ops):
 
   Args:
     to_ops: list of Operations.
-    reached_ops: list of booleans, indexed by operation id.
+    reached_ops: set of Operations.
 
   Returns:
     The list of all inputs of to_ops that are in reached_ops.
@@ -142,58 +142,57 @@ def _GatherInputs(to_ops, reached_ops):
   while queue:
     op = queue.popleft()
     # We are interested in this op.
-    if reached_ops[op._id]:
+    if op in reached_ops:
       inputs.append(op)
       # Clear the boolean so we won't add the inputs again.
-      reached_ops[op._id] = False
+      reached_ops.remove(op)
       for inp in op.inputs:
         queue.append(inp.op)
   return inputs
 
 
-def _PendingCount(graph, to_ops, from_ops, colocate_gradients_with_ops):
+def _PendingCount(to_ops, from_ops, colocate_gradients_with_ops):
   """Initialize the pending count for ops between two lists of Operations.
 
-  'pending_count[op._id]' indicates the number of backprop inputs
+  'pending_count[op]' indicates the number of backprop inputs
   to this operation.
 
   Args:
-    graph: a Graph.
     to_ops: list of Operations.
     from_ops: list of Operations.
     colocate_gradients_with_ops: Python bool.  See docstring of gradients().
 
   Returns:
-    A tuple containing: (1) the subset of to_ops ids reachable from from_ops
-    by a path of zero or more backpropagatable tensors, (2) a list of integers
-    indexed by operation id, indicating the number of backprop inputs to this
-    operation, and (3) a ControlFlowState object which is not None if the ops
-    between from_ops and to_ops contain control flow loops.
+    A tuple containing: (1) the subset of to_ops reachable from from_ops by a
+    path of zero or more backpropagatable tensors, (2) a mapping from operation
+    to the number of backprop inputs to that op, and (3) a ControlFlowState
+    object which is not None if the ops between from_ops and to_ops contain
+    control flow loops.
   """
   # Mark reachable ops from from_ops.
-  reached_ops = [False] * (graph._last_id + 1)
+  reached_ops = set()
   _MarkReachedOps(from_ops, reached_ops)
-  # reached_ops[X] iff X is reachable from from_ops by a path of zero or more
+  # X in reached_ops iff X is reachable from from_ops by a path of zero or more
   # backpropagatable tensors.
 
-  reachable_to_ops = set(op._id for op in to_ops if reached_ops[op._id])  # pylint: disable=protected-access
+  reachable_to_ops = set(op for op in to_ops if op in reached_ops)
 
   # Mark between ops.
-  between_ops = [False] * (graph._last_id + 1)
+  between_ops = set()
   between_op_list = []
   queue = collections.deque()
   queue.extend(to_ops)
   while queue:
     op = queue.popleft()
     # We are interested in this op.
-    if reached_ops[op._id]:
-      between_ops[op._id] = True
+    if op in reached_ops:
+      between_ops.add(op)
       between_op_list.append(op)
       # Clear the boolean so we won't add the inputs again.
-      reached_ops[op._id] = False
+      reached_ops.remove(op)
       for inp in op.inputs:
         queue.append(inp.op)
-  # between_ops[X] iff X is on a path of zero or more backpropagatable tensors
+  # X in between_ops iff X is on a path of zero or more backpropagatable tensors
   # between from_ops and to_ops
 
   # 'loop_state' is None if there are no while loops.
@@ -201,11 +200,11 @@ def _PendingCount(graph, to_ops, from_ops, colocate_gradients_with_ops):
       between_op_list, between_ops, colocate_gradients_with_ops)
 
   # Initialize pending count for between ops.
-  pending_count = [0] * (graph._last_id + 1)
+  pending_count = collections.defaultdict(int)
   for op in between_op_list:
     for x in op.inputs:
-      if between_ops[x.op._id]:
-        pending_count[x.op._id] += 1
+      if x.op in between_ops:
+        pending_count[x.op] += 1
 
   return reachable_to_ops, pending_count, loop_state
 
@@ -297,7 +296,8 @@ def _DefaultGradYs(grad_ys,
 def _IsTrainable(tensor):
   dtype = dtypes.as_dtype(tensor.dtype)
   return dtype.base_dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
-                              dtypes.complex64, dtypes.complex128)
+                              dtypes.complex64, dtypes.complex128,
+                              dtypes.resource)
 
 
 def _IsBackpropagatable(tensor):
@@ -330,15 +330,15 @@ def _StopOps(from_ops, stop_gradient_ops, pending_count):
   should stop. Operations in the returned set will not be differentiated.
   This set is defined as the subset of `from_ops` containing ops that have
   no predecessor in `from_ops`. `pending_count` is the result of
-  `_PendingCount(g, xs, from_ops)`. An 'op' has predecessors in `from_ops`
-  iff pending_count[op._id] > 0.
+  `_PendingCount(xs, from_ops)`. An 'op' has predecessors in `from_ops`
+  iff pending_count[op] > 0.
 
   In addition, none of `stop_gradient_ops` will be differentiated.
 
   Args:
     from_ops: list of Operations.
     stop_gradient_ops: list of Operations never to backprop through.
-    pending_count: List of integers, indexed by operation id.
+    pending_count: mapping from operation to number of backprop inputs.
 
   Returns:
     The set of operations.
@@ -347,12 +347,12 @@ def _StopOps(from_ops, stop_gradient_ops, pending_count):
   for op in from_ops:
     is_stop_op = True
     for inp in op.inputs:
-      if pending_count[inp.op._id] > 0:
+      if pending_count[inp.op] > 0:
         is_stop_op = False
         break
     if is_stop_op:
-      stop_ops.add(op._id)
-  stop_ops.update(op._id for op in stop_gradient_ops)  # pylint: disable=protected-access
+      stop_ops.add(op)
+  stop_ops.update(op for op in stop_gradient_ops)
   return stop_ops
 
 
@@ -374,9 +374,7 @@ def _SymGrad(op, out_grads):
   f.name = op.type
   for k in op.node_def.attr:
     f.attr[k].CopyFrom(op.node_def.attr[k])
-  # pylint: disable=protected-access
   in_grads = functional_ops.symbolic_gradient(input=f_in, Tout=f_types, f=f)
-  # pylint: enable=protected-access
   return in_grads
 
 
@@ -417,6 +415,30 @@ def _MaybeCompile(scope, op, func, grad_fn):
     return grad_fn()
 
 
+def _RaiseNoGradWrtInitialLoopValError(op, from_ops):
+  """Raises an error if we backprop through a loop var."""
+  # Find the nearest 'to_op' reachable from 'op' to provide a more helpful error
+  # message.
+  target_op = None
+  queue = collections.deque([op])
+  visited = set()
+  while queue:
+    curr_op = queue.popleft()
+    if curr_op in visited: continue
+    visited.add(curr_op)
+    if curr_op in from_ops:
+      target_op = curr_op
+      break
+    queue.extend(t.op for t in curr_op.inputs)
+  assert target_op
+  raise ValueError(
+      "Cannot compute gradient inside while loop with respect to op '%s'. "
+      "We do not support taking the gradient wrt or through the initial value "
+      "of a loop variable. Gradients can be computed through loop invariants "
+      "or wrt the input parameters to the loop body."
+      % target_op.name)
+
+
 @tf_export("gradients")
 def gradients(ys,
               xs,
@@ -510,13 +532,23 @@ def gradients(ys,
                             gate_gradients, aggregation_method, stop_gradients)
 
 
-def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
-                     gate_gradients, aggregation_method, stop_gradients):
+def _GradientsHelper(ys,
+                     xs,
+                     grad_ys=None,
+                     name="gradients",
+                     colocate_gradients_with_ops=False,
+                     gate_gradients=False,
+                     aggregation_method=None,
+                     stop_gradients=None,
+                     src_graph=None):
   """Implementation of gradients()."""
   if context.executing_eagerly():
     raise RuntimeError("tf.gradients not supported when eager execution "
                        "is enabled. Use tf.contrib.eager.GradientTape "
                        "instead.")
+  if src_graph is None:
+    src_graph = ops.get_default_graph()
+
   ys = _AsList(ys)
   xs = _AsList(xs)
   stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
@@ -556,7 +588,7 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     from_ops = [t.op for t in xs]
     stop_gradient_ops = [t.op for t in stop_gradients]
     reachable_to_ops, pending_count, loop_state = _PendingCount(
-        ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops)
+        to_ops, from_ops, colocate_gradients_with_ops)
 
     # Iterate over the collected ops.
     #
@@ -578,12 +610,10 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     for op in to_ops:
       # 'ready' handles the case where one output gradient relies on
       # another output's gradient.
-      # pylint: disable=protected-access
-      ready = (pending_count[op._id] == 0)
-      if ready and op._id not in to_ops_set and op._id in reachable_to_ops:
-        to_ops_set.add(op._id)
+      ready = (pending_count[op] == 0)
+      if ready and op not in to_ops_set and op in reachable_to_ops:
+        to_ops_set.add(op)
         queue.append(op)
-      # pylint: enable=protected-access
 
     if loop_state:
       loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
@@ -607,12 +637,12 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
         grad_fn = None
         func_call = None
         # pylint: disable=protected-access
-        is_func_call = ops.get_default_graph()._is_function(op.type)
+        is_func_call = src_graph._is_function(op.type)
         # pylint: enable=protected-access
         has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
-        if has_out_grads and (op._id not in stop_ops):
+        if has_out_grads and (op not in stop_ops):
           if is_func_call:
-            func_call = ops.get_default_graph()._get_function(op.type)
+            func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
             # Note that __defun is not set if the graph is
             # imported. If it's set, we prefer to access the original
             # defun.
@@ -629,6 +659,21 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
                   (op.name, op.type))
         if loop_state:
           loop_state.EnterGradWhileContext(op, before=False)
+
+        # NOTE(skyewm): We don't support computing gradients wrt a loop variable
+        # unless it's within the context of a single iteration (i.e. the
+        # gradient is wrt to the loop parameter in the body function, not wrt or
+        # through the initial value). This means if we're in a while loop
+        # context, we should never see a switch node from this context.
+        # pylint: disable=protected-access
+        if (control_flow_util.IsSwitch(op) and
+            op._control_flow_context is not None and
+            op._control_flow_context.IsWhileContext() and
+            op._control_flow_context ==
+            ops.get_default_graph()._get_control_flow_context()):
+          _RaiseNoGradWrtInitialLoopValError(op, from_ops)
+        # pylint: enable=protected-access
+
         if (grad_fn or is_func_call) and has_out_grads:
           # NOTE: If _AggregatedGrads didn't compute a value for the i'th
           # output, it means that the cost does not depend on output[i],
@@ -647,7 +692,7 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
                 out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
           with ops.name_scope(op.name + "_grad"):
             # pylint: disable=protected-access
-            with ops.get_default_graph()._original_op(op):
+            with src_graph._original_op(op):
               # pylint: enable=protected-access
               if grad_fn:
                 # If grad_fn was found, do not use SymbolicGradient even for
@@ -714,13 +759,10 @@ def _HasAnyNotNoneGrads(grads, op):
 def _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state):
   """Update pending count for the inputs of op and enqueue ready ops."""
   for x in op.inputs:
-    # pylint: disable=protected-access
-    pending_count[x.op._id] -= 1
-    ready = (pending_count[x.op._id] == 0)
+    pending_count[x.op] -= 1
+    ready = (pending_count[x.op] == 0)
     if loop_state and not ready:
-      ready = (
-          pending_count[x.op._id] > 0 and control_flow_util.IsLoopSwitch(x.op))
-    # pylint: enable=protected-access
+      ready = pending_count[x.op] > 0 and control_flow_util.IsLoopSwitch(x.op)
     if ready:
       if control_flow_util.IsLoopExit(x.op):
         # if x is an exit without real gradient, defer processing them.
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index 5e8b8822efd6066594aa4d436cc4cafc97e8eb68..70d500a108f0d4633aed3602cbab6938bddcab29 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -57,11 +57,10 @@ from tensorflow.python.ops.nn_ops import bias_add
 from tensorflow.python.platform import googletest
 
 
-def _OpsBetween(graph, to_ops, from_ops):
+def _OpsBetween(to_ops, from_ops):
   """Build the list of operations between two lists of Operations.
 
   Args:
-    graph: a Graph.
     to_ops: list of Operations.
     from_ops: list of Operations.
 
@@ -72,13 +71,12 @@ def _OpsBetween(graph, to_ops, from_ops):
     TODO(touts): Think about returning an empty list if from_ops are not
     reachable from to_ops.  Presently it returns to_ops in that case.
   """
-  # List of booleans, indexed by operation id, indicating if
-  # an op is reached from the output of "input_ops".
-  reached_ops = [False] * (graph._last_id + 1)
+  # Ops that are reachable from the output of "input_ops".
+  reached_ops = set()
   # We only care to reach up to "output_ops" so we mark the
   # output ops as reached to avoid recursing past them.
   for op in to_ops:
-    reached_ops[op._id] = True
+    reached_ops.add(op)
   gradients_impl._MarkReachedOps(from_ops, reached_ops)
   between_ops = gradients_impl._GatherInputs(to_ops, reached_ops)
   between_ops.sort(key=lambda x: -x._id)
@@ -95,18 +93,18 @@ class GradientsTest(test_util.TensorFlowTestCase):
     self.assertEquals(self._OpNames(ops1), self._OpNames(ops2))
 
   def testOpsBetweenSimple(self):
-    with ops.Graph().as_default() as g:
+    with ops.Graph().as_default():
       t1 = constant(1.0)
       t2 = constant(2.0)
       t3 = array_ops.stack([t1, t2])
     # Full graph
     self._assertOpListEqual([t3.op, t2.op, t1.op],
-                            _OpsBetween(g, [t3.op], [t1.op, t2.op]))
+                            _OpsBetween([t3.op], [t1.op, t2.op]))
     # Only t1, t3.
-    self._assertOpListEqual([t3.op, t1.op], _OpsBetween(g, [t3.op], [t1.op]))
+    self._assertOpListEqual([t3.op, t1.op], _OpsBetween([t3.op], [t1.op]))
 
   def testOpsBetweenUnreachable(self):
-    with ops.Graph().as_default() as g:
+    with ops.Graph().as_default():
       t1 = constant(1.0)
       t2 = constant(2.0)
       _ = array_ops.stack([t1, t2])
@@ -114,10 +112,10 @@ class GradientsTest(test_util.TensorFlowTestCase):
       t5 = constant(2.0)
       t6 = array_ops.stack([t4, t5])
     # Elements of to_ops are always listed.
-    self._assertOpListEqual([t6.op], _OpsBetween(g, [t6.op], [t1.op]))
+    self._assertOpListEqual([t6.op], _OpsBetween([t6.op], [t1.op]))
 
   def testOpsBetweenCut(self):
-    with ops.Graph().as_default() as g:
+    with ops.Graph().as_default():
       t1 = constant(1.0)
       t2 = constant(2.0)
       t3 = array_ops.stack([t1, t2])
@@ -126,10 +124,10 @@ class GradientsTest(test_util.TensorFlowTestCase):
       t6 = constant([2.0])
       t7 = array_ops.concat([t5, t6], 0)
     self._assertOpListEqual([t7.op, t5.op, t4.op],
-                            _OpsBetween(g, [t7.op], [t4.op]))
+                            _OpsBetween([t7.op], [t4.op]))
 
   def testOpsBetweenCycle(self):
-    with ops.Graph().as_default() as g:
+    with ops.Graph().as_default():
       t1 = constant(1.0)
       t2 = constant(2.0)
       t3 = array_ops.stack([t1, t2])
@@ -138,11 +136,11 @@ class GradientsTest(test_util.TensorFlowTestCase):
       t6 = array_ops.concat([t4, t5], 0)
       t7 = array_ops.concat([t6, t3], 0)
     self._assertOpListEqual([t6.op, t4.op, t3.op],
-                            _OpsBetween(g, [t6.op], [t3.op]))
+                            _OpsBetween([t6.op], [t3.op]))
     self._assertOpListEqual([t7.op, t6.op, t5.op, t4.op, t3.op, t1.op],
-                            _OpsBetween(g, [t7.op], [t1.op, t5.op]))
+                            _OpsBetween([t7.op], [t1.op, t5.op]))
     self._assertOpListEqual([t6.op, t5.op, t4.op, t3.op, t2.op],
-                            _OpsBetween(g, [t6.op], [t2.op, t5.op]))
+                            _OpsBetween([t6.op], [t2.op, t5.op]))
 
   def testGradients(self):
     with ops.Graph().as_default():
@@ -289,10 +287,6 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertEqual(10.0, grads[1].eval())
 
   def testNoGradientForStringOutputs(self):
-    # This test can't be run twice because the TestStringOutput gradient can
-    # only be registered once. Just run with the C API enabled.
-    if not ops._USE_C_API: return
-
     with ops.Graph().as_default():
 
       def _TestOpGrad(_, float_grad, string_grad):
@@ -438,7 +432,6 @@ class GradientsTest(test_util.TensorFlowTestCase):
         np.testing.assert_allclose(a, b)
 
 
-@test_util.with_c_api
 class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
   @classmethod
@@ -528,7 +521,6 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
         f.add_to_graph(ops.Graph())
 
 
-@test_util.with_c_api
 class StopGradientTest(test_util.TensorFlowTestCase):
 
   def testStopGradient(self):
@@ -539,7 +531,6 @@ class StopGradientTest(test_util.TensorFlowTestCase):
     assert igrad is None
 
 
-@test_util.with_c_api
 class PreventGradientTest(test_util.TensorFlowTestCase):
 
   def testPreventGradient(self):
@@ -550,7 +541,6 @@ class PreventGradientTest(test_util.TensorFlowTestCase):
         _ = gradients.gradients(out, inp)
 
 
-@test_util.with_c_api
 class HessianVectorProductTest(test_util.TensorFlowTestCase):
 
   def testHessianVectorProduct(self):
@@ -579,7 +569,6 @@ class HessianVectorProductTest(test_util.TensorFlowTestCase):
       self.assertAllClose(hess_v_value, hess_v_actual)
 
 
-@test_util.with_c_api
 class HessianTest(test_util.TensorFlowTestCase):
 
   def testHessian1D(self):
@@ -668,7 +657,6 @@ class HessianTest(test_util.TensorFlowTestCase):
     self.assertAllClose(hess_value, hess_actual.reshape((m * n, m * n)))
 
 
-@test_util.with_c_api
 class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
 
   def testIndexedSlicesToTensor(self):
@@ -749,7 +737,6 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
         str(w[0].message))
 
 
-@test_util.with_c_api
 class OnlyRealGradientsTest(test_util.TensorFlowTestCase):
 
   def testRealOnly(self):
@@ -786,7 +773,6 @@ class ResourceCondTest(test_util.TensorFlowTestCase):
     self.assertTrue(None not in grads)
 
 
-@test_util.with_c_api
 class CustomGradientTest(test_util.TensorFlowTestCase):
 
   def testCustomGradientTrivial(self):
@@ -944,6 +930,21 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       # Smoke test to ensure numpy inputs are accepted
       F(x)
 
+  def testRVGradientsDynamicCond(self):
+    with self.test_session():
+      alpha = resource_variable_ops.ResourceVariable(
+          np.random.random((1,)),
+          dtype="float32")
+
+      conditional = array_ops.placeholder_with_default(True, shape=())
+      output = control_flow_ops.cond(
+          conditional, lambda: alpha * 2, lambda: alpha * 3)
+
+      g, = gradients_impl.gradients(output, alpha)
+      variables.global_variables_initializer().run()
+      self.assertAllEqual(g.eval(), [2.0])
+      self.assertAllEqual(g.eval(feed_dict={conditional: False}), [3.0])
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/image_grad.py b/tensorflow/python/ops/image_grad.py
index 9f43e3f1466d900ae6d39f3b9ef48043421cb777..102181e68b4d091872f8562e9912c1b517e39044 100644
--- a/tensorflow/python/ops/image_grad.py
+++ b/tensorflow/python/ops/image_grad.py
@@ -107,16 +107,20 @@ def _CropAndResizeGrad(op, grad):
   allowed_types = [dtypes.float16, dtypes.float32, dtypes.float64]
   if op.inputs[0].dtype in allowed_types:
     # pylint: disable=protected-access
-    grad0 = gen_image_ops.crop_and_resize_grad_image(grad,
-                                                     op.inputs[1],
-                                                     op.inputs[2],
-                                                     image_shape,
-                                                     T=op.get_attr("T"))
+    grad0 = gen_image_ops.crop_and_resize_grad_image(
+        grad, op.inputs[1], op.inputs[2], image_shape, T=op.get_attr("T"),
+        method=op.get_attr("method"))
     # pylint: enable=protected-access
   else:
     grad0 = None
 
-  grad1 = gen_image_ops.crop_and_resize_grad_boxes(grad, op.inputs[0],
-                                                   op.inputs[1], op.inputs[2])
+  # `grad0` is the gradient to the input image pixels and it
+  # has been implemented for nearest neighbor and bilinear sampling
+  # respectively. `grad1` is the gradient to the input crop boxes' coordinates.
+  # When using nearest neighbor sampling, the gradient to crop boxes'
+  # coordinates are not well defined. In practice, we still approximate
+  # grad1 using the gradient derived from bilinear sampling.
+  grad1 = gen_image_ops.crop_and_resize_grad_boxes(
+      grad, op.inputs[0], op.inputs[1], op.inputs[2])
 
   return [grad0, grad1, None, None]
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index bd5b2ae83b5dd16e15dcf87de9714c082fa10ef9..3f40e3ff7552ecaf9c81b69addb7f4a35e2301e4 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1727,7 +1727,7 @@ def sample_distorted_bounding_box(image_size,
       width / height within this range.
     area_range: An optional list of `floats`. Defaults to `[0.05, 1]`.
       The cropped area of the image must contain a fraction of the
-      supplied image within in this range.
+      supplied image within this range.
     max_attempts: An optional `int`. Defaults to `100`.
       Number of attempts at generating a cropped region of the image
       of the specified constraints. After `max_attempts` failures, return the
@@ -1772,6 +1772,7 @@ def non_max_suppression(boxes,
                         scores,
                         max_output_size,
                         iou_threshold=0.5,
+                        score_threshold=0.0,
                         name=None):
   """Greedily selects a subset of bounding boxes in descending order of score.
 
@@ -1800,6 +1801,8 @@ def non_max_suppression(boxes,
       of boxes to be selected by non max suppression.
     iou_threshold: A float representing the threshold for deciding whether boxes
       overlap too much with respect to IOU.
+    score_threshold: A float representing the threshold for deciding when to
+      remove boxes based on score.
     name: A name for the operation (optional).
 
   Returns:
@@ -1808,8 +1811,10 @@ def non_max_suppression(boxes,
   """
   with ops.name_scope(name, 'non_max_suppression'):
     iou_threshold = ops.convert_to_tensor(iou_threshold, name='iou_threshold')
-    return gen_image_ops.non_max_suppression_v2(boxes, scores, max_output_size,
-                                                iou_threshold)
+    score_threshold = ops.convert_to_tensor(
+        score_threshold, name='score_threshold')
+    return gen_image_ops.non_max_suppression_v3(boxes, scores, max_output_size,
+                                                iou_threshold, score_threshold)
 
 
 _rgb_to_yiq_kernel = [[0.299, 0.59590059,
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index f93bf0a17f31b4451cad9c5bc525d9f237f97bef..1f8d8dc4f3e7b84cea9850f5da08d8c5a189e096 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -488,9 +488,9 @@ class Orthogonal(Initializer):
 
   If the shape of the tensor to initialize is two-dimensional, it is initialized
   with an orthogonal matrix obtained from the QR decomposition of a matrix of
-  uniform random numbers. If the matrix has fewer rows than columns then the
-  output will have orthogonal rows. Otherwise, the output will have orthogonal
-  columns.
+  random numbers drawn from a normal distribution.
+  If the matrix has fewer rows than columns then the output will have orthogonal
+  rows. Otherwise, the output will have orthogonal columns.
 
   If the shape of the tensor to initialize is more than two-dimensional,
   a matrix of shape `(shape[0] * ... * shape[n - 2], shape[n - 1])`
diff --git a/tensorflow/python/ops/linalg/linear_operator_kronecker.py b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
index da959f9a1c6d0c08ffd2e82e09a269f0834159e6..1fd5073c17832f0689616f2842c33c95d186e487 100644
--- a/tensorflow/python/ops/linalg/linear_operator_kronecker.py
+++ b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
@@ -381,10 +381,6 @@ class LinearOperatorKronecker(linear_operator.LinearOperator):
       else:
         matrix_dimensions = [self.range_dimension, column_dim]
 
-      print("x: ", x)
-      print("bathc_shape:", self.batch_shape)
-      print("self.shape:", self.shape)
-      print("output: ", output)
       output.set_shape(broadcast_batch_shape.concatenate(
           matrix_dimensions))
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index 7e4fb6a6fc31960d570c78398ae211ba45a4a4a7..1b5bb9470c4406ad075f2f6d5c38661311472727 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -178,7 +178,9 @@ class LinearOperatorDerivedClassTest(test.TestCase):
       SkipTest Exception, if test_name is in self._tests_to_skip.
     """
     if test_name in self._tests_to_skip:
-      self.skipTest("%s skipped because it was added to self._tests_to_skip.")
+      self.skipTest(
+          "{} skipped because it was added to self._tests_to_skip.".format(
+              test_name))
 
   def test_to_dense(self):
     self._skip_if_tests_to_skip_contains("to_dense")
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 222b8ebc9da6b076f012f8febbd50cc3c4c86c08..8276047cb678f3d340701718156f8a1cfd6831cb 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -35,8 +35,9 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 # Assert and Print are special symbols in python, so we must
-# use an upper-case version of them.
-@tf_export("Print")
+# have an upper-case version of them.  For users with Python 3 or Python 2.7
+# with `from __future__ import print_function`, we also allow lowercase.
+@tf_export("Print", "print")
 def Print(input_, data, message=None, first_n=None, summarize=None,
           name=None):
   """Prints a list of tensors.
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 9fc545c9678e7eb33a7ad35e2a84f890885e09af..de9b3c6909ddd9c22ac4bced5ec48e4de354bd19 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -334,8 +334,11 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
 
   Args:
     labels: The ground truth output tensor. Its shape should match the shape of
-      logits. The values of the tensor are expected to be 0.0 or 1.0.
-    logits: The logits, a float tensor.
+      logits. The values of the tensor are expected to be 0.0 or 1.0. Internally
+      the {0,1} labels are converted to {-1,1} when calculating the hinge loss.
+    logits: The logits, a float tensor. Note that logits are assumed to be
+      unbounded and 0-centered. A value > 0 (resp. < 0) is considered a positive
+      (resp. negative) binary prediction.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
       be either `1`, or the same as the corresponding `losses` dimension).
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 02e07dc7b1f5fe6a671da967f6d07cef123d3d1e..563c0b3ab3f6316b89f5ea76f5d075d9f4b77eea 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -171,7 +171,9 @@ def _ProdGrad(op, grad):
   # Calculate product, leaving out the current entry
   left = math_ops.cumprod(reshaped, axis=0, exclusive=True)
   right = math_ops.cumprod(reshaped, axis=0, exclusive=True, reverse=True)
-  y = array_ops.reshape(left * right, permuted_shape)
+  # For complex inputs, the gradient is in the conjugate direction.
+  y = array_ops.reshape(math_ops.conj(left) * math_ops.conj(right),
+                        permuted_shape)
 
   # Invert the transpose and reshape operations.
   # Make sure to set the statically known shape information through a reshape.
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index 04eeb00518a3afa2e2ee36e84eee133304590779..fa47b8f9b8a0e72c5ecf814e6a80e04fb559990c 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -152,6 +152,28 @@ class ProdGradientTest(test.TestCase):
           outputs, outputs.get_shape().as_list())
       self.assertLess(error, 1e-4)
 
+  def testProdGradientComplex(self):
+    for dtype in dtypes.complex64, dtypes.complex128:
+      inputs = constant_op.constant([[1 + 3j, 2 - 1j], [3j, 4]],
+                                    dtype=dtype)
+      outputs = math_ops.reduce_prod(inputs)
+      with self.test_session():
+        error = gradient_checker.compute_gradient_error(
+            inputs, inputs.get_shape().as_list(),
+            outputs, outputs.get_shape().as_list())
+        self.assertLess(error, 1e-4)
+
+  def testProdGradientForNegativeAxisComplex(self):
+    for dtype in dtypes.complex64, dtypes.complex128:
+      inputs = constant_op.constant([[1 + 3j, 2 - 1j], [3j, 4]],
+                                    dtype=dtype)
+      outputs = math_ops.reduce_prod(inputs, -1)
+      with self.test_session():
+        error = gradient_checker.compute_gradient_error(
+            inputs, inputs.get_shape().as_list(),
+            outputs, outputs.get_shape().as_list())
+        self.assertLess(error, 1e-4)
+
 
 class SegmentMinOrMaxGradientTest(test.TestCase):
 
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 3a31ef7f8814906883665a55f7b90710fd0baf2f..118b02c6c7ffb9e1f8bca6fec20325b3965b888f 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -125,8 +125,8 @@ def abs(x, name=None):  # pylint: disable=redefined-builtin
   ```
 
   Args:
-    x: A `Tensor` or `SparseTensor` of type `float32`, `float64`, `int32`,
-      `int64`, `complex64` or `complex128`.
+    x: A `Tensor` or `SparseTensor` of type `float16`, `float32`, `float64`,
+      `int32`, `int64`, `complex64` or `complex128`.
     name: A name for the operation (optional).
 
   Returns:
@@ -430,10 +430,10 @@ def pow(x, y, name=None):  # pylint: disable=redefined-builtin
   ```
 
   Args:
-    x: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`,
-     or `complex128`.
-    y: A `Tensor` of type `float32`, `float64`, `int32`, `int64`, `complex64`,
-     or `complex128`.
+    x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`,
+     `complex64`, or `complex128`.
+    y: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, `int64`,
+     `complex64`, or `complex128`.
     name: A name for the operation (optional).
 
   Returns:
@@ -600,7 +600,7 @@ def round(x, name=None):  # pylint: disable=redefined-builtin
   ```
 
   Args:
-    x: A `Tensor` of type `float32` or `float64`.
+    x: A `Tensor` of type `float16`, `float32`, `float64`, `int32`, or `int64`.
     name: A name for the operation (optional).
 
   Returns:
@@ -1257,7 +1257,7 @@ def reduce_sum(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` has no entries, all dimensions are reduced, and a
+  If `axis` is None, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   For example:
@@ -1397,7 +1397,7 @@ def reduce_mean(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` has no entries, all dimensions are reduced, and a
+  If `axis` is None, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   For example:
@@ -1469,7 +1469,7 @@ def reduce_prod(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` has no entries, all dimensions are reduced, and a
+  If `axis` is None, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   Args:
@@ -1519,7 +1519,7 @@ def reduce_min(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` has no entries, all dimensions are reduced, and a
+  If `axis` is None, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   Args:
@@ -1568,7 +1568,7 @@ def reduce_max(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` has no entries, all dimensions are reduced, and a
+  If `axis` is None, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   Args:
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 05bcee8801259e4bc6c20c3f61cf20025ba5ea33..980c92b0d592bccc34e1fbee636ebdd39056f2fc 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -35,7 +35,6 @@ exp = np.exp
 log = np.log
 
 
-@test_util.with_c_api
 class ReduceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -70,7 +69,6 @@ class ReduceTest(test_util.TensorFlowTestCase):
       math_ops.reduce_sum(x, axis)
 
 
-@test_util.with_c_api
 class LogSumExpTest(test_util.TensorFlowTestCase):
 
   def testReduceLogSumExp(self):
@@ -150,7 +148,6 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
       self.assertEqual(-np.inf, res)
 
 
-@test_util.with_c_api
 class RoundTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -166,7 +163,6 @@ class RoundTest(test_util.TensorFlowTestCase):
         self.assertAllClose(y_tf_np, y_np, atol=1e-2)
 
 
-@test_util.with_c_api
 class ModTest(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -196,7 +192,6 @@ class ModTest(test_util.TensorFlowTestCase):
         self.assertAllClose(y_tf_np, y_np)
 
 
-@test_util.with_c_api
 class SquaredDifferenceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -210,7 +205,6 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase):
         self.assertAllClose(z, z_tf)
 
 
-@test_util.with_c_api
 class ApproximateEqualTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -242,7 +236,6 @@ class ApproximateEqualTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(z, z_tf)
 
 
-@test_util.with_c_api
 class ScalarMulTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -284,7 +277,6 @@ class ScalarMulTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(self.evaluate(x.indices), [0, 2, 5])
 
 
-@test_util.with_c_api
 class AccumulateNTest(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -304,7 +296,6 @@ class AccumulateNTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(x[0] * 6, math_ops.accumulate_n([tf_x[0]] * 6).eval())
 
 
-@test_util.with_c_api
 class AddNTest(test_util.TensorFlowTestCase):
 
   def testPartials(self):
@@ -358,7 +349,6 @@ class AddNTest(test_util.TensorFlowTestCase):
                             [g.eval() for g in add_n_grad])
 
 
-@test_util.with_c_api
 class DivAndModTest(test_util.TensorFlowTestCase):
   # TODO(aselle): Test more types before exposing new division operators.
 
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index 1508ff44ceaea4195d3311818ebb732f61f2bf87..7d6dd3fb027c9a5aa2e64156e31108b367a41ca7 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -35,7 +35,6 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
-@test_util.with_c_api
 class BatchNormalizationTest(test.TestCase):
 
   def _npBatchNorm(self, x, m, v, beta, gamma, epsilon,
@@ -341,7 +340,6 @@ class BatchNormalizationTest(test.TestCase):
                                        param_dtype=dtypes.float32, atol=0.001)
 
 
-@test_util.with_c_api
 class SufficientStatisticsTest(test.TestCase):
 
   def _npSuffStats(self, x, axes, shift, keep_dims):
@@ -401,7 +399,6 @@ class SufficientStatisticsTest(test.TestCase):
           self._testSuffStats([1, 2, 3], [0, 2], shift, keep_dims, has_shape)
 
 
-@test_util.with_c_api
 class NormalizeMomentsTest(test.TestCase):
 
   def _npNormalizeMoments(self, counts, mean_ss, variance_ss, shift):
@@ -445,7 +442,6 @@ class NormalizeMomentsTest(test.TestCase):
       self._testNormalizeMoments([2, 3], shift)
 
 
-@test_util.with_c_api
 class MomentsTest(test.TestCase):
 
   def _unweighted_moments(self, x, axes, keep_dims=False, extra_out_grads=None):
@@ -583,7 +579,6 @@ class MomentsTest(test.TestCase):
     self._testGlobalGradient(from_y="var")
 
 
-@test_util.with_c_api
 class WeightedMomentsTest(MomentsTest):
   """Tests for nn.weighted_moments.
 
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index cd07550d2ee31e9c830afa8e9c5e17deb35d1135..54b08a564b1d7da3d4e30d593c18ee3811cff3d4 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1596,12 +1596,12 @@ def leaky_relu(features, alpha=0.2, name=None):
   Returns:
     The activation value.
   """
-  with ops.name_scope(name, "LeakyRelu", [features, alpha]):
+  with ops.name_scope(name, "LeakyRelu", [features, alpha]) as name:
     features = ops.convert_to_tensor(features, name="features")
     if features.dtype.is_integer:
       features = math_ops.to_float(features)
     alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha")
-    return math_ops.maximum(alpha * features, features)
+    return math_ops.maximum(alpha * features, features, name=name)
 
 
 def _flatten_outer_dims(logits):
@@ -2100,11 +2100,10 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   Args:
     value: A 4-D `Tensor` of shape `[batch, height, width, channels]` and type
       `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
-    ksize: A 1-D int Tensor of 4 elements.
-      The size of the window for each dimension of the input tensor.
-    strides: A 1-D int Tensor of 4 elements
-      The stride of the sliding window for each dimension of the
-      input tensor.
+    ksize: A list or tuple of 4 ints. The size of the window for each dimension
+      of the input tensor.
+    strides: A list or tuple of 4 ints. The stride of the sliding window for
+      each dimension of the input tensor.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
       See the @{tf.nn.convolution$comment here}
     data_format: A string. 'NHWC' and 'NCHW' are supported.
@@ -2130,10 +2129,10 @@ def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
 
   Args:
     value: A 4-D `Tensor` of the format specified by `data_format`.
-    ksize: A 1-D int Tensor of 4 elements.  The size of the window for
+    ksize: A list or tuple of 4 ints. The size of the window for each dimension
+      of the input tensor.
+    strides: A list or tuple of 4 ints. The stride of the sliding window for
       each dimension of the input tensor.
-    strides: A 1-D int Tensor of 4 elements.  The stride of the sliding
-      window for each dimension of the input tensor.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
       See the @{tf.nn.convolution$comment here}
     data_format: A string. 'NHWC', 'NCHW' and 'NCHW_VECT_C' are supported.
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 46a5f4fae6b15766c21011ebeae5437262192df7..035b4735affbd37f9de94057eed6f7b5d9aadd6e 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -962,6 +962,16 @@ class LeakyReluTest(test_lib.TestCase):
       self.assertAllClose(
           outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol)
 
+  def testName(self):
+    np_values = np.array([-2, -1, 0, 1, 2], dtype=np.float64)
+    outputs_with_name_set = nn_ops.leaky_relu(
+        constant_op.constant(np_values),
+        name='test_relu_op')
+    self.assertEqual(outputs_with_name_set.name, 'test_relu_op:0')
+    outputs_without_name_set = nn_ops.leaky_relu(
+        constant_op.constant(np_values))
+    self.assertEqual(outputs_without_name_set.name, 'LeakyRelu:0')
+
 
 class SwishTest(test_lib.TestCase):
 
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 1e953f658fca08c1e6a3297bc23f1b98269c1dcf..e5b80200c09f867faba96329c3515f88c654a554 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -19,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
+
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import variable_pb2
 from tensorflow.python import pywrap_tensorflow
@@ -38,7 +40,7 @@ from tensorflow.python.ops import variables
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_resource_variable_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.training import checkpointable
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import compat
 
 
@@ -115,6 +117,18 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
   return handle
 
 
+@contextlib.contextmanager
+def _handle_graph(handle):
+  # Note: might have an eager tensor but not be executing eagerly when building
+  # functions.
+  if (context.executing_eagerly() or isinstance(handle, ops.EagerTensor)
+      or ops.has_default_graph()):
+    yield
+  else:
+    with handle.graph.as_default():
+      yield
+
+
 class EagerResourceDeleter(object):
   """An object which cleans up a resource handle.
 
@@ -159,7 +173,8 @@ class EagerResourceDeleter(object):
 
 def shape_safe_assign_variable_handle(handle, shape, value, name=None):
   """Helper that checks shape compatibility and assigns variable."""
-  value_tensor = ops.convert_to_tensor(value)
+  with _handle_graph(handle):
+    value_tensor = ops.convert_to_tensor(value)
   shape.assert_is_compatible_with(value_tensor.shape)
   return gen_resource_variable_ops.assign_variable_op(handle,
                                                       value_tensor,
@@ -850,8 +865,10 @@ class ResourceVariable(variables.Variable):
     # TODO(apassos): this here and below is not atomic. Consider making it
     # atomic if there's a way to do so without a performance cost for those who
     # don't need it.
-    assign_sub_op = gen_resource_variable_ops.assign_sub_variable_op(
-        self.handle, ops.convert_to_tensor(delta, dtype=self.dtype), name=name)
+    with _handle_graph(self.handle):
+      assign_sub_op = gen_resource_variable_ops.assign_sub_variable_op(
+          self.handle, ops.convert_to_tensor(delta, dtype=self.dtype),
+          name=name)
     if read_value:
       return self._lazy_read(assign_sub_op)
     return assign_sub_op
@@ -872,8 +889,10 @@ class ResourceVariable(variables.Variable):
       it will return the `Operation` that does the assignment, and when in eager
       mode it will return `None`.
     """
-    assign_add_op = gen_resource_variable_ops.assign_add_variable_op(
-        self.handle, ops.convert_to_tensor(delta, dtype=self.dtype), name=name)
+    with _handle_graph(self.handle):
+      assign_add_op = gen_resource_variable_ops.assign_add_variable_op(
+          self.handle, ops.convert_to_tensor(delta, dtype=self.dtype),
+          name=name)
     if read_value:
       return self._lazy_read(assign_add_op)
     return assign_add_op
@@ -902,30 +921,32 @@ class ResourceVariable(variables.Variable):
       it will return the `Operation` that does the assignment, and when in eager
       mode it will return `None`.
     """
-    value_tensor = ops.convert_to_tensor(value, dtype=self.dtype)
-    self._shape.assert_is_compatible_with(value_tensor.shape)
-    assign_op = gen_resource_variable_ops.assign_variable_op(
-        self.handle, value_tensor, name=name)
-    if read_value:
-      return self._lazy_read(assign_op)
+    with _handle_graph(self.handle):
+      value_tensor = ops.convert_to_tensor(value, dtype=self.dtype)
+      self._shape.assert_is_compatible_with(value_tensor.shape)
+      assign_op = gen_resource_variable_ops.assign_variable_op(
+          self.handle, value_tensor, name=name)
+      if read_value:
+        return self._lazy_read(assign_op)
     return assign_op
 
   def _strided_slice_assign(self, begin, end, strides, value, name, begin_mask,
                             end_mask, ellipsis_mask, new_axis_mask,
                             shrink_axis_mask):
-    return self._lazy_read(
-        gen_array_ops.resource_strided_slice_assign(
-            ref=self.handle,
-            begin=begin,
-            end=end,
-            strides=strides,
-            value=value,
-            name=name,
-            begin_mask=begin_mask,
-            end_mask=end_mask,
-            ellipsis_mask=ellipsis_mask,
-            new_axis_mask=new_axis_mask,
-            shrink_axis_mask=shrink_axis_mask))
+    with _handle_graph(self.handle):
+      return self._lazy_read(
+          gen_array_ops.resource_strided_slice_assign(
+              ref=self.handle,
+              begin=begin,
+              end=end,
+              strides=strides,
+              value=ops.convert_to_tensor(value, dtype=self.dtype),
+              name=name,
+              begin_mask=begin_mask,
+              end_mask=end_mask,
+              ellipsis_mask=ellipsis_mask,
+              new_axis_mask=new_axis_mask,
+              shrink_axis_mask=shrink_axis_mask))
 
   def __int__(self):
     if self.dtype != dtypes.int32 and self.dtype != dtypes.int64:
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index e94ad90dfd7fa75f6345b085b530e1eee6183035..10d576c95bc4fd3147da44ee1522dc829bcab83d 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -498,7 +498,7 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
       nested) tuple of Tensors each with dimensions `[batch_size, ...]`.
     sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
       Used to copy-through state and zero-out outputs when past a batch
-      element's sequence length.  So it's more for correctness than performance.
+      element's sequence length.  So it's more for performance than correctness.
     initial_state: (optional) An initial state for the RNN.
       If `cell.state_size` is an integer, this must be
       a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`.
@@ -1401,6 +1401,13 @@ def static_state_saving_rnn(cell,
     outputs[-1] = nest.pack_sequence_as(
         structure=last_output, flat_sequence=flat_last_output)
 
+    if state_is_tuple:
+      state = nest.pack_sequence_as(
+          structure=state,
+          flat_sequence=[array_ops.identity(s) for s in flat_state])
+    else:
+      state = array_ops.identity(state)
+
   return (outputs, state)
 
 
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 67f753485b8c30e31371c3bc1a24c335df976b5a..05723c6960af3772d9576756ee94bd19f562edd1 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -46,7 +46,7 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import checkpointable
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -979,6 +979,7 @@ class DropoutWrapper(RNNCell):
         but not `callable`.
       ValueError: if any of the keep_probs are not between 0 and 1.
     """
+    super(DropoutWrapper, self).__init__()
     assert_like_rnncell("cell", cell)
 
     if (dropout_state_filter_visitor is not None
@@ -1005,6 +1006,8 @@ class DropoutWrapper(RNNCell):
 
     # Set cell, variational_recurrent, seed before running the code below
     self._cell = cell
+    if isinstance(cell, checkpointable.CheckpointableBase):
+      self._track_checkpointable(self._cell, name="cell")
     self._variational_recurrent = variational_recurrent
     self._seed = seed
 
@@ -1151,7 +1154,10 @@ class ResidualWrapper(RNNCell):
         Defaults to calling nest.map_structure on (lambda i, o: i + o), inputs
         and outputs.
     """
+    super(ResidualWrapper, self).__init__()
     self._cell = cell
+    if isinstance(cell, checkpointable.CheckpointableBase):
+      self._track_checkpointable(self._cell, name="cell")
     self._residual_fn = residual_fn
 
   @property
@@ -1206,7 +1212,10 @@ class DeviceWrapper(RNNCell):
       cell: An instance of `RNNCell`.
       device: A device string or function, for passing to `tf.device`.
     """
+    super(DeviceWrapper, self).__init__()
     self._cell = cell
+    if isinstance(cell, checkpointable.CheckpointableBase):
+      self._track_checkpointable(self._cell, name="cell")
     self._device = device
 
   @property
@@ -1322,7 +1331,7 @@ class MultiRNNCell(RNNCell):
     return cur_inp, new_states
 
 
-class _SlimRNNCell(RNNCell):
+class _SlimRNNCell(RNNCell, checkpointable.NotCheckpointable):
   """A simple wrapper for slim.rnn_cells."""
 
   def __init__(self, cell_fn):
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 3e398db394402587c18408f136cf7ca2a6b4ee1c..c3b16a7bd5387e006aaea60b8814b1209ce01414 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -84,6 +84,8 @@ def _convert_to_sparse_tensors(sp_inputs):
 
 # pylint: disable=protected-access
 @tf_export("sparse_concat")
+@deprecation.deprecated_args(
+    None, "concat_dim is deprecated, use axis instead", "concat_dim")
 def sparse_concat(axis,
                   sp_inputs,
                   name=None,
@@ -295,7 +297,8 @@ def sparse_add(a, b, thresh=0):
                                                   a.dense_shape, b)
 
 
-def _sparse_cross(inputs, name=None):
+@tf_export("sparse.cross")
+def sparse_cross(inputs, name=None):
   """Generates sparse cross from a list of sparse and dense tensors.
 
   For example, if the inputs are
@@ -324,7 +327,11 @@ def _sparse_cross(inputs, name=None):
   return _sparse_cross_internal(inputs=inputs, hashed_output=False, name=name)
 
 
-def _sparse_cross_hashed(inputs, num_buckets=0, hash_key=None, name=None):
+_sparse_cross = sparse_cross
+
+
+@tf_export("sparse.cross_hashed")
+def sparse_cross_hashed(inputs, num_buckets=0, hash_key=None, name=None):
   """Generates hashed sparse cross from a list of sparse and dense tensors.
 
   For example, if the inputs are
@@ -368,6 +375,8 @@ def _sparse_cross_hashed(inputs, num_buckets=0, hash_key=None, name=None):
       name=name)
 
 
+_sparse_cross_hashed = sparse_cross_hashed
+
 _DEFAULT_HASH_KEY = 0xDECAFCAFFE
 
 
@@ -590,6 +599,8 @@ class KeywordRequired(object):
 
 
 @tf_export("sparse_split")
+@deprecation.deprecated_args(
+    None, "split_dim is deprecated, use axis instead", "split_dim")
 def sparse_split(keyword_required=KeywordRequired(),
                  sp_input=None,
                  num_split=None,
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 9f58c6a476c34df4624e2be2d9ac442bada212fa..ae79c0194954a052db799d7a00ce1ddc584ea6ed 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -39,6 +39,8 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
 
+# Expose regex_full_match in strings namespace
+tf_export("strings.regex_full_match")(regex_full_match)
 
 @tf_export("string_split")
 def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=invalid-name
@@ -101,7 +103,7 @@ def _reduce_join_reduction_dims(x, axis, reduction_indices):
     return axis
   else:
     # Fast path: avoid creating Rank and Range ops if ndims is known.
-    if isinstance(x, ops.Tensor) and x.get_shape().ndims is not None:
+    if x.get_shape().ndims is not None:
       return constant_op.constant(
           np.arange(x.get_shape().ndims - 1, -1, -1), dtype=dtypes.int32)
 
@@ -115,10 +117,11 @@ def reduce_join(inputs, axis=None,
                 separator="",
                 name=None,
                 reduction_indices=None):
+  inputs_t = ops.convert_to_tensor(inputs)
   reduction_indices = _reduce_join_reduction_dims(
-      inputs, axis, reduction_indices)
+      inputs_t, axis, reduction_indices)
   return gen_string_ops.reduce_join(
-      inputs=inputs,
+      inputs=inputs_t,
       reduction_indices=reduction_indices,
       keep_dims=keep_dims,
       separator=separator,
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 9b6b8c508fcd7e73e6259d14b466892544ec65d7..355b0d961e2105bf19105dbc6f8a9ddfc41c0d30 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -26,7 +26,7 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import checkpointable
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.deprecation import deprecated
@@ -295,42 +295,6 @@ class Template(checkpointable.CheckpointableBase):
     # which is not the same as whether the scope has been created.
     self._variables_created = False
 
-  @property
-  def _checkpoint_dependencies(self):
-    """Sanity checking for object-based saving.
-
-    Does not override Checkpointable dependency tracking, but checks that
-    variables accessible through Checkpointable dependencies on other `Template`
-    objects include all of the variable_scope-filtered `Template.variables`.
-
-    Returns:
-      A list of checkpointable.CheckpointableReference objects.
-    Raises:
-      ValueError: If this object is not compatible with object-based saving.
-    """
-    dependencies = super(Template, self)._checkpoint_dependencies
-    dependency_variables = []
-    for _, dependency in dependencies:
-      if isinstance(dependency, Template):
-        dependency_variables.extend(dependency.variables)
-      else:
-        dependency_variables.append(dependency)
-    dependency_variables = set(dependency_variables)
-    not_included_variables = []
-    for expected_variable in sorted(self.variables, key=lambda v: v.name):
-      if expected_variable not in dependency_variables:
-        not_included_variables.append(expected_variable)
-    if not_included_variables:
-      # Trying to save a Template which improperly tracks its variables.
-      raise ValueError(
-          ("The Template '%s' references variables which are not included via "
-           "object-based dependency tracking. Most likely a custom "
-           "getter/creator was registered which does not call Template's "
-           "custom variable creator (which is responsible for tracking "
-           "dependencies).\n\nExpected these variables to be dependencies: %s")
-          % (self, not_included_variables))
-    return dependencies
-
   def _checkpointable_custom_creator(self, next_creator, name, initial_value,
                                      checkpointable_parent=None, **kwargs):
     """A variable creation hook which adds Checkpointable dependencies.
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index d2f45ce37bbbbb58eb35aaeccd4c9497ee498f41..cc92da4fd7afd49d0dd80bd859d7393f2761303f 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import weakref
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -395,69 +396,8 @@ class _GraphTensorArray(object):
 # pylint: enable=protected-access
 
 
-# pylint: disable=protected-access
-def _eager_write_no_copy(ta, index, value):
-  """Writes value into an _EagerTensorArray without creating a new TensorArray.
-
-  Args:
-    ta: _EagerTensorArray into which to write value.
-    index: 0-D.  int32 scalar with the index to write to.
-    value: N-D.  Tensor of type `dtype`.  The Tensor to write to this index.
-
-  Raises:
-    errors_impl.AlreadyExistsError: attempting to overwrite an entry.
-    errors_impl.InvalidArgumentError: value dtype does not match `ta`'s dtype.
-    errors_impl.OutOfRangeError: `index` is out of bounds.
-    ValueError: shape of `value` is not consistent with inferred shape.
-  """
-
-  if isinstance(index, ops.EagerTensor):
-    index = index.numpy()
-
-  if index < 0:
-    raise errors_impl.OutOfRangeError(
-        None, None,
-        "Writing to negative indices (index %d) is not allowed." % index)
-
-  tensor_array = ta._tensor_array
-  size = len(tensor_array)
-  if index >= size:
-    if not ta._dynamic_size:
-      raise errors_impl.OutOfRangeError(
-          None, None,
-          "Tried to write to index %d but array is not resizeable and size "
-          "is: %d" % (index, size))
-    tensor_array.extend([None for _ in range(index - size + 1)])
-
-  if not isinstance(value, ops.EagerTensor):
-    value = constant_op.constant(value)
-
-  if ta._infer_shape:
-    if ta._element_shape is None:
-      ta._element_shape = value.shape
-    elif ta._element_shape != value.shape:
-      raise ValueError("Incompatible shape for value (%s), expected (%s)" %
-                       (value.shape.as_list(), ta._element_shape.as_list()))
-
-  if ta._dtype != value.dtype:
-    raise errors_impl.InvalidArgumentError(
-        None, None,
-        "TensorArray dtype is %s but Op is trying to write dtype %s" %
-        (ta._dtype.name, value.dtype.name))
-
-  if ta._tensor_array[index] is not None:
-    raise errors_impl.AlreadyExistsError(
-        None, None,
-        "Could not write to TensorArray index %d because it has already been "
-        "written to." % index)
-
-  tensor_array[index] = value
-
-# pylint: enable=protected-access
-
-
 class _EagerTensorArray(object):
-  """Eager-mode implementation of TensorArray.
+  """Eager-compatible implementation of TensorArray.
   """
 
   def __init__(self,
@@ -472,7 +412,7 @@ class _EagerTensorArray(object):
                element_shape=None,
                colocate_with_first_write_call=True,
                name=None):
-    """Constructs an Eager mode TensorArray.
+    """Constructs a TensorArray compatible with eager execution.
 
     Args:
       dtype: (required) data type of the TensorArray.
@@ -495,16 +435,19 @@ class _EagerTensorArray(object):
       ValueError: handle or flow are supplied, or if size is not supplied.
     """
 
-    del (flow, tensor_array_name, name)  # not meaningful in Eager
+    del (flow, tensor_array_name, name)  # Unused.
 
     if handle is not None:
-      raise ValueError("TensorArray handles are not supported in Eager mode.")
+      raise ValueError("TensorArray handles are not supported when eager "
+                       "execution is enabled.")
     if size is None:
-      raise ValueError("Size must be declared for TensorArrays in Eager mode.")
+      raise ValueError("Size must be declared for TensorArrays when eager "
+                       "execution is enabled.")
 
-    # These attributes are not meaningful in Eager, but some library functions
-    # (e.g., those in control_flow_ops.py) access them to create new tensor
-    # arrays; as such, we define them for the sake of compatibility.
+    # These attributes are not meaningful when eager is enabled, but some
+    # library functions (e.g., those in control_flow_ops.py) access them to
+    # create new tensor arrays; as such, we define them for the sake of
+    # compatibility.
     self._handle = None
     # we assign a dummy value to _flow in case other code assumes it to be
     # a Tensor
@@ -525,7 +468,7 @@ class _EagerTensorArray(object):
 
   @property
   def flow(self):
-    """Flows are not meaningful in Eager; this exists for compatibility."""
+    """For compatibility; flows are not meaningful when eager is enabled."""
     return self._flow
 
   @property
@@ -534,42 +477,22 @@ class _EagerTensorArray(object):
 
   @property
   def handle(self):
-    """Handles are not meaningful in Eager; this exists for compatibility."""
+    """For compatibility; handles are not meaningful when eager is enabled."""
     return self._handle
 
-  def _identity_without_array(self):
-    """Returns a new TensorArray with the same properties as this Eager one.
-
-    NB: Does not set the underlying _tensor_array attribute.
-    """
-    ta = TensorArray(
-        dtype=self._dtype,
-        size=len(self._tensor_array),
-        dynamic_size=self._dynamic_size,
-        clear_after_read=self._clear_after_read,
-        handle=self._handle,
-        flow=self._flow,
-        infer_shape=self._infer_shape,
-        element_shape=self._element_shape,
-        colocate_with_first_write_call=self._colocate_with_first_write_call)
-    ta._implementation._previously_read_indices = self._previously_read_indices  # pylint: disable=protected-access
-    return ta
-
   def identity(self):
     """See TensorArray."""
-    ta = self._identity_without_array()
-    ta._implementation._tensor_array = [t for t in self._tensor_array]  # pylint: disable=protected-access
-    return ta
+    return self.parent()
 
   def grad(self, source, flow=None, name=None):
     raise NotImplementedError(
-        "TensorArray.grad is not supported in Eager mode; Eager's gradient "
-        "implementation does not use/need this function to compute gradients "
-        "of operations that use TensorArrays.")
+        "TensorArray.grad is not supported when executing eagerly; eager's "
+        "gradient implementation does not use/need this function to compute "
+        "gradients of operations that use TensorArrays.")
 
   def read(self, index, name=None):
     """See TensorArray."""
-    del name  # not meaningful in Eager mode
+    del name  # not meaningful when executing eagerly.
 
     if isinstance(index, ops.EagerTensor):
       index = index.numpy()
@@ -600,12 +523,58 @@ class _EagerTensorArray(object):
       self._previously_read_indices.append(index)
     return tensor
 
+  def _write(self, index, value):
+    """Writes `value` into index named by `index`.
+
+    Args:
+      index: 0-D.  int32 scalar with the index to write to.
+      value: N-D.  Tensor of type `dtype`.  The `Tensor` to write to `index`.
+
+    Raises:
+      errors_impl.InvalidArgumentError: `value` dtype does not match dtype.
+      errors_impl.OutOfRangeError: `index` is out of bounds.
+      ValueError: shape of `value` is not consistent with inferred shape.
+    """
+
+    if isinstance(index, ops.EagerTensor):
+      index = index.numpy()
+
+    if index < 0:
+      raise errors_impl.OutOfRangeError(
+          None, None,
+          "Writing to negative indices (index %d) is not allowed." % index)
+
+    size = len(self._tensor_array)
+    if index >= size:
+      if not self._dynamic_size:
+        raise errors_impl.OutOfRangeError(
+            None, None,
+            "Tried to write to index %d but array is not resizeable and size "
+            "is: %d" % (index, size))
+      self._tensor_array.extend([None for _ in range(index - size + 1)])
+
+    if not isinstance(value, ops.EagerTensor):
+      value = constant_op.constant(value)
+
+    if self._infer_shape:
+      if self._element_shape is None:
+        self._element_shape = value.shape
+      elif self._element_shape != value.shape:
+        raise ValueError("Incompatible shape for value (%s), expected (%s)" %
+                         (value.shape.as_list(), self._element_shape.as_list()))
+
+    if self._dtype != value.dtype:
+      raise errors_impl.InvalidArgumentError(
+          None, None,
+          "TensorArray dtype is %s but Op is trying to write dtype %s" %
+          (self._dtype.name, value.dtype.name))
+    self._tensor_array[index] = value
+
   def write(self, index, value, name=None):
     """See TensorArray."""
-    del name  # not meaningful in Eager mode
-    ta = self.identity()
-    _eager_write_no_copy(ta._implementation, index, value)  # pylint: disable=protected-access
-    return ta
+    del name  # not meaningful when executing eagerly.
+    self._write(index, value)
+    return self.parent()
 
   def _maybe_zero(self, ix):
     val = self._tensor_array[ix]
@@ -623,7 +592,7 @@ class _EagerTensorArray(object):
 
   def gather(self, indices, name=None):
     """See TensorArray."""
-    del name  # not meaningful in Eager mode
+    del name  # not meaningful when executing eagerly.
     return array_ops.stack([self._maybe_zero(i) for i in indices.numpy()])
 
   def concat(self, name=None):
@@ -651,17 +620,15 @@ class _EagerTensorArray(object):
       raise ValueError(
           "Cannot unstack %d tensors into a TensorArray of static size %d" %
           (len(tensors), len(self._tensor_array)))
-    ta = self._identity_without_array()
-    ta._implementation._tensor_array = tensors  # pylint: disable=protected-access
-    return ta
+    self._tensor_array = tensors
+    return self.parent()
 
   def scatter(self, indices, value, name=None):
     """See TensorArray."""
-    del name  # unused in Eager
-    ta = self.identity()
+    del name  # not meaningful when executing eagerly.
     for index, val in zip(indices.numpy(), array_ops.unstack(value)):
-      _eager_write_no_copy(ta._implementation, index, val)  # pylint: disable=protected-access
-    return ta
+      self._write(index, val)  # pylint: disable=protected-access
+    return self.parent()
 
   def split(self, value, lengths, name=None):
     """See TensorArray."""
@@ -690,20 +657,17 @@ class _EagerTensorArray(object):
           "dynamically resizeable" % (len(self._tensor_array),
                                       lengths.shape[0]))
     else:
-      ta = self._identity_without_array()
-      tensor_array = array_ops.split(value, lengths, name=name)
-      ta._implementation._tensor_array = tensor_array  # pylint: disable=protected-access
-      return ta
+      self._tensor_array = array_ops.split(value, lengths, name=name)
+      return self.parent()
 
   def size(self, name=None):
     """See TensorArray."""
-    del name  # not meaningful in Eager mode
+    del name  # not meaningful when executing eagerly.
     return constant_op.constant(len(self._tensor_array))
 
   def close(self, name=None):
-    del name  # not meaningful in Eager mode
+    del name  # not meaningful when executing eagerly.
     del self._tensor_array[:]
-    return
 
 
 # TensorArray is designed to hide an underlying implementation object
@@ -789,6 +753,8 @@ class TensorArray(object):
         colocate_with_first_write_call=colocate_with_first_write_call,
         name=name)
 
+    self._implementation.parent = weakref.ref(self)
+
   @property
   def flow(self):
     """The flow `Tensor` forcing ops leading to this TensorArray state."""
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index adb0f59948a9be1a6fbc717151ccde8622983c21..8d93d24b149a2fd27b956e73d9e866b61ca97287 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -32,7 +32,6 @@ from six import iteritems
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
-from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -41,6 +40,7 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import function_utils
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
@@ -422,7 +422,7 @@ class _VariableStore(object):
           "use_resource": use_resource,
       }
       # `fn_args` can handle functions, `functools.partial`, `lambda`.
-      if "constraint" in estimator_util.fn_args(custom_getter):
+      if "constraint" in function_utils.fn_args(custom_getter):
         custom_getter_kwargs["constraint"] = constraint
       return custom_getter(**custom_getter_kwargs)
     else:
@@ -840,7 +840,8 @@ class _VariableStore(object):
       initializing_from_value = False
     # If dtype is DT_INT/DT_UINT, provide a default value `zero`
     # If dtype is DT_BOOL, provide a default value `FALSE`
-    elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
+    elif (dtype.is_integer or dtype.is_unsigned or dtype.is_bool
+          or dtype == dtypes.string):
       initializer = init_ops.zeros_initializer()
       initializing_from_value = False
     # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
@@ -1364,7 +1365,9 @@ Args:
   name: The name of the new or existing variable.
   shape: Shape of the new or existing variable.
   dtype: Type of the new or existing variable (defaults to `DT_FLOAT`).
-  initializer: Initializer for the variable if one is created.
+  initializer: Initializer for the variable if one is created. Can either be
+    an initializer object or a Tensor. If it's a Tensor, its shape must be known
+    unless validate_shape is False.
   regularizer: A (Tensor -> Tensor or None) function; the result of
     applying it on a newly created variable will be added to the collection
     @{tf.GraphKeys.REGULARIZATION_LOSSES} and can be used for regularization.
@@ -1380,7 +1383,8 @@ Args:
     partitions for each axis (currently only one axis can be partitioned).
   validate_shape: If False, allows the variable to be initialized with a
       value of unknown shape. If True, the default, the shape of initial_value
-      must be known.
+      must be known. For this to be used the initializer must be a Tensor and
+      not an initializer object.
   use_resource: If False, creates a regular Variable. If true, creates an
     experimental ResourceVariable instead with well-defined semantics.
     Defaults to False (will later change to True). When eager execution is
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index c646f795896f0abfce3eb9a57cadc27299714023..294ee0e32831928d34d6bae5668898609a37dc59 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -29,7 +29,7 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import checkpointable
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import compat
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.deprecation import deprecated
diff --git a/tensorflow/python/platform/base.i b/tensorflow/python/platform/base.i
index 478dd46f7e6965f8727e5741f2ccdfdc69247980..2e06f26fa4c43b934a1c7ff08eba6c9b755f8085 100644
--- a/tensorflow/python/platform/base.i
+++ b/tensorflow/python/platform/base.i
@@ -233,7 +233,7 @@ _COPY_TYPEMAPS(unsigned int, mode_t);
 // Typemaps to automatically raise a Python exception from bad output TF_Status.
 // TODO(b/77295559): expand this to all TF_Status* output params and deprecate
 // raise_exception_on_not_ok_status (currently it only affects the C API).
-%typemap(in, numinputs=0) TF_Status* status (TF_Status* status) {
+%typemap(in, numinputs=0) TF_Status* status {
   $1 = TF_NewStatus();
 }
 
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 75580fc63083451eb0ec7b366a2ea3ebd932e211..9e49188c1ef353d345c97ea0295aa1a68283605e 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -232,7 +232,12 @@ class PrintModelAnalysisTest(test.TestCase):
 
         self.assertLess(0, tfprof_node.total_exec_micros)
         self.assertEqual(2844, tfprof_node.total_parameters)
-        self.assertLess(145660, tfprof_node.total_float_ops)
+        #The graph is modifed when MKL is enabled,total_float_ops will
+        #be different
+        if test_util.IsMklEnabled():
+          self.assertLess(101600, tfprof_node.total_float_ops)
+        else:
+          self.assertLess(145660, tfprof_node.total_float_ops)
         self.assertEqual(8, len(tfprof_node.children))
         self.assertEqual('_TFProfRoot', tfprof_node.name)
         self.assertEqual(
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 5ee55301df986998b22b8b57b5f01b1f6b4918ac..5f1fafb9dcaa441b284dce6b57eae1227ebeeaa1 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -42,6 +42,7 @@ limitations under the License.
 %rename("%s") TFE_Py_RecordGradient;
 %rename("%s") TFE_Py_UID;
 %rename("%s") TFE_Py_TapeSetNew;
+%rename("%s") TFE_Py_TapeSetAdd;
 %rename("%s") TFE_Py_TapeSetRemove;
 %rename("%s") TFE_Py_TapeSetStopOnThread;
 %rename("%s") TFE_Py_TapeSetRestartOnThread;
@@ -152,9 +153,12 @@ limitations under the License.
       if (EagerTensor_CheckExact(elem)) {
         (*$1)[i] = EagerTensor_Handle(elem);
       } else {
-        SWIG_exception_fail(SWIG_TypeError,
-                            "provided list of inputs contains objects other "
-                            "than 'EagerTensor'");
+        SWIG_exception_fail(
+            SWIG_TypeError,
+            tensorflow::strings::StrCat(
+                "provided list of inputs contains objects other "
+                "than 'EagerTensor'. Item ",
+                i, " is ", elem->ob_type->tp_name).c_str());
       }
     }
   }
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 01903ae596b84b36085f89557d2b65fc659ba40f..8f1d5a099f8dce262ae902230a010e532c279e3e 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -169,6 +169,25 @@ class SavedModelBuilder(object):
         raise TypeError("main_op needs to be an Operation: %r" % main_op)
       ops.add_to_collection(constants.MAIN_OP_KEY, main_op)
 
+  def _add_train_op(self, train_op):
+    """Add train op to the SavedModel.
+
+    Note that this functionality is in development, and liable to be
+    moved elsewhere.
+
+    Args:
+      train_op: Op or group of ops that are used for training. These are
+        stored as a collection with key TRAIN_OP_KEY, but not executed.
+
+    Raises:
+      TypeError if Train op is not of type `Operation`.
+    """
+    if train_op is not None:
+      if (not isinstance(train_op, ops.Tensor) and
+          not isinstance(train_op, ops.Operation)):
+        raise TypeError("train_op needs to be a Tensor or Op: %r" % train_op)
+      ops.add_to_collection(constants.TRAIN_OP_KEY, train_op)
+
   def _tag_and_add_meta_graph(self, meta_graph_def, tags, signature_def_map):
     """Tags the meta graph def and adds it to the SavedModel.
 
@@ -239,6 +258,20 @@ class SavedModelBuilder(object):
         for outputs_key in outputs:
           self._validate_tensor_info(outputs[outputs_key])
 
+  def _add_collections(
+      self, assets_collection, legacy_init_op, main_op, train_op):
+    """Add asset and op collections to be saved."""
+    # Save asset files and write them to disk, if any.
+    self._save_and_write_assets(assets_collection)
+
+    if main_op is None:
+      # Add legacy init op to the SavedModel.
+      self._maybe_add_legacy_init_op(legacy_init_op)
+    else:
+      self._add_main_op(main_op)
+
+    self._add_train_op(train_op)
+
   def add_meta_graph(self,
                      tags,
                      signature_def_map=None,
@@ -286,14 +319,8 @@ class SavedModelBuilder(object):
     # properly populated.
     self._validate_signature_def_map(signature_def_map)
 
-    # Save asset files and write them to disk, if any.
-    self._save_and_write_assets(assets_collection)
-
-    if main_op is None:
-      # Add legacy init op to the SavedModel.
-      self._maybe_add_legacy_init_op(legacy_init_op)
-    else:
-      self._add_main_op(main_op)
+    # Add assets and ops
+    self._add_collections(assets_collection, legacy_init_op, main_op, None)
 
     # Initialize a saver to generate a sharded output for all saveables in the
     # current scope.
@@ -352,6 +379,7 @@ class SavedModelBuilder(object):
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+
     """
     # pylint: enable=line-too-long
     if self._has_saved_variables:
@@ -363,8 +391,8 @@ class SavedModelBuilder(object):
     # properly populated.
     self._validate_signature_def_map(signature_def_map)
 
-    # Save asset files and write them to disk, if any.
-    self._save_and_write_assets(assets_collection)
+    # Add assets and ops
+    self._add_collections(assets_collection, legacy_init_op, main_op, None)
 
     # Create the variables sub-directory, if it does not exist.
     variables_dir = os.path.join(
@@ -377,12 +405,6 @@ class SavedModelBuilder(object):
         compat.as_text(variables_dir),
         compat.as_text(constants.VARIABLES_FILENAME))
 
-    if main_op is None:
-      # Add legacy init op to the SavedModel.
-      self._maybe_add_legacy_init_op(legacy_init_op)
-    else:
-      self._add_main_op(main_op)
-
     # Initialize a saver to generate a sharded output for all saveables in the
     # current scope.
     saver = tf_saver.Saver(
diff --git a/tensorflow/python/saved_model/constants.py b/tensorflow/python/saved_model/constants.py
index 34206c6f6d49f1901cacd7a8cbd632968b862ad6..61c6ffbd0d11ef48c6dfb8d14a4328df7f7c5df5 100644
--- a/tensorflow/python/saved_model/constants.py
+++ b/tensorflow/python/saved_model/constants.py
@@ -41,6 +41,10 @@ MAIN_OP_KEY = "saved_model_main_op"
 tf_export("saved_model.constants.MAIN_OP_KEY").export_constant(
     __name__, "MAIN_OP_KEY")
 
+# CollectionDef key for the SavedModel train op.
+# Not exported while export_all_saved_models is in contrib.
+TRAIN_OP_KEY = "saved_model_train_op"
+
 # Schema version for SavedModel.
 SAVED_MODEL_SCHEMA_VERSION = 1
 tf_export("saved_model.constants.SAVED_MODEL_SCHEMA_VERSION").export_constant(
@@ -65,3 +69,5 @@ tf_export("saved_model.constants.VARIABLES_DIRECTORY").export_constant(
 VARIABLES_FILENAME = "variables"
 tf_export("saved_model.constants.VARIABLES_FILENAME").export_constant(
     __name__, "VARIABLES_FILENAME")
+
+
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 804255375e7c5215597a5dcca02f3b32f2c0a497..1b83d60df926f727be2469a2ea1409bc0c59d6c5 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -53,12 +53,9 @@ def tearDownModule():
   file_io.delete_recursively(test.get_temp_dir())
 
 
-@test_util.with_c_api
 class SavedModelTest(test.TestCase):
 
   def _get_export_dir(self, label):
-    if ops._USE_C_API:
-      label += "_c_api"
     return os.path.join(test.get_temp_dir(), label)
 
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
@@ -734,6 +731,96 @@ class SavedModelTest(test.TestCase):
         builder.add_meta_graph_and_variables(
             sess, ["foo"], legacy_init_op=legacy_init_op)
 
+  def testTrainOp(self):
+    export_dir = self._get_export_dir("test_train_op")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      # Add `v1` and `v2` variables to the graph.
+      v1 = variables.Variable(1, name="v1")
+      ops.add_to_collection("v", v1)
+      v2 = variables.Variable(2, name="v2")
+      ops.add_to_collection("v", v2)
+
+      sess.run(variables.global_variables_initializer())
+      train_op = state_ops.assign_add(v1, v2)
+
+      sess.run(train_op)
+      # TODO(karmel): remove explicit call when in the public method.
+      builder._add_train_op(train_op)
+      builder.add_meta_graph_and_variables(sess, ["foo"])
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      loader.load(sess, ["foo"], export_dir)
+      self.assertEqual(3, ops.get_collection("v")[0].eval())
+      self.assertEqual(2, ops.get_collection("v")[1].eval())
+      self.assertIsInstance(
+          ops.get_collection(constants.TRAIN_OP_KEY)[0], ops.Tensor)
+
+  def testTrainOpGroup(self):
+    export_dir = self._get_export_dir("test_train_op_group")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      # Add `v1` and `v2` variables to the graph.
+      v1 = variables.Variable(1, name="v1")
+      ops.add_to_collection("v", v1)
+      v2 = variables.Variable(2, name="v2")
+      ops.add_to_collection("v", v2)
+
+      sess.run(variables.global_variables_initializer())
+      train_op = control_flow_ops.group()
+
+      sess.run(train_op)
+      # TODO(karmel): remove explicit call when in the public method.
+      builder._add_train_op(train_op)
+      builder.add_meta_graph_and_variables(sess, ["foo"])
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      loader.load(sess, ["foo"], export_dir)
+      self.assertEqual(1, ops.get_collection("v")[0].eval())
+      self.assertEqual(2, ops.get_collection("v")[1].eval())
+      self.assertIsInstance(
+          ops.get_collection(constants.TRAIN_OP_KEY)[0], ops.Operation)
+
+  def testTrainOpAfterVariables(self):
+    export_dir = self._get_export_dir("test_train_op_after_variables")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      # Add `v1` and `v2` variables to the graph.
+      v1 = variables.Variable(1, name="v1")
+      ops.add_to_collection("v", v1)
+      v2 = variables.Variable(2, name="v2")
+      ops.add_to_collection("v", v2)
+
+      sess.run(variables.global_variables_initializer())
+      builder.add_meta_graph_and_variables(sess, ["pre_foo"])
+
+      train_op = state_ops.assign_add(v1, v2)
+      sess.run(train_op)
+      # TODO(karmel): remove explicit call when in the public method.
+      builder._add_train_op(train_op)
+      builder.add_meta_graph(["foo"])
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      loader.load(sess, ["foo"], export_dir)
+      self.assertIsInstance(
+          ops.get_collection(constants.TRAIN_OP_KEY)[0], ops.Tensor)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      loader.load(sess, ["pre_foo"], export_dir)
+      self.assertFalse(ops.get_collection(constants.TRAIN_OP_KEY))
+
   def testMultipleAssets(self):
     export_dir = self._get_export_dir("test_multiple_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
@@ -1028,12 +1115,8 @@ class SavedModelTest(test.TestCase):
     # does not have any attr values for the "TestAttr" node, and there is no
     # default specified in the TestAttr OpDef.
     sess = session.Session(graph=ops.Graph())
-    if ops._USE_C_API:
-      error_message = "NodeDef missing attr 'T' from Op<name=TestAttr"
-    else:
-      error_message = ("Expected one attr with name .*T(out)?.* in name: "
-                       "\"test_attr\".*")
-    with self.assertRaisesRegexp(ValueError, error_message):
+    with self.assertRaisesRegexp(
+        ValueError, "NodeDef missing attr 'T' from Op<name=TestAttr"):
       loader.load(sess, ["foo"], export_dir)
 
     # Rewrite the SavedModel to change the type of the T attr in "test_attr"
diff --git a/tensorflow/python/saved_model/signature_constants.py b/tensorflow/python/saved_model/signature_constants.py
index 819f351291f2bbf24a433c1bc77578af594c050b..99007a9634b27dc1ad6d06b8f002ffa35a96dbcc 100644
--- a/tensorflow/python/saved_model/signature_constants.py
+++ b/tensorflow/python/saved_model/signature_constants.py
@@ -94,3 +94,9 @@ tf_export("saved_model.signature_constants.REGRESS_OUTPUTS").export_constant(
     __name__, "REGRESS_OUTPUTS")
 
 ################################################################################
+# Train/Eval API constants.
+# Not exported while export_all_saved_models is in contrib.
+
+SUPERVISED_TRAIN_METHOD_NAME = "tensorflow/supervised/training"
+
+SUPERVISED_EVAL_METHOD_NAME = "tensorflow/supervised/eval"
diff --git a/tensorflow/python/saved_model/signature_def_utils.py b/tensorflow/python/saved_model/signature_def_utils.py
index ea0f52f17e56f94054e60713c785257d57fb2d0d..27d6b70e9dce3ff67b7912efcea8e2994d138dc6 100644
--- a/tensorflow/python/saved_model/signature_def_utils.py
+++ b/tensorflow/python/saved_model/signature_def_utils.py
@@ -26,6 +26,8 @@ from tensorflow.python.saved_model.signature_def_utils_impl import classificatio
 from tensorflow.python.saved_model.signature_def_utils_impl import is_valid_signature
 from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def
 from tensorflow.python.saved_model.signature_def_utils_impl import regression_signature_def
+from tensorflow.python.saved_model.signature_def_utils_impl import supervised_eval_signature_def
+from tensorflow.python.saved_model.signature_def_utils_impl import supervised_train_signature_def
 # pylint: enable=unused-import
 
 del absolute_import
diff --git a/tensorflow/python/saved_model/signature_def_utils_impl.py b/tensorflow/python/saved_model/signature_def_utils_impl.py
index d0331591889110df86bdb2ac69c037bc3b968f91..f8ad788f7775b6cd053004843d45560dbcda9840 100644
--- a/tensorflow/python/saved_model/signature_def_utils_impl.py
+++ b/tensorflow/python/saved_model/signature_def_utils_impl.py
@@ -185,6 +185,62 @@ def predict_signature_def(inputs, outputs):
   return signature_def
 
 
+def supervised_train_signature_def(
+    inputs, loss, predictions=None, metrics=None):
+  return _supervised_signature_def(
+      signature_constants.SUPERVISED_TRAIN_METHOD_NAME, inputs, loss=loss,
+      predictions=predictions, metrics=metrics)
+
+
+def supervised_eval_signature_def(
+    inputs, loss, predictions=None, metrics=None):
+  return _supervised_signature_def(
+      signature_constants.SUPERVISED_EVAL_METHOD_NAME, inputs, loss=loss,
+      predictions=predictions, metrics=metrics)
+
+
+def _supervised_signature_def(
+    method_name, inputs, loss=None, predictions=None,
+    metrics=None):
+  """Creates a signature for training and eval data.
+
+  This function produces signatures that describe the inputs and outputs
+  of a supervised process, such as training or evaluation, that
+  results in loss, metrics, and the like. Note that this function only requires
+  inputs to be not None.
+
+  Args:
+    method_name: Method name of the SignatureDef as a string.
+    inputs: dict of string to `Tensor`.
+    loss: dict of string to `Tensor` representing computed loss.
+    predictions: dict of string to `Tensor` representing the output predictions.
+    metrics: dict of string to `Tensor` representing metric ops.
+
+  Returns:
+    A train- or eval-flavored signature_def.
+
+  Raises:
+    ValueError: If inputs or outputs is `None`.
+  """
+  if inputs is None or not inputs:
+    raise ValueError('{} inputs cannot be None or empty.'.format(method_name))
+
+  signature_inputs = {key: utils.build_tensor_info(tensor)
+                      for key, tensor in inputs.items()}
+
+  signature_outputs = {}
+  for output_set in (loss, predictions, metrics):
+    if output_set is not None:
+      sig_out = {key: utils.build_tensor_info(tensor)
+                 for key, tensor in output_set.items()}
+      signature_outputs.update(sig_out)
+
+  signature_def = build_signature_def(
+      signature_inputs, signature_outputs, method_name)
+
+  return signature_def
+
+
 @tf_export('saved_model.signature_def_utils.is_valid_signature')
 def is_valid_signature(signature_def):
   """Determine whether a SignatureDef can be served by TensorFlow Serving."""
diff --git a/tensorflow/python/saved_model/signature_def_utils_test.py b/tensorflow/python/saved_model/signature_def_utils_test.py
index b2bd14db8cdab3b49516d8b7516577ed61ea604d..ebc54506335d41ae559951b9eaf3acd36ca9d870 100644
--- a/tensorflow/python/saved_model/signature_def_utils_test.py
+++ b/tensorflow/python/saved_model/signature_def_utils_test.py
@@ -180,6 +180,101 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_STRING, output2_tensor_info_actual.dtype)
     self.assertEqual(0, len(output2_tensor_info_actual.tensor_shape.dim))
 
+  def testTrainSignatureDef(self):
+    self._testSupervisedSignatureDef(
+        signature_def_utils_impl.supervised_train_signature_def,
+        signature_constants.SUPERVISED_TRAIN_METHOD_NAME)
+
+  def testEvalSignatureDef(self):
+    self._testSupervisedSignatureDef(
+        signature_def_utils_impl.supervised_eval_signature_def,
+        signature_constants.SUPERVISED_EVAL_METHOD_NAME)
+
+  def _testSupervisedSignatureDef(self, fn_to_test, method_name):
+    inputs = {
+        "input-1": constant_op.constant("a", name="input-1"),
+        "input-2": constant_op.constant("b", name="input-2"),
+    }
+    loss = {"loss-1": constant_op.constant(0.45, name="loss-1")}
+    predictions = {
+        "classes": constant_op.constant([100], name="classes"),
+    }
+    metrics_val = constant_op.constant(100.0, name="metrics_val")
+    metrics = {
+        "metrics/value": metrics_val,
+        "metrics/update_op": array_ops.identity(metrics_val, name="metrics_op"),
+    }
+
+    signature_def = fn_to_test(inputs, loss, predictions, metrics)
+
+    self.assertEqual(method_name, signature_def.method_name)
+
+    # Check inputs in signature def.
+    self.assertEqual(2, len(signature_def.inputs))
+    input1_tensor_info_actual = (signature_def.inputs["input-1"])
+    self.assertEqual("input-1:0", input1_tensor_info_actual.name)
+    self.assertEqual(types_pb2.DT_STRING, input1_tensor_info_actual.dtype)
+    self.assertEqual(0, len(input1_tensor_info_actual.tensor_shape.dim))
+    input2_tensor_info_actual = (signature_def.inputs["input-2"])
+    self.assertEqual("input-2:0", input2_tensor_info_actual.name)
+    self.assertEqual(types_pb2.DT_STRING, input2_tensor_info_actual.dtype)
+    self.assertEqual(0, len(input2_tensor_info_actual.tensor_shape.dim))
+
+    # Check outputs in signature def.
+    self.assertEqual(4, len(signature_def.outputs))
+    self.assertEqual("loss-1:0", signature_def.outputs["loss-1"].name)
+    self.assertEqual(types_pb2.DT_FLOAT, signature_def.outputs["loss-1"].dtype)
+
+    self.assertEqual("classes:0", signature_def.outputs["classes"].name)
+    self.assertEqual(1, len(signature_def.outputs["classes"].tensor_shape.dim))
+
+    self.assertEqual(
+        "metrics_val:0", signature_def.outputs["metrics/value"].name)
+    self.assertEqual(
+        types_pb2.DT_FLOAT, signature_def.outputs["metrics/value"].dtype)
+
+    self.assertEqual(
+        "metrics_op:0", signature_def.outputs["metrics/update_op"].name)
+    self.assertEqual(
+        types_pb2.DT_FLOAT, signature_def.outputs["metrics/value"].dtype)
+
+  def testTrainSignatureDefMissingInputs(self):
+    self._testSupervisedSignatureDefMissingInputs(
+        signature_def_utils_impl.supervised_train_signature_def,
+        signature_constants.SUPERVISED_TRAIN_METHOD_NAME)
+
+  def testEvalSignatureDefMissingInputs(self):
+    self._testSupervisedSignatureDefMissingInputs(
+        signature_def_utils_impl.supervised_eval_signature_def,
+        signature_constants.SUPERVISED_EVAL_METHOD_NAME)
+
+  def _testSupervisedSignatureDefMissingInputs(self, fn_to_test, method_name):
+    inputs = {
+        "input-1": constant_op.constant("a", name="input-1"),
+        "input-2": constant_op.constant("b", name="input-2"),
+    }
+    loss = {"loss-1": constant_op.constant(0.45, name="loss-1")}
+    predictions = {
+        "classes": constant_op.constant([100], name="classes"),
+    }
+    metrics_val = constant_op.constant(100, name="metrics_val")
+    metrics = {
+        "metrics/value": metrics_val,
+        "metrics/update_op": array_ops.identity(metrics_val, name="metrics_op"),
+    }
+
+    with self.assertRaises(ValueError):
+      signature_def = fn_to_test(
+          {}, loss=loss, predictions=predictions, metrics=metrics)
+
+    signature_def = fn_to_test(inputs, loss=loss)
+    self.assertEqual(method_name, signature_def.method_name)
+    self.assertEqual(1, len(signature_def.outputs))
+
+    signature_def = fn_to_test(inputs, metrics=metrics, loss=loss)
+    self.assertEqual(method_name, signature_def.method_name)
+    self.assertEqual(3, len(signature_def.outputs))
+
   def testGetShapeAndTypes(self):
     inputs = {
         "input-1": constant_op.constant(["a", "b"]),
diff --git a/tensorflow/python/saved_model/tag_constants.py b/tensorflow/python/saved_model/tag_constants.py
index 5a797da791c82d9c81107ba940aceea7849c0c46..c82154e7b93aaec9bea826c319e3a93fc3b48a5b 100644
--- a/tensorflow/python/saved_model/tag_constants.py
+++ b/tensorflow/python/saved_model/tag_constants.py
@@ -32,6 +32,9 @@ TRAINING = "train"
 tf_export("saved_model.tag_constants.TRAINING").export_constant(
     __name__, "TRAINING")
 
+# Tag for the `eval` graph. Not exported while the export logic is in contrib.
+EVAL = "eval"
+
 # Tag for the `gpu` graph.
 GPU = "gpu"
 tf_export("saved_model.tag_constants.GPU").export_constant(__name__, "GPU")
@@ -39,3 +42,5 @@ tf_export("saved_model.tag_constants.GPU").export_constant(__name__, "GPU")
 # Tag for the `tpu` graph.
 TPU = "tpu"
 tf_export("saved_model.tag_constants.TPU").export_constant(__name__, "TPU")
+
+
diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py
index 084a4500f8e1eb7f75e1e01668fae655b5e06763..fcb3ceac827bd7b7c24f06d1ba43ac3ca0bcf6b6 100644
--- a/tensorflow/python/tools/optimize_for_inference_test.py
+++ b/tensorflow/python/tools/optimize_for_inference_test.py
@@ -39,7 +39,6 @@ from tensorflow.python.platform import test
 from tensorflow.python.tools import optimize_for_inference_lib
 
 
-@test_util.with_c_api
 class OptimizeForInferenceTest(test.TestCase):
 
   def create_node_def(self, op, name, inputs):
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index abcf76a2204f33352f9e3b7bd4a721b4bf871236..df528d54d6503ef1626d0b3bc4b5afe9e0616c31 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -203,7 +203,7 @@ class LoggingTensorHook(session_run_hook.SessionRunHook):
       self._tag_order = tensors
       tensors = {item: item for item in tensors}
     else:
-      self._tag_order = tensors.keys()
+      self._tag_order = sorted(tensors.keys())
     self._tensors = tensors
     self._formatter = formatter
     self._timer = (
diff --git a/tensorflow/python/training/checkpointable/BUILD b/tensorflow/python/training/checkpointable/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a7ae6e50a9975d8d164d5c7455aedb8b39ed802e
--- /dev/null
+++ b/tensorflow/python/training/checkpointable/BUILD
@@ -0,0 +1,93 @@
+# Description:
+#   Utilities for reading and writing object-based checkpoints.
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "base",
+    srcs = ["base.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:io_ops_gen",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:saveable_object",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_test(
+    name = "base_test",
+    srcs = ["base_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_library(
+    name = "util",
+    srcs = ["util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:io_ops_gen",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:saveable_object",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_test(
+    name = "util_test",
+    srcs = ["util_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "notsan",  # b/74395663
+    ],
+    deps = [
+        ":base",
+        ":util",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:template",
+        "//tensorflow/python:training",
+        "//tensorflow/python:training_util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:layers",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/python/training/checkpointable.py b/tensorflow/python/training/checkpointable/base.py
similarity index 86%
rename from tensorflow/python/training/checkpointable.py
rename to tensorflow/python/training/checkpointable/base.py
index 05afd37ccd5e80c3bbb1ecc2638c559b0115866b..e378f0e898c88d6f3e4fa22e93ca2165f2325660 100644
--- a/tensorflow/python/training/checkpointable.py
+++ b/tensorflow/python/training/checkpointable/base.py
@@ -18,14 +18,21 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import functools
+import json
+import weakref
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import saveable_object
 from tensorflow.python.util import nest
+from tensorflow.python.util import serialization
 
 
 # Key where the object graph proto is saved in a TensorBundle
@@ -37,6 +44,7 @@ OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
 # the object has no dependencies, then its value may be restored on object
 # creation (avoiding double assignment when executing eagerly).
 VARIABLE_VALUE_KEY = "VARIABLE_VALUE"
+OBJECT_CONFIG_JSON_KEY = "OBJECT_CONFIG_JSON"
 
 CheckpointableReference = collections.namedtuple(
     "CheckpointableReference",
@@ -85,6 +93,35 @@ class CheckpointInitialValue(ops.Tensor):
     return self._checkpoint_position
 
 
+class PythonStringStateSaveable(saveable_object.SaveableObject):
+  """Saves Python state in a checkpoint."""
+
+  def __init__(self, name, state_callback):
+    """Configure saving.
+
+    Args:
+      name: The checkpoint key to write to.
+      state_callback: A function taking no arguments which returns a
+        string. This function is run every time a checkpoint is written.
+    """
+    if context.executing_eagerly():
+      self._save_string = (
+          lambda: constant_op.constant(state_callback(), dtype=dtypes.string))
+    else:
+      self._save_string = constant_op.constant("", dtype=dtypes.string)
+      self.feed_dict_additions = (
+          lambda: {self._save_string: state_callback()})
+    spec = saveable_object.SaveSpec(
+        self._save_string, "", name, dtype=dtypes.string)
+    super(PythonStringStateSaveable, self).__init__(
+        self._save_string, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    # TODO(allenl): Add a Python hook for state coming out of a checkpoint
+    # (currently PythonStringStateSaveable is write-only).
+    return control_flow_ops.no_op()
+
+
 class _CheckpointPosition(object):
   """Indicates a position within a `_Checkpoint`."""
 
@@ -340,6 +377,21 @@ class CheckpointableBase(object):
           "Internal error: the object had an update UID set before its "
           "initialization code was run.")
     self._update_uid = -1
+    # When executing eagerly, holds a collection of _NameBasedRestoreCoordinator
+    # instances, which should be checked when creating variables or other
+    # saveables. These are passed on recursively to all dependencies, since
+    # unlike object-based checkpoint restores we don't know which subgraph is
+    # being restored in advance. This mechanism is only necessary for
+    # restore-on-create when executing eagerly, and so is unused when graph
+    # building.
+    self._name_based_restores = set()
+
+  def _name_based_attribute_restore(self, checkpoint):
+    """Restore the object's attributes from a name-based checkpoint."""
+    self._name_based_restores.add(checkpoint)
+    if self._update_uid < checkpoint.restore_uid:
+      checkpoint.eager_restore(self)
+      self._update_uid = checkpoint.restore_uid
 
   @property
   def _checkpoint_dependencies(self):
@@ -570,6 +622,7 @@ class CheckpointableBase(object):
         `CheckpointableBase`).
     """
     self._maybe_initialize_checkpointable()
+    checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
     deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
     for checkpoint_position in sorted(
         deferred_dependencies_list,
@@ -577,6 +630,13 @@ class CheckpointableBase(object):
         reverse=True):
       checkpoint_position.restore(checkpointable)
 
+    # Pass on any name-based restores queued in this object.
+    for name_based_restore in sorted(
+        self._name_based_restores,
+        key=lambda checkpoint: checkpoint.restore_uid,
+        reverse=True):
+      checkpointable._name_based_attribute_restore(name_based_restore)  # pylint: disable=protected-access
+
   def _restore_from_checkpoint_position(self, checkpoint_position):
     """Restore this object and its dependencies (may be deferred)."""
     # Attempt a breadth-first traversal, since presumably the user has more
@@ -604,7 +664,6 @@ class CheckpointableBase(object):
     # restoration on to our dependencies.
     if checkpoint.restore_uid > self._update_uid:
       restore_ops = checkpoint_position.restore_ops()
-      # TODO(allenl): Get a list of feeds for saving Python state
       self._update_uid = checkpoint.restore_uid
     else:
       restore_ops = ()
@@ -656,7 +715,60 @@ class CheckpointableBase(object):
        lambda name="global_name_for_this_object":
        SaveableObject(name=name, ...)}
     """
-    return {}
+    if not hasattr(self, "get_config"):
+      return {}
+    try:
+      self.get_config()
+    except NotImplementedError:
+      return {}
+    weak_self = weakref.ref(self)
+    def _state_callback():
+      dereferenced_self = weak_self()
+      if dereferenced_self:
+        return json.dumps(self,
+                          default=serialization.get_json_type,
+                          sort_keys=True).encode("utf8")
+      else:
+        return ""
+    return {OBJECT_CONFIG_JSON_KEY: functools.partial(
+        PythonStringStateSaveable,
+        state_callback=_state_callback)}
+
+
+class NoDependency(object):
+  """Allows attribute assignment to `Checkpointable` objects with no dependency.
+
+  Example usage:
+  ```python
+  obj = Checkpointable()
+  obj.has_dependency = tf.Variable(0., name="dep")
+  obj.no_dependency = NoDependency(tf.Variable(1., name="nodep"))
+  assert obj.no_dependency.name == "nodep:0"
+  ```
+
+  `obj` in this example has a dependency on the variable "dep", and both
+  attributes contain un-wrapped `Variable` objects.
+
+  `NoDependency` also works with `tf.keras.Model`, but only for checkpoint
+  dependencies: wrapping a `Layer` in `NoDependency` will assign the (unwrapped)
+  `Layer` to the attribute without a checkpoint dependency, but the `Model` will
+  still track the `Layer` (so it will appear in `Model.layers`, and its
+  variables will appear in `Model.variables`).
+  """
+
+  def __init__(self, value):
+    self.value = value
+
+
+class NotCheckpointable(object):
+  """Marks instances of child classes as unsaveable using an object-based API.
+
+  Useful for marking objects which would otherwise look checkpointable because
+  of inheritance (e.g. through `Layer`) as not checkpointable. Inheriting from
+  `NotCheckpointable` does not prevent an object from being assigned to any
+  attributes, but will throw an error on save/restore.
+  """
+  pass
 
 
 class Checkpointable(CheckpointableBase):
@@ -691,8 +803,11 @@ class Checkpointable(CheckpointableBase):
     """Support self.foo = checkpointable syntax."""
     # Perform the attribute assignment, and potentially call other __setattr__
     # overrides such as that for tf.keras.Model.
+    no_dependency = isinstance(value, NoDependency)
+    if no_dependency:
+      value = value.value
     super(Checkpointable, self).__setattr__(name, value)
-    if isinstance(value, CheckpointableBase):
+    if not no_dependency and isinstance(value, CheckpointableBase):
       self._track_checkpointable(
           value, name=name,
           # Allow the user to switch the Checkpointable which is tracked by this
diff --git a/tensorflow/python/training/checkpointable_test.py b/tensorflow/python/training/checkpointable/base_test.py
similarity index 74%
rename from tensorflow/python/training/checkpointable_test.py
rename to tensorflow/python/training/checkpointable/base_test.py
index e79acb49758b6a7d69dd084692d434bea808db64..0a274cdfed5af83a69513e9b26bf427f284a4df7 100644
--- a/tensorflow/python/training/checkpointable_test.py
+++ b/tensorflow/python/training/checkpointable/base_test.py
@@ -17,7 +17,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.platform import test
-from tensorflow.python.training import checkpointable
+from tensorflow.python.training.checkpointable import base as checkpointable
 
 
 class InterfaceTests(test.TestCase):
@@ -34,6 +34,16 @@ class InterfaceTests(test.TestCase):
     root.leaf = duplicate_name_dep
     root._track_checkpointable(duplicate_name_dep, name="leaf", overwrite=True)
 
+  def testNoDependency(self):
+    root = checkpointable.Checkpointable()
+    hasdep = checkpointable.Checkpointable()
+    root.hasdep = hasdep
+    nodep = checkpointable.Checkpointable()
+    root.nodep = checkpointable.NoDependency(nodep)
+    self.assertEqual(1, len(root._checkpoint_dependencies))
+    self.assertIs(root._checkpoint_dependencies[0].ref, root.hasdep)
+    self.assertIs(root.hasdep, hasdep)
+    self.assertIs(root.nodep, nodep)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/checkpointable_utils.py b/tensorflow/python/training/checkpointable/util.py
similarity index 80%
rename from tensorflow/python/training/checkpointable_utils.py
rename to tensorflow/python/training/checkpointable/util.py
index cf4112ff99b3d693c1724eea74e239d16db1b93d..96e6d10791f396ad7f9f73cce9356dd4cbe3ce9d 100644
--- a/tensorflow/python/training/checkpointable_utils.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -30,13 +30,16 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_io_ops as io_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import checkpointable as checkpointable_lib
 from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.training import saveable_object as saveable_object_lib
 from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training.checkpointable import base as checkpointable_lib
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
@@ -116,6 +119,76 @@ class _CheckpointRestoreCoordinator(object):
                     slot_name=slot_reference.slot_name))
 
 
+class _NameBasedRestoreCoordinator(object):
+  """Keeps the status of a name-based checkpoint restore."""
+
+  def __init__(self, save_path, dtype_map=None):
+    self.save_path = save_path
+    self.dtype_map = dtype_map
+    self.unused_attributes = weakref.WeakKeyDictionary()
+    self.restore_uid = ops.uid()
+
+  def globally_named_object_attributes(self, checkpointable):
+    """Create globally named SaveableObjects from attributes.
+
+    If an object's attribute has no global name specified (default construction
+    for the SaveableObject factory), records the failure in
+    `self.unused_attributes` (which can then be used to make status assertions
+    fail; see `NameBasedSaverStatus`).
+
+    Args:
+      checkpointable: An object to save.
+
+    Yields:
+      SaveableObjects for `checkpointable`'s attributes.
+    """
+    for attribute_name, saveable_factory in (
+        checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
+      if callable(saveable_factory):
+        try:
+          # This saveable object factory does not have a default name= argument,
+          # which means there's no way to save/restore it using a name-based
+          # checkpoint. Ignore the error now and make sure assert_consumed()
+          # fails.
+          saveable = saveable_factory()
+        except TypeError:
+          self.unused_attributes.setdefault(checkpointable, []).append(
+              attribute_name)
+          continue
+      else:
+        saveable = saveable_factory
+      names_to_saveables = saver_lib.BaseSaverBuilder.OpListToDict(
+          [saveable],
+          convert_variable_to_tensor=False)
+      for name, op in names_to_saveables.items():
+        for saveable_object in saver_lib.BaseSaverBuilder.SaveableObjectsForOp(
+            op=op, name=name):
+          yield saveable_object
+
+  def eager_restore(self, checkpointable):
+    """Runs restore ops for `checkpointable`'s attributes."""
+    # When graph building, we don't add any restore ops to the graph until
+    # run_restore_ops/initialize_or_restore on the status object for name-based
+    # checkpoints.
+    assert context.executing_eagerly()
+    for saveable in self.globally_named_object_attributes(
+        checkpointable):
+      restored_tensors = []
+      for spec in saveable.specs:
+        if spec.name in self.dtype_map:
+          with ops.device("cpu:0"):
+            restored, = io_ops.restore_v2(
+                prefix=self.save_path,
+                tensor_names=[spec.name],
+                shape_and_slices=[""],
+                dtypes=[self.dtype_map[spec.name]],
+                name="%s_checkpoint_read" % (spec.name,))
+          restored_tensors.append(array_ops.identity(restored))
+
+      saveable.restore(restored_tensors=restored_tensors,
+                       restored_shapes=None)
+
+
 # TODO(allenl): If this ends up in a public API, consider adding LINT.IfChange
 # or consolidating the implementation with get_variable.
 def _default_getter(name, shape, dtype, initializer=None,
@@ -204,6 +277,12 @@ def _breadth_first_checkpointable_traversal(root_checkpointable):
   path_to_root = {root_checkpointable: ()}
   while to_visit:
     current_checkpointable = to_visit.popleft()
+    if isinstance(current_checkpointable, checkpointable_lib.NotCheckpointable):
+      raise NotImplementedError(
+          ("The object %s does not support object-based saving. File a feature "
+           "request if this limitation bothers you. In the meantime, you can "
+           "remove the dependency on this object and save everything else.")
+          % (current_checkpointable,))
     current_checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
     bfs_sorted.append(current_checkpointable)
     for child_checkpointable in (
@@ -303,42 +382,93 @@ def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
 
 
 def _serialize_checkpointables(
-    checkpointable_objects, node_ids, object_names, slot_variables):
+    checkpointable_objects, node_ids, object_names, slot_variables,
+    saveables_cache):
   """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
   object_graph_proto = (
       checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-  named_saveables = {}
-
+  named_saveables = []
+  feed_additions = {}
   for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
     assert node_ids[checkpointable] == checkpoint_id
     object_proto = object_graph_proto.nodes.add()
     object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
     object_name = object_names[checkpointable]
+    if saveables_cache is not None:
+      cached_attributes = saveables_cache.setdefault(checkpointable, {})
+    else:
+      cached_attributes = None
     for name, saveable_factory in (
         checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
       attribute = object_proto.attributes.add()
       attribute.name = name
       attribute.checkpoint_key = "%s/%s/%s" % (
           object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
-      if callable(saveable_factory):
-        saveable = saveable_factory(name=attribute.checkpoint_key)
+      if cached_attributes is None:
+        saveables = None
       else:
-        saveable = saveable_factory
-      # Figure out the name-based Saver's name for this variable.
-      saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
-          [saveable], convert_variable_to_tensor=False)
-      attribute.full_name, = saver_dict.keys()
-      named_saveables[attribute.checkpoint_key] = saveable
+        saveables = cached_attributes.get(name, None)
+        if saveables is not None:
+          for saveable in saveables:
+            if attribute.checkpoint_key not in saveable.name:
+              # The checkpoint key for this SaveableObject is different. We need
+              # to re-create it.
+              saveables = None
+              del cached_attributes[name]
+              break
+      if saveables is None:
+        if callable(saveable_factory):
+          maybe_saveable = saveable_factory(name=attribute.checkpoint_key)
+        else:
+          maybe_saveable = saveable_factory
+        if isinstance(maybe_saveable, saveable_object_lib.SaveableObject):
+          saveables = (maybe_saveable,)
+        else:
+          # Figure out the name-based Saver's name for this variable. If it's
+          # already a SaveableObject we'd just get the checkpoint key back, so
+          # we leave full_name blank.
+          saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
+              [maybe_saveable], convert_variable_to_tensor=False)
+          full_name, = saver_dict.keys()
+          saveables = tuple(saver_lib.BaseSaverBuilder.SaveableObjectsForOp(
+              op=maybe_saveable, name=attribute.checkpoint_key))
+          for saveable in saveables:
+            saveable.full_name = full_name
+        for saveable in saveables:
+          if attribute.checkpoint_key not in saveable.name:
+            raise AssertionError(
+                ("The object %s produced a SaveableObject with name '%s' for "
+                 "attribute '%s'. Expected a name containing '%s'.")
+                % (checkpointable, name, saveable.name,
+                   attribute.checkpoint_key))
+        if cached_attributes is not None:
+          cached_attributes[name] = saveables
+
+      for saveable in saveables:
+        if hasattr(saveable, "full_name"):
+          attribute.full_name = saveable.full_name
+        saveable_feed_dict_fn = getattr(saveable, "feed_dict_additions", None)
+        if saveable_feed_dict_fn is not None:
+          saveable_feed_dict = saveable_feed_dict_fn()  # pylint: disable=not-callable
+          for new_feed_key in saveable_feed_dict.keys():
+            if new_feed_key in feed_additions:
+              raise AssertionError(
+                  ("The object %s tried to feed a value for the Tensor %s "
+                   "when saving, but another object is already feeding a "
+                   "value.")
+                  % (checkpointable, new_feed_key))
+          feed_additions.update(saveable_feed_dict)
+      named_saveables.extend(saveables)
 
     for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
       child_proto = object_proto.children.add()
       child_proto.node_id = node_ids[child.ref]
       child_proto.local_name = child.name
 
-  return named_saveables, object_graph_proto
+  return named_saveables, object_graph_proto, feed_additions
 
 
-def _serialize_object_graph(root_checkpointable):
+def _serialize_object_graph(root_checkpointable, saveables_cache):
   """Determine checkpoint keys for variables and build a serialized graph.
 
   Non-slot variables are keyed based on a shortest path from the root saveable
@@ -351,12 +481,17 @@ def _serialize_object_graph(root_checkpointable):
   Args:
     root_checkpointable: A `Checkpointable` object whose variables (including
       the variables of dependencies, recursively) should be saved.
+    saveables_cache: A dictionary mapping `Checkpointable` objects -> attribute
+      names -> SaveableObjects, used to avoid re-creating SaveableObjects when
+      graph building.
 
   Returns:
-    A tuple of (named_variables, object_graph_proto):
+    A tuple of (named_variables, object_graph_proto, feed_additions):
       named_variables: A dictionary mapping names to variable objects.
       object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
         the serialized object graph and variable references.
+      feed_additions: A dictionary mapping from Tensors to values which should
+        be fed when saving.
 
   Raises:
     ValueError: If there are invalid characters in an optimizer's slot names.
@@ -376,7 +511,8 @@ def _serialize_object_graph(root_checkpointable):
       checkpointable_objects=checkpointable_objects,
       node_ids=node_ids,
       object_names=object_names,
-      slot_variables=slot_variables)
+      slot_variables=slot_variables,
+      saveables_cache=saveables_cache)
 
 
 def list_objects(root_checkpointable):
@@ -623,32 +759,61 @@ _DEPRECATED_RESTORE_INSTRUCTIONS = (
     "Restoring a name-based tf.train.Saver checkpoint using the object-based "
     "restore API. This mode uses global names to match variables, and so is "
     "somewhat fragile. It also adds new restore ops to the graph each time it "
-    "is called. Prefer re-encoding training checkpoints in the object-based "
-    "format: run save() on the object-based saver (the same one this message "
-    "is coming from) and use that checkpoint in the future.")
+    "is called when graph building. Prefer re-encoding training checkpoints in "
+    "the object-based format: run save() on the object-based saver (the same "
+    "one this message is coming from) and use that checkpoint in the future.")
 
 
+@deprecation.deprecated(
+    date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
 class NameBasedSaverStatus(_LoadStatus):
   """Status for loading a name-based training checkpoint."""
 
-  def __init__(self, object_saver, save_path):
-    self._object_saver = object_saver
-    self._save_path = save_path
+  def __init__(self, checkpoint, root_checkpointable):
+    self._checkpoint = checkpoint
+    self._root_checkpointable = root_checkpointable
 
   def assert_consumed(self):
-    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
-    raise AssertionError(
-        "Restoring a name-based checkpoint. No load status is available.")
+    """Raises an exception if any variables/objects are unmatched."""
+    unused_attributes = dict(self._checkpoint.unused_attributes)
+    if unused_attributes:
+      raise AssertionError(
+          "Some objects had attributes which were not restored: %s"
+          % (unused_attributes,))
+    for checkpointable in list_objects(self._root_checkpointable):
+      # pylint: disable=protected-access
+      checkpointable._maybe_initialize_checkpointable()
+      if checkpointable._update_uid < self._checkpoint.restore_uid:
+        raise AssertionError("Object not restored: %s" % (checkpointable,))
+      # pylint: enable=protected-access
+
+  def _gather_saveable_objects(self):
+    """Walk the object graph, using global names for SaveableObjects."""
+    objects = list_objects(self._root_checkpointable)
+    saveable_objects = []
+    for checkpointable in objects:
+      # pylint: disable=protected-access
+      checkpointable._maybe_initialize_checkpointable()
+      if checkpointable._update_uid < self._checkpoint.restore_uid:
+        checkpointable._update_uid = self._checkpoint.restore_uid
+      else:
+        continue
+      # pylint: enable=protected-access
+      saveable_objects.extend(
+          self._checkpoint.globally_named_object_attributes(
+              checkpointable))
+    return saveable_objects
 
-  @deprecation.deprecated(
-      date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
   def run_restore_ops(self, session=None):
     """Load the name-based training checkpoint using a new `tf.train.Saver`."""
-    if session is None and not context.executing_eagerly():
+    if context.executing_eagerly():
+      return  # Nothing to do, variables are restored on creation.
+    if session is None:
       session = ops.get_default_session()
     with ops.device("/cpu:0"):
-      saver_lib.Saver(self._object_saver._global_variable_names()).restore(  # pylint: disable=protected-access
-          sess=session, save_path=self._save_path)
+      saveables = self._gather_saveable_objects()
+      saver_lib.Saver(saveables).restore(
+          sess=session, save_path=self._checkpoint.save_path)
 
   def initialize_or_restore(self, session=None):
     """Alias for `run_restore_ops`."""
@@ -728,6 +893,14 @@ class CheckpointableSaver(object):
     self._last_restore_object_graph = None
     self._last_restore_checkpoint = None
 
+    if context.executing_eagerly():
+      # SaveableObjects are always recreated when executing eagerly.
+      self._saveable_object_cache = None
+    else:
+      # Maps Checkpointable objects -> attribute names -> SaveableObjects, to
+      # avoid re-creating SaveableObjects when graph building.
+      self._saveable_object_cache = weakref.WeakKeyDictionary()
+
   @property
   def _root_checkpointable(self):
     if isinstance(self._root_checkpointable_ref, weakref.ref):
@@ -759,8 +932,9 @@ class CheckpointableSaver(object):
     Returns:
       The full path to the checkpoint.
     """
-    named_variables, graph_proto = _serialize_object_graph(
-        self._root_checkpointable)
+    named_variables, graph_proto, feed_additions = _serialize_object_graph(
+        self._root_checkpointable,
+        saveables_cache=self._saveable_object_cache)
     if not context.executing_eagerly():
       if session is None:
         session = ops.get_default_session()
@@ -769,15 +943,15 @@ class CheckpointableSaver(object):
           self._object_graph_feed_tensor = constant_op.constant(
               "", dtype=dtypes.string)
       object_graph_tensor = self._object_graph_feed_tensor
-      feed_additions = {object_graph_tensor: graph_proto.SerializeToString()}
+      feed_additions.update(
+          {object_graph_tensor: graph_proto.SerializeToString()})
     else:
       session = None
       with ops.device("/cpu:0"):
         object_graph_tensor = constant_op.constant(
             graph_proto.SerializeToString(), dtype=dtypes.string)
-      feed_additions = None
     assert checkpointable_lib.OBJECT_GRAPH_PROTO_KEY not in named_variables
-    named_variables[checkpointable_lib.OBJECT_GRAPH_PROTO_KEY] = (
+    named_variables.append(
         _NoRestoreSaveable(
             tensor=object_graph_tensor,
             name=checkpointable_lib.OBJECT_GRAPH_PROTO_KEY))
@@ -802,17 +976,6 @@ class CheckpointableSaver(object):
           global_step=checkpoint_number)
     return save_path
 
-  def _global_variable_names(self):
-    """Generate a `tf.train.Saver`-style `var_list` using `variable.name`s."""
-    named_saveables, graph_proto = _serialize_object_graph(
-        self._root_checkpointable)
-    saver_names = {}
-    for object_proto in graph_proto.nodes:
-      for attribute_proto in object_proto.attributes:
-        saver_names[attribute_proto.full_name] = named_saveables[
-            attribute_proto.checkpoint_key]
-    return saver_names
-
   def restore(self, save_path):
     """Restore a training checkpoint.
 
@@ -873,8 +1036,32 @@ class CheckpointableSaver(object):
     """
     if save_path is None:
       return InitializationOnlyStatus(self._root_checkpointable, ops.uid())
-    in_graph_mode = not context.executing_eagerly()
-    if in_graph_mode:
+    reader = pywrap_tensorflow.NewCheckpointReader(save_path)
+    graph_building = not context.executing_eagerly()
+    if graph_building:
+      dtype_map = None
+    else:
+      dtype_map = reader.get_variable_to_dtype_map()
+    try:
+      object_graph_string = reader.get_tensor(
+          checkpointable_lib.OBJECT_GRAPH_PROTO_KEY)
+    except errors_impl.NotFoundError:
+      # The object graph proto does not exist in this checkpoint. Try the
+      # name-based compatibility mode.
+      restore_coordinator = _NameBasedRestoreCoordinator(
+          save_path=save_path, dtype_map=dtype_map)
+      if not graph_building:
+        for existing_checkpointable in list_objects(self._root_checkpointable):
+          # pylint: disable=protected-access
+          existing_checkpointable._maybe_initialize_checkpointable()
+          existing_checkpointable._name_based_restores.add(restore_coordinator)
+          existing_checkpointable._name_based_attribute_restore(
+              restore_coordinator)
+          # pylint: enable=protected-access
+      return NameBasedSaverStatus(
+          restore_coordinator, root_checkpointable=self._root_checkpointable)
+
+    if graph_building:
       if self._file_prefix_placeholder is None:
         with ops.device("/cpu:0"):
           self._file_prefix_placeholder = constant_op.constant("model")
@@ -884,30 +1071,17 @@ class CheckpointableSaver(object):
       with ops.device("/cpu:0"):
         file_prefix_tensor = constant_op.constant(save_path)
       file_prefix_feed_dict = None
-    reader = pywrap_tensorflow.NewCheckpointReader(save_path)
-    try:
-      object_graph_string = reader.get_tensor(
-          checkpointable_lib.OBJECT_GRAPH_PROTO_KEY)
-    except errors_impl.NotFoundError:
-      # The object graph proto does not exist in this checkpoint. Try again with
-      # name-based saving.
-      return NameBasedSaverStatus(self, save_path)
-
     object_graph_proto = (
         checkpointable_object_graph_pb2.CheckpointableObjectGraph())
     object_graph_proto.ParseFromString(object_graph_string)
-    if in_graph_mode and object_graph_proto == self._last_restore_object_graph:
+    if graph_building and object_graph_proto == self._last_restore_object_graph:
       checkpoint = self._last_restore_checkpoint
     else:
-      if in_graph_mode:
-        dtype_map = None
-      else:
-        dtype_map = reader.get_variable_to_dtype_map()
       checkpoint = _CheckpointRestoreCoordinator(
           object_graph_proto=object_graph_proto,
           save_path=file_prefix_tensor,
           dtype_map=dtype_map)
-      if in_graph_mode:
+      if graph_building:
         if self._last_restore_object_graph is not None:
           raise NotImplementedError(
               "Using a single Saver to restore different object graphs is not "
@@ -1037,6 +1211,7 @@ class Checkpoint(checkpointable_lib.Checkpointable):
             % (v,))
       setattr(self, k, v)
     self._save_counter = None  # Created lazily for restore-on-create.
+    self._save_assign_op = None
     self._saver = CheckpointableSaver(weakref.ref(self))
 
   def _maybe_create_save_counter(self):
@@ -1044,8 +1219,11 @@ class Checkpoint(checkpointable_lib.Checkpointable):
     if self._save_counter is None:
       # Initialized to 0 and incremented before saving.
       with ops.device("/cpu:0"):
-        self._save_counter = add_variable(
-            self, name="save_counter", initializer=0, dtype=dtypes.int64)
+        # add_variable creates a dependency named "save_counter"; NoDependency
+        # prevents creating a second dependency named "_save_counter".
+        self._save_counter = checkpointable_lib.NoDependency(
+            add_variable(self, name="save_counter", initializer=0,
+                         dtype=dtypes.int64))
 
   @property
   def save_counter(self):
@@ -1077,8 +1255,8 @@ class Checkpoint(checkpointable_lib.Checkpointable):
     Returns:
       The full path to the checkpoint.
     """
-    in_graph_mode = not context.executing_eagerly()
-    if in_graph_mode:
+    graph_building = not context.executing_eagerly()
+    if graph_building:
       if session is None:
         session = ops.get_default_session()
       if self._save_counter is None:
@@ -1086,10 +1264,13 @@ class Checkpoint(checkpointable_lib.Checkpointable):
         # needs to be initialized before assign_add. This is only an issue if
         # restore() has not been called first.
         session.run(self.save_counter.initializer)
-    with ops.colocate_with(self.save_counter):
-      assign_op = self.save_counter.assign_add(1)
-    if in_graph_mode:
-      session.run(assign_op)
+    if not graph_building or self._save_assign_op is None:
+      with ops.colocate_with(self.save_counter):
+        assign_op = self.save_counter.assign_add(1, read_value=False)
+      if graph_building:
+        self._save_assign_op = assign_op
+    if graph_building:
+      session.run(self._save_assign_op)
     return self._saver.save(
         file_prefix=file_prefix,
         checkpoint_number=self.save_counter,
@@ -1134,9 +1315,9 @@ class Checkpoint(checkpointable_lib.Checkpointable):
     ops will grow as more objects are added to the dependency graph.
 
     Name-based `tf.train.Saver` checkpoints can be loaded using this
-    method. There is no deferred loading, and names are used to match
-    variables. No restore ops are created/run until `run_restore_ops()` or
-    `initialize_or_restore()` are called on the returned status object, even
+    method. Names are used to match variables. No restore ops are created/run
+    until `run_restore_ops()` or `initialize_or_restore()` are called on the
+    returned status object when graph building, but there is restore-on-creation
     when executing eagerly. Re-encode name-based checkpoints using
     `tf.train.Checkpoint.save` as soon as possible.
 
@@ -1162,14 +1343,13 @@ class Checkpoint(checkpointable_lib.Checkpointable):
       - `initialize_or_restore(session=None)`:
           When graph building, runs variable initializers if `save_path` is
           `None`, but otherwise runs restore operations. If no `session` is
-          explicitly specified, the default session is used. No effect for
-          object-based checkpoints when executing eagerly (variables are
-          initialized or restored eagerly).
+          explicitly specified, the default session is used. No effect when
+          executing eagerly (variables are initialized or restored eagerly).
       - `run_restore_ops(session=None)`:
           When graph building, runs restore operations. If no `session` is
-          explicitly specified, the default session is used. No effect for
-          object-based checkpoints when executing eagerly (restore operations
-          are run eagerly). May only be called when `save_path` is not `None`.
+          explicitly specified, the default session is used. No effect when
+          executing eagerly (restore operations are run eagerly). May only be
+          called when `save_path` is not `None`.
     """
     status = self._saver.restore(save_path=save_path)
     # Create the save counter now so it gets initialized with other variables
diff --git a/tensorflow/python/training/checkpointable_utils_test.py b/tensorflow/python/training/checkpointable/util_test.py
similarity index 91%
rename from tensorflow/python/training/checkpointable_utils_test.py
rename to tensorflow/python/training/checkpointable/util_test.py
index 3b8166bf37a6a8d37583c21620fc0aac1e4c35bd..8cdf5d78554b01874115d438e7f0fadaf5b6b91c 100644
--- a/tensorflow/python/training/checkpointable_utils_test.py
+++ b/tensorflow/python/training/checkpointable/util_test.py
@@ -17,10 +17,12 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
+import json
 import os
 
 import six
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
@@ -30,9 +32,9 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras._impl.keras.engine import sequential
-from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -40,10 +42,10 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import adam
-from tensorflow.python.training import checkpointable
-from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 
 class NonLayerCheckpointable(checkpointable.Checkpointable):
@@ -120,7 +122,8 @@ class InterfaceTests(test.TestCase):
       # The .name attribute may be globally influenced, but the checkpoint name
       # won't be (tested below).
       self.assertEqual("duplicate_1:0", duplicate.name)
-    named_variables, _ = checkpointable_utils._serialize_object_graph(obj)
+    named_variables, _, _ = checkpointable_utils._serialize_object_graph(
+        obj, saveables_cache=None)
     expected_checkpoint_names = (
         "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
         "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
@@ -129,7 +132,7 @@ class InterfaceTests(test.TestCase):
         "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE",
     )
     six.assertCountEqual(
-        self, expected_checkpoint_names, named_variables.keys())
+        self, expected_checkpoint_names, [v.name for v in named_variables])
 
   def testInitNotCalled(self):
 
@@ -171,6 +174,27 @@ class InterfaceTests(test.TestCase):
         all_variable_names.append(attribute.full_name)
     self.assertIn("dense/kernel", all_variable_names)
 
+  def testNotCheckpointable(self):
+
+    class CallsFunctionalStuff(
+        checkpointable.NotCheckpointable, checkpointable.Checkpointable):
+      pass
+
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    checkpoint = checkpointable_utils.Checkpoint(x=CallsFunctionalStuff())
+    with self.assertRaises(NotImplementedError):
+      checkpoint.save(prefix)
+
+    class CallsFunctionalStuffOtherMRO(
+        checkpointable.Checkpointable, checkpointable.NotCheckpointable):
+      pass
+
+    checkpoint_reversed = checkpointable_utils.Checkpoint(
+        x=CallsFunctionalStuffOtherMRO())
+    with self.assertRaises(NotImplementedError):
+      checkpoint_reversed.save(prefix)
+
 
 class _MirroringSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
 
@@ -245,8 +269,9 @@ class CheckpointingTests(test.TestCase):
       self.evaluate(checkpointable_utils.gather_initializers(
           root_checkpointable))
       self.evaluate(train_op)
-    named_variables, serialized_graph = (
-        checkpointable_utils._serialize_object_graph(root_checkpointable))
+    named_variables, serialized_graph, _ = (
+        checkpointable_utils._serialize_object_graph(
+            root_checkpointable, saveables_cache=None))
     expected_checkpoint_names = (
         # Created in the root node, so no prefix.
         "optimizer_step",
@@ -269,24 +294,29 @@ class CheckpointingTests(test.TestCase):
     suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
     expected_checkpoint_names = [
         name + suffix for name in expected_checkpoint_names]
+    # The Dense layers also save get_config() JSON
+    expected_checkpoint_names.extend(
+        ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
+         "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"])
+    named_variables = {v.name: v for v in named_variables}
     six.assertCountEqual(self, expected_checkpoint_names,
                          named_variables.keys())
     # Check that we've mapped to the right variable objects (not exhaustive)
     self.assertEqual(
-        "global_step:0",
-        named_variables["optimizer_step" + suffix].name)
+        "global_step",
+        named_variables["optimizer_step" + suffix].full_name)
     self.assertEqual(
-        "my_model/dense_1/kernel:0",
-        named_variables["model/_second/kernel" + suffix].name)
+        "my_model/dense_1/kernel",
+        named_variables["model/_second/kernel" + suffix].full_name)
     self.assertEqual(
-        "my_model/dense/kernel:0",
-        named_variables["model/_named_dense/kernel" + suffix].name)
+        "my_model/dense/kernel",
+        named_variables["model/_named_dense/kernel" + suffix].full_name)
     self.assertEqual(
-        "beta1_power:0",
-        named_variables["optimizer/beta1_power" + suffix].name)
+        "beta1_power",
+        named_variables["optimizer/beta1_power" + suffix].full_name)
     self.assertEqual(
-        "beta2_power:0",
-        named_variables["optimizer/beta2_power" + suffix].name)
+        "beta2_power",
+        named_variables["optimizer/beta2_power" + suffix].full_name)
     # Spot check the generated protocol buffers.
     self.assertEqual("optimizer",
                      serialized_graph.nodes[0].children[1].local_name)
@@ -311,7 +341,7 @@ class CheckpointingTests(test.TestCase):
     self.assertEqual(
         "my_model/dense/kernel/Adam:0",
         optimizer.get_slot(
-            var=named_variables["model/_named_dense/kernel" + suffix],
+            var=model._named_dense.kernel,
             name="m").name)
     self.assertEqual(
         "model/_named_dense/kernel" + suffix,
@@ -563,11 +593,11 @@ class CheckpointingTests(test.TestCase):
     root = checkpointable.Checkpointable()
     checkpointable_utils.add_variable(
         root, name=name, shape=[1, 2], dtype=dtypes.float64)
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    checkpoint_name, = named_variables.keys()
-    with ops.name_scope("root/" + checkpoint_name):
+    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
+        root, saveables_cache=None)
+    with ops.name_scope("root/" + named_variable.name):
       pass  # Make sure we can use this as an op name if we prefix it.
-    return checkpoint_name
+    return named_variable.name
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testVariableNameEscaping(self):
@@ -585,9 +615,9 @@ class CheckpointingTests(test.TestCase):
     leaf = checkpointable.Checkpointable()
     root.leaf = leaf
     checkpointable_utils.add_variable(leaf, name="v", shape=[])
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    variable_name, = named_variables.keys()
-    self.assertEqual(r"leaf/v/.ATTRIBUTES/VARIABLE_VALUE", variable_name)
+    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
+        root, saveables_cache=None)
+    self.assertEqual(r"leaf/v/.ATTRIBUTES/VARIABLE_VALUE", named_variable.name)
 
   @test_util.run_in_graph_and_eager_modes()
   def testLocalNameValidation(self):
@@ -596,9 +626,10 @@ class CheckpointingTests(test.TestCase):
     # Dots are escaped, which avoids conflicts with reserved names.
     root._track_checkpointable(leaf, name=".ATTRIBUTES")
     checkpointable_utils.add_variable(checkpointable=leaf, name="a", shape=[])
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    name, = named_variables.keys()
-    self.assertEqual(name, "..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE")
+    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
+        root, saveables_cache=None)
+    self.assertEqual("..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE",
+                     named_variable.name)
 
   def testAnonymousVarsInInit(self):
 
@@ -1219,14 +1250,20 @@ class TemplateTests(test.TestCase):
 
     def _templated():
       v = variable_scope.get_variable(
-          "v", shape=[1], initializer=init_ops.zeros_initializer())
+          "v", shape=[1], initializer=init_ops.zeros_initializer(),
+          use_resource=True)
       v2 = variable_scope.get_variable(
-          "v2", shape=[1], initializer=init_ops.zeros_initializer())
+          "v2", shape=[1], initializer=init_ops.zeros_initializer(),
+          use_resource=True)
       return v, v + 1., v2
 
     save_template = template.make_template("s1", _templated)
-    save_root = checkpointable_utils.Checkpoint(my_template=save_template)
     v1_save, _, v2_save = save_template()
+    optimizer = adam.AdamOptimizer(0.0)
+    save_root = checkpointable_utils.Checkpoint(
+        my_template=save_template, optimizer=optimizer)
+    optimizer.minimize(v1_save.read_value)
+    self.evaluate([v.initializer for v in optimizer.variables()])
     self.evaluate(v1_save.assign([12.]))
     self.evaluate(v2_save.assign([14.]))
     checkpoint_directory = self.get_temp_dir()
@@ -1234,9 +1271,12 @@ class TemplateTests(test.TestCase):
     save_path = save_root.save(checkpoint_prefix)
 
     load_template = template.make_template("s2", _templated)
-    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
+    load_optimizer = adam.AdamOptimizer(0.0)
+    load_root = checkpointable_utils.Checkpoint(
+        my_template=load_template, optimizer=load_optimizer)
     status = load_root.restore(save_path)
     var, var_plus_one, var2 = load_template()
+    load_optimizer.minimize(var.read_value)
     self.assertEqual(2, len(load_template._checkpoint_dependencies))
     self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
     self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
@@ -1356,12 +1396,22 @@ class CheckpointCompatibilityTests(test.TestCase):
       with self.assertRaises(AssertionError):
         self._check_sentinels(root)
       object_saver = checkpointable_utils.CheckpointableSaver(root)
+      self._set_sentinels(root)
       status = object_saver.restore(save_path)
-      with self.assertRaises(AssertionError):
-        status.assert_consumed()
+      if context.executing_eagerly():
+        self._check_sentinels(root)
+      if context.executing_eagerly():
+        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
+          status.assert_consumed()
+      else:
+        # When graph building, we haven't read any keys, so we don't know
+        # whether the restore will be complete.
+        with self.assertRaisesRegexp(AssertionError, "not restored"):
+          status.assert_consumed()
       status.run_restore_ops()
       self._check_sentinels(root)
       self._set_sentinels(root)
+      status = object_saver.restore(save_path)
       status.initialize_or_restore()
       self._check_sentinels(root)
 
@@ -1395,5 +1445,48 @@ class CheckpointCompatibilityTests(test.TestCase):
         root.restore(save_path).assert_consumed().run_restore_ops()
         self._check_sentinels(root)
 
+
+class PythonMetadataTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSaveLoad(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    dense = core.Dense(1)
+    checkpoint = checkpointable_utils.Checkpoint(dense=dense)
+    dense(constant_op.constant([[1.]]))
+    checkpoint.restore(None).initialize_or_restore()
+    save_path = checkpoint.save(checkpoint_prefix)
+
+    def _get_dense_node_from_object_graph(object_graph_proto):
+      root_node = object_graph_proto.nodes[0]
+      for child in root_node.children:
+        if child.local_name == "dense":
+          break
+      else:
+        raise AssertionError(
+            "Expected a 'dense' dependency of root, didn't find one.")
+      dense_node = object_graph_proto.nodes[child.node_id]  # pylint: disable=undefined-loop-variable
+      self.assertEqual(1, len(dense_node.attributes))
+      reader = pywrap_tensorflow.NewCheckpointReader(save_path)
+      layer_json = reader.get_tensor(dense_node.attributes[0].checkpoint_key)
+      return json.loads(layer_json.decode("utf-8"))
+
+    layer_data = _get_dense_node_from_object_graph(
+        checkpointable_utils.object_metadata(save_path))
+    self.assertEqual("Dense", layer_data["class_name"])
+    self.assertEqual(1, layer_data["config"]["units"])
+
+    # Check that no new ops are added to the graph the second time we save.
+    ops.get_default_graph().finalize()
+
+    dense.units = 42
+    save_path = checkpoint.save(checkpoint_prefix)
+    layer_data = _get_dense_node_from_object_graph(
+        checkpointable_utils.object_metadata(save_path))
+    self.assertEqual("Dense", layer_data["class_name"])
+    self.assertEqual(42, layer_data["config"]["units"])
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index c0d5ea36dd2d4d13ac1c0c90fe3110891ba1f51a..ab8b37bb655bfc3c222ed661b6d48f0ecdc3a858 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -357,14 +357,14 @@ class DistributionStrategy(object):
     on different slices of the input data. This is in contrast to
     _model parallelism_ where we divide up a single copy of a model
     across multiple devices.
-    Note: for now we only support data parallelism at this time, but
+    Note: we only support data parallelism for now, but
     hope to add support for model parallelism in the future.
   * A _tower_ is one copy of the model, running on one slice of the
     input data.
-  * _Synchronous_, or more commonly _sync_, training is when the
+  * _Synchronous_, or more commonly _sync_, training is where the
     updates from each tower are aggregated together before updating
     the model variables. This is in contrast to _asynchronous_, or
-    _async_ training where each tower updates the model variables
+    _async_ training, where each tower updates the model variables
     independently.
   * Furthermore you might run your computation on multiple devices
     on one machine (or "host"), or on multiple machines/hosts.
@@ -386,11 +386,11 @@ class DistributionStrategy(object):
   * Reductions and Allreduce: A _reduction_ is some method of
     aggregating multiple values into one value, like "sum" or
     "mean". If doing sync training, we will perform a reduction on the
-    gradients to a parameter from each tower before applying the
+    gradients to a parameter from all towers before applying the
     update. Allreduce is an algorithm for performing a reduction on
     values from multiple devices and making the result available on
     all of those devices.
-  * In the future we will have support for TensorFlows' partitioned
+  * In the future we will have support for TensorFlow's partitioned
     variables, where a single variable is split across multiple
     devices.
 
@@ -419,9 +419,9 @@ class DistributionStrategy(object):
     `tower_fn` can use the `get_tower_context()` API to get enhanced
     behavior in this case.
 
-    You can also create an initializable iterator instead of one shot iterator.
-    In that case, you will need to ensure that you initialize the iterator
-    before calling get_next.
+    You can also create an initializable iterator instead of a one-shot
+    iterator. In that case, you will need to ensure that you initialize the
+    iterator before calling get_next.
     ```
     iterator = my_distribution.distribute_dataset(
         dataset).make_initializable_iterator())
@@ -816,6 +816,7 @@ class DistributionStrategy(object):
     # TODO(josh11b): Return an unwrapped value if colocate_with is a
     # single device.
     _require_cross_tower_context(self)
+    assert method_string in ("sum", "mean")
     return self._reduce(method_string, value, destinations)
 
   def _reduce(self, method_string, value, destinations):
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index f584a009d946a193f1ab76b3030db4f8a4954d27..fece3370f343173de46bc447c478264864708dca 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -25,7 +25,6 @@ import sys
 import six
 
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.estimator import util
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -41,6 +40,7 @@ from tensorflow.python.training import queue_runner
 from tensorflow.python.training import saver as training_saver
 from tensorflow.python.training import session_manager as sm
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.util import function_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -620,7 +620,7 @@ class _MonitoredSession(object):
         `step_context`. It may also optionally have `self` for cases when it
         belongs to an object.
     """
-    step_fn_arguments = util.fn_args(step_fn)
+    step_fn_arguments = function_utils.fn_args(step_fn)
     if step_fn_arguments != ('step_context',) and step_fn_arguments != (
         'self',
         'step_context',
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 66914bacf35c75639f4636713cca2dba4ee19e3f..a9287a0f0d0391cc6e0b297cce18eebaf9f64291 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -34,9 +34,9 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import checkpointable
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import slot_creator
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -1175,7 +1175,16 @@ class Optimizer(
     variable_key = _var_key(variable)
     slot_variable = named_slots.get(variable_key, None)
     if (slot_variable is None and context.executing_eagerly() and
-        slot_variable_position.is_simple_variable()):
+        slot_variable_position.is_simple_variable()
+        # Defer slot variable creation if there is an active variable creator
+        # scope. Generally we'd like to eagerly create/restore slot variables
+        # when possible, but this may mean that scopes intended to catch
+        # `variable` also catch its eagerly created slot variable
+        # unintentionally (specifically make_template would add a dependency on
+        # a slot variable if not for this case). Deferring is mostly harmless
+        # (aside from double initialization), and makes variable creator scopes
+        # behave the same way they do when graph building.
+        and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
       initializer = checkpointable.CheckpointInitialValue(
           checkpoint_position=slot_variable_position)
       slot_variable = self._get_or_make_slot(
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 8134fd74aa7f4a1219f131fdd61bf95b0c392ac8..4d464135fd03330134c0a371853d6bc8a228cd21 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -53,10 +53,10 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import checkpointable
 from tensorflow.python.training import saveable_object
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -569,6 +569,76 @@ class BaseSaverBuilder(object):
       # pylint: enable=protected-access
     return names_to_saveables
 
+  @staticmethod
+  def SaveableObjectsForOp(op, name):
+    """Create `SaveableObject`s from an operation.
+
+    Args:
+      op: A variable, operation, or SaveableObject to coerce into a
+        SaveableObject.
+      name: A string name for the SaveableObject.
+
+    Yields:
+      `SaveableObject`s which together save/restore `op`.
+
+    Raises:
+      TypeError: If `name` is not a string.
+      ValueError: For operations with no known conversion to SaveableObject.
+    """
+    if not isinstance(name, six.string_types):
+      raise TypeError(
+          "names_to_saveables must be a dict mapping string names to "
+          "checkpointable operations. Name is not a string: %s" % name)
+    if isinstance(op, BaseSaverBuilder.SaveableObject):
+      yield op
+    elif isinstance(op, (list, tuple, variables.PartitionedVariable)):
+      if isinstance(op, variables.PartitionedVariable):
+        op = list(op)
+      # A set of slices.
+      slice_name = None
+      # pylint: disable=protected-access
+      for variable in op:
+        if not isinstance(variable, variables.Variable):
+          raise ValueError("Slices must all be Variables: %s" % variable)
+        if not variable._save_slice_info:
+          raise ValueError("Slices must all be slices: %s" % variable)
+        if slice_name is None:
+          slice_name = variable._save_slice_info.full_name
+        elif slice_name != variable._save_slice_info.full_name:
+          raise ValueError(
+              "Slices must all be from the same tensor: %s != %s" %
+              (slice_name, variable._save_slice_info.full_name))
+        if variable.op.type in ["Variable", "VariableV2",
+                                "AutoReloadVariable"]:
+          yield BaseSaverBuilder.VariableSaveable(
+              variable, variable._save_slice_info.spec, name)
+        else:
+          yield BaseSaverBuilder.ResourceVariableSaveable(
+              variable, variable._save_slice_info.spec, name)
+      # pylint: enable=protected-access
+    else:
+      # A variable or tensor.
+      if context.executing_eagerly():
+        if not isinstance(op, resource_variable_ops.ResourceVariable):
+          raise ValueError("Can only save/restore ResourceVariable eager "
+                           "mode is enabled, type: %s." % type(op))
+        yield BaseSaverBuilder.ResourceVariableSaveable(op, "", name)
+      else:
+        if isinstance(op, resource_variable_ops.ResourceVariable):
+          variable = op._graph_element  # pylint: disable=protected-access
+        else:
+          variable = ops.internal_convert_to_tensor(op, as_ref=True)
+        if not BaseSaverBuilder._IsVariable(variable):
+          raise TypeError("names_to_saveables must be a dict mapping string "
+                          "names to Tensors/Variables. Not a variable: %s" %
+                          variable)
+        if variable.op.type in ["Variable", "VariableV2",
+                                "AutoReloadVariable"]:
+          yield BaseSaverBuilder.VariableSaveable(variable, "", name)
+        else:
+          yield BaseSaverBuilder.ResourceVariableSaveable(
+              variable, "", name)
+
   def _ValidateAndSliceInputs(self, names_to_saveables):
     """Returns the variables and names that will be used for a Saver.
 
@@ -590,63 +660,11 @@ class BaseSaverBuilder(object):
 
     saveables = []
     seen_ops = set()
-    for name in sorted(names_to_saveables.keys()):
-      if not isinstance(name, six.string_types):
-        raise TypeError(
-            "names_to_saveables must be a dict mapping string names to "
-            "checkpointable operations. Name is not a string: %s" % name)
-      op = names_to_saveables[name]
-      if isinstance(op, BaseSaverBuilder.SaveableObject):
-        self._AddSaveable(saveables, seen_ops, op)
-      elif isinstance(op, (list, tuple, variables.PartitionedVariable)):
-        if isinstance(op, variables.PartitionedVariable):
-          op = list(op)
-        # A set of slices.
-        slice_name = None
-        # pylint: disable=protected-access
-        for variable in op:
-          if not isinstance(variable, variables.Variable):
-            raise ValueError("Slices must all be Variables: %s" % variable)
-          if not variable._save_slice_info:
-            raise ValueError("Slices must all be slices: %s" % variable)
-          if slice_name is None:
-            slice_name = variable._save_slice_info.full_name
-          elif slice_name != variable._save_slice_info.full_name:
-            raise ValueError(
-                "Slices must all be from the same tensor: %s != %s" %
-                (slice_name, variable._save_slice_info.full_name))
-          if variable.op.type in ["Variable", "VariableV2",
-                                  "AutoReloadVariable"]:
-            saveable = BaseSaverBuilder.VariableSaveable(
-                variable, variable._save_slice_info.spec, name)
-          else:
-            saveable = BaseSaverBuilder.ResourceVariableSaveable(
-                variable, variable._save_slice_info.spec, name)
-          self._AddSaveable(saveables, seen_ops, saveable)
-        # pylint: enable=protected-access
-      else:
-        # A variable or tensor.
-        if context.executing_eagerly():
-          if not isinstance(op, resource_variable_ops.ResourceVariable):
-            raise ValueError("Can only save/restore ResourceVariable eager "
-                             "mode is enabled, type: %s." % type(op))
-          saveable = BaseSaverBuilder.ResourceVariableSaveable(op, "", name)
-        else:
-          if isinstance(op, resource_variable_ops.ResourceVariable):
-            variable = op._graph_element  # pylint: disable=protected-access
-          else:
-            variable = ops.internal_convert_to_tensor(op, as_ref=True)
-          if not BaseSaverBuilder._IsVariable(variable):
-            raise TypeError("names_to_saveables must be a dict mapping string "
-                            "names to Tensors/Variables. Not a variable: %s" %
-                            variable)
-          if variable.op.type in ["Variable", "VariableV2",
-                                  "AutoReloadVariable"]:
-            saveable = BaseSaverBuilder.VariableSaveable(variable, "", name)
-          else:
-            saveable = BaseSaverBuilder.ResourceVariableSaveable(
-                variable, "", name)
-        self._AddSaveable(saveables, seen_ops, saveable)
+    for name, op in sorted(names_to_saveables.items(),
+                           # Avoid comparing ops, sort only by name.
+                           key=lambda x: x[0]):
+      for converted_saveable_object in self.SaveableObjectsForOp(op, name):
+        self._AddSaveable(saveables, seen_ops, converted_saveable_object)
     return saveables
 
   def _AddSaveable(self, saveables, seen_ops, saveable):
@@ -1736,13 +1754,17 @@ class Saver(object):
       exception_type, exception_value, exception_traceback = sys.exc_info()
       # The checkpoint would not be loaded successfully as is. Try to parse it
       # as an object-based checkpoint.
+      should_reraise = False
       try:
         reader = pywrap_tensorflow.NewCheckpointReader(save_path)
         object_graph_string = reader.get_tensor(
             checkpointable.OBJECT_GRAPH_PROTO_KEY)
       except errors.NotFoundError:
         # This is not an object-based checkpoint, or the checkpoint doesn't
-        # exist. Re-raise the original exception.
+        # exist. Re-raise the original exception, but do it outside the except
+        # block so the object graph lookup isn't included in the stack trace.
+        should_reraise = True
+      if should_reraise:
         six.reraise(exception_type, exception_value, exception_traceback)
       del exception_traceback  # avoid reference cycles
 
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 70495291bc57629252423d86e35987c5c0d2b1ee..f1991093e0b519da7448809e759a1cd5c57b80d9 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -24,8 +24,10 @@ import math
 import os
 import random
 import shutil
+import sys
 import tempfile
 import time
+import traceback
 
 import numpy as np
 import six
@@ -51,8 +53,8 @@ from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops as ops_lib
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -71,18 +73,17 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary
 from tensorflow.python.training import adam
-from tensorflow.python.training import checkpointable
-from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import saver as saver_module
 from tensorflow.python.training import saver_test_utils
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
 from tensorflow.python.util import compat
 
 
-@test_util.with_c_api
 class SaverTest(test.TestCase):
 
   def basicSaveRestore(self, variable_op):
@@ -769,7 +770,6 @@ class SaverTest(test.TestCase):
       save.save(sess, save_path)
 
 
-@test_util.with_c_api
 class SaveRestoreShardedTest(test.TestCase):
 
   _WRITE_VERSION = saver_pb2.SaverDef.V1
@@ -1044,12 +1044,10 @@ class SaveRestoreShardedTest(test.TestCase):
     self._testPartitionedVariables(use_resource=True)
 
 
-@test_util.with_c_api
 class SaveRestoreShardedTestV2(SaveRestoreShardedTest):
   _WRITE_VERSION = saver_pb2.SaverDef.V2
 
 
-@test_util.with_c_api
 class MaxToKeepTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -1390,7 +1388,6 @@ class MaxToKeepTest(test.TestCase):
       self.assertFalse(gfile.Exists(save._MetaGraphFilename(s1)))
 
 
-@test_util.with_c_api
 class KeepCheckpointEveryNHoursTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -1450,7 +1447,6 @@ class KeepCheckpointEveryNHoursTest(test.TestCase):
       self.assertTrue(saver_module.checkpoint_exists(s4))
 
 
-@test_util.with_c_api
 class SaveRestoreWithVariableNameMap(test.TestCase):
 
   def _testNonReshape(self, variable_op):
@@ -1527,7 +1523,6 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
     self._testNonReshape(variables.Variable)
 
 
-@test_util.with_c_api
 class LatestCheckpointWithRelativePaths(test.TestCase):
 
   @staticmethod
@@ -1629,7 +1624,6 @@ class LatestCheckpointWithRelativePaths(test.TestCase):
           self.assertEqual(v0.eval(), 2.0)
 
 
-@test_util.with_c_api
 class CheckpointStateTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -1744,7 +1738,6 @@ class CheckpointStateTest(test.TestCase):
                      os.path.join(save_dir, "./model.ckpt-687529"))
 
 
-@test_util.with_c_api
 class MetaGraphTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -2464,7 +2457,6 @@ class MetaGraphTest(test.TestCase):
           sess.run("new_model/output:0")
 
 
-@test_util.with_c_api
 class CheckpointReaderTest(test.TestCase):
 
   _WRITE_VERSION = saver_pb2.SaverDef.V1
@@ -2517,12 +2509,10 @@ class CheckpointReaderTest(test.TestCase):
       pywrap_tensorflow.NewCheckpointReader("non-existent")
 
 
-@test_util.with_c_api
 class CheckpointReaderForV2Test(CheckpointReaderTest):
   _WRITE_VERSION = saver_pb2.SaverDef.V2
 
 
-@test_util.with_c_api
 class WriteGraphTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -2550,7 +2540,6 @@ class WriteGraphTest(test.TestCase):
     self.assertTrue(os.path.exists(path))
 
 
-@test_util.with_c_api
 class SaverUtilsTest(test.TestCase):
 
   def setUp(self):
@@ -2593,7 +2582,6 @@ class SaverUtilsTest(test.TestCase):
     self.assertTrue(mtimes[1] >= mtimes[0])
 
 
-@test_util.with_c_api
 class ScopedGraphTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -2976,7 +2964,6 @@ class MyModel(training.Model):
     return ret
 
 
-@test_util.with_c_api
 class CheckpointableCompatibilityTests(test.TestCase):
 
   # TODO(allenl): Track down python3 reference cycles in these tests.
@@ -3108,6 +3095,16 @@ class CheckpointableCompatibilityTests(test.TestCase):
           errors.NotFoundError,
           "Failed to find any matching files for path_which_does_not_exist"):
         saver.restore(sess=sess, save_path="path_which_does_not_exist")
+      try:
+        saver.restore(sess=sess, save_path="path_which_does_not_exist")
+      except errors.NotFoundError:
+        # Make sure we don't have a confusing "During handling of the above
+        # exception" block in Python 3.
+        # pylint: disable=no-value-for-parameter
+        exception_string = "\n".join(
+            traceback.format_exception(*sys.exc_info()))
+        # pylint: enable=no-value-for-parameter
+        self.assertNotIn("NewCheckpointReader", exception_string)
 
   def testLoadFromObjectBasedGraph(self):
     checkpoint_directory = self.get_temp_dir()
diff --git a/tensorflow/python/training/slot_creator_test.py b/tensorflow/python/training/slot_creator_test.py
index b0f48e4ecd4d41946a8a5ed5a0c507a2344a943a..08a3c8dc53a5e88559ddeaf1f95d441fa5adfd29 100644
--- a/tensorflow/python/training/slot_creator_test.py
+++ b/tensorflow/python/training/slot_creator_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
@@ -30,7 +29,6 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import slot_creator
 
 
-@test_util.with_c_api
 class SlotCreatorTest(test.TestCase):
 
   def testCreateSlotFromVariable(self):
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index 427e25d0f63a80e559822d77adde350de2b9f09b..3f2dc6797623b4973543b674c3069a3110c59465 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -67,7 +67,7 @@ from tensorflow.python.training.basic_session_run_hooks import FinalOpsHook
 from tensorflow.python.training.basic_session_run_hooks import FeedFnHook
 from tensorflow.python.training.basic_session_run_hooks import ProfilerHook
 from tensorflow.python.training.basic_loops import basic_train_loop
-from tensorflow.python.training.checkpointable_utils import Checkpoint
+from tensorflow.python.training.checkpointable.util import Checkpoint
 from tensorflow.python.training.checkpoint_utils import init_from_checkpoint
 from tensorflow.python.training.checkpoint_utils import list_variables
 from tensorflow.python.training.checkpoint_utils import load_checkpoint
diff --git a/tensorflow/python/training/warm_starting_util.py b/tensorflow/python/training/warm_starting_util.py
index 4d4fb394c1272d2bf510bb594d70b9aa2edb3df2..b0f37f8cb9e4d51e73335cab153d159484803158 100644
--- a/tensorflow/python/training/warm_starting_util.py
+++ b/tensorflow/python/training/warm_starting_util.py
@@ -33,7 +33,7 @@ from tensorflow.python.training import saver
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.VocabInfo", "estimator.VocabInfo")
+@tf_export("train.VocabInfo", allow_multiple_exports=True)
 class VocabInfo(
     collections.namedtuple("VocabInfo", [
         "new_vocab",
diff --git a/tensorflow/python/util/function_utils.py b/tensorflow/python/util/function_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bbbde3cd288a7373c1ac845977a4d92d2a1b7c0
--- /dev/null
+++ b/tensorflow/python/util/function_utils.py
@@ -0,0 +1,57 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility to retrieve function args."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+
+def _is_bounded_method(fn):
+  _, fn = tf_decorator.unwrap(fn)
+  return tf_inspect.ismethod(fn) and (fn.__self__ is not None)
+
+
+def _is_callable_object(obj):
+  return hasattr(obj, '__call__') and tf_inspect.ismethod(obj.__call__)
+
+
+def fn_args(fn):
+  """Get argument names for function-like object.
+
+  Args:
+    fn: Function, or function-like object (e.g., result of `functools.partial`).
+
+  Returns:
+    `tuple` of string argument names.
+
+  Raises:
+    ValueError: if partial function has positionally bound arguments
+  """
+  if isinstance(fn, functools.partial):
+    args = fn_args(fn.func)
+    args = [a for a in args[len(fn.args):] if a not in (fn.keywords or [])]
+  else:
+    if _is_callable_object(fn):
+      fn = fn.__call__
+    args = tf_inspect.getfullargspec(fn).args
+    if _is_bounded_method(fn):
+      args.remove('self')
+  return tuple(args)
diff --git a/tensorflow/python/estimator/util_test.py b/tensorflow/python/util/function_utils_test.py
similarity index 85%
rename from tensorflow/python/estimator/util_test.py
rename to tensorflow/python/util/function_utils_test.py
index 4b2c8d7637e2e67369e8e2a679c328c10791a52e..e78cf6a5b02af317b08ff3a833f7b73b062f106e 100644
--- a/tensorflow/python/estimator/util_test.py
+++ b/tensorflow/python/util/function_utils_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import functools
 
-from tensorflow.python.estimator import util
 from tensorflow.python.platform import test
+from tensorflow.python.util import function_utils
 
 
 class FnArgsTest(test.TestCase):
@@ -29,7 +29,7 @@ class FnArgsTest(test.TestCase):
   def test_simple_function(self):
     def fn(a, b):
       return a + b
-    self.assertEqual(('a', 'b'), util.fn_args(fn))
+    self.assertEqual(('a', 'b'), function_utils.fn_args(fn))
 
   def test_callable(self):
 
@@ -38,7 +38,7 @@ class FnArgsTest(test.TestCase):
       def __call__(self, a, b):
         return a + b
 
-    self.assertEqual(('a', 'b'), util.fn_args(Foo()))
+    self.assertEqual(('a', 'b'), function_utils.fn_args(Foo()))
 
   def test_bounded_method(self):
 
@@ -47,7 +47,7 @@ class FnArgsTest(test.TestCase):
       def bar(self, a, b):
         return a + b
 
-    self.assertEqual(('a', 'b'), util.fn_args(Foo().bar))
+    self.assertEqual(('a', 'b'), function_utils.fn_args(Foo().bar))
 
   def test_partial_function(self):
     expected_test_arg = 123
@@ -59,7 +59,7 @@ class FnArgsTest(test.TestCase):
 
     wrapped_fn = functools.partial(fn, test_arg=123)
 
-    self.assertEqual(('a',), util.fn_args(wrapped_fn))
+    self.assertEqual(('a',), function_utils.fn_args(wrapped_fn))
 
   def test_partial_function_with_positional_args(self):
     expected_test_arg = 123
@@ -71,7 +71,7 @@ class FnArgsTest(test.TestCase):
 
     wrapped_fn = functools.partial(fn, 123)
 
-    self.assertEqual(('a',), util.fn_args(wrapped_fn))
+    self.assertEqual(('a',), function_utils.fn_args(wrapped_fn))
 
     self.assertEqual(3, wrapped_fn(3))
     self.assertEqual(3, wrapped_fn(a=3))
@@ -88,7 +88,7 @@ class FnArgsTest(test.TestCase):
     wrapped_fn = functools.partial(fn, test_arg2=456)
     double_wrapped_fn = functools.partial(wrapped_fn, test_arg1=123)
 
-    self.assertEqual(('a',), util.fn_args(double_wrapped_fn))
+    self.assertEqual(('a',), function_utils.fn_args(double_wrapped_fn))
 
   def test_double_partial_with_positional_args_in_outer_layer(self):
     expected_test_arg1 = 123
@@ -102,7 +102,7 @@ class FnArgsTest(test.TestCase):
     wrapped_fn = functools.partial(fn, test_arg2=456)
     double_wrapped_fn = functools.partial(wrapped_fn, 123)
 
-    self.assertEqual(('a',), util.fn_args(double_wrapped_fn))
+    self.assertEqual(('a',), function_utils.fn_args(double_wrapped_fn))
 
     self.assertEqual(3, double_wrapped_fn(3))
     self.assertEqual(3, double_wrapped_fn(a=3))
@@ -119,7 +119,7 @@ class FnArgsTest(test.TestCase):
     wrapped_fn = functools.partial(fn, 123)  # binds to test_arg1
     double_wrapped_fn = functools.partial(wrapped_fn, 456)  # binds to test_arg2
 
-    self.assertEqual(('a',), util.fn_args(double_wrapped_fn))
+    self.assertEqual(('a',), function_utils.fn_args(double_wrapped_fn))
 
     self.assertEqual(3, double_wrapped_fn(3))
     self.assertEqual(3, double_wrapped_fn(a=3))
diff --git a/tensorflow/python/util/serialization.py b/tensorflow/python/util/serialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..faf5164faa7f1c2d085e13358c8266c4eef22524
--- /dev/null
+++ b/tensorflow/python/util/serialization.py
@@ -0,0 +1,64 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for serializing Python objects."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import tensor_shape
+
+
+def get_json_type(obj):
+  """Serializes any object to a JSON-serializable structure.
+
+  Arguments:
+      obj: the object to serialize
+
+  Returns:
+      JSON-serializable structure representing `obj`.
+
+  Raises:
+      TypeError: if `obj` cannot be serialized.
+  """
+  # if obj is a serializable Keras class instance
+  # e.g. optimizer, layer
+  if hasattr(obj, 'get_config'):
+    return {'class_name': obj.__class__.__name__, 'config': obj.get_config()}
+
+  # if obj is any numpy type
+  if type(obj).__module__ == np.__name__:
+    if isinstance(obj, np.ndarray):
+      return {'type': type(obj), 'value': obj.tolist()}
+    else:
+      return obj.item()
+
+  # misc functions (e.g. loss function)
+  if callable(obj):
+    return obj.__name__
+
+  # if obj is a python 'type'
+  if type(obj).__name__ == type.__name__:
+    return obj.__name__
+
+  if isinstance(obj, tensor_shape.Dimension):
+    return obj.value
+
+  if isinstance(obj, tensor_shape.TensorShape):
+    return obj.as_list()
+
+  raise TypeError('Not JSON Serializable:', obj)
diff --git a/tensorflow/python/util/serialization_test.py b/tensorflow/python/util/serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5000bcfad05900e63bc72c1bd0e31e30434b74ae
--- /dev/null
+++ b/tensorflow/python/util/serialization_test.py
@@ -0,0 +1,76 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for serialization functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import input_layer
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
+from tensorflow.python.platform import test
+from tensorflow.python.util import serialization
+
+
+class SerializationTests(test.TestCase):
+
+  def test_serialize_dense(self):
+    dense = core.Dense(3)
+    dense(constant_op.constant([[4.]]))
+    round_trip = json.loads(json.dumps(
+        dense, default=serialization.get_json_type))
+    self.assertEqual(3, round_trip["config"]["units"])
+
+  def test_serialize_shape(self):
+    round_trip = json.loads(json.dumps(
+        tensor_shape.TensorShape([None, 2, 3]),
+        default=serialization.get_json_type))
+    self.assertIs(round_trip[0], None)
+    self.assertEqual(round_trip[1], 2)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_serialize_sequential(self):
+    model = sequential.Sequential()
+    model.add(core.Dense(4))
+    model.add(core.Dense(5))
+    model(constant_op.constant([[1.]]))
+    sequential_round_trip = json.loads(
+        json.dumps(model, default=serialization.get_json_type))
+    self.assertEqual(5, sequential_round_trip["config"][1]["config"]["units"])
+    input_round_trip = json.loads(
+        json.dumps(model._input_layers, default=serialization.get_json_type))
+    self.assertAllEqual([1, 1],
+                        input_round_trip[0]["config"]["batch_input_shape"])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_serialize_model(self):
+    x = input_layer.Input(shape=[3])
+    y = core.Dense(10)(x)
+    model = training.Model(x, y)
+    model(constant_op.constant([[1., 1., 1.]]))
+    model_round_trip = json.loads(
+        json.dumps(model, default=serialization.get_json_type))
+    self.assertEqual(
+        10, model_round_trip["config"]["layers"][1]["config"]["units"])
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index a30b8b1336358372bed3b6fa0b853e79c1535036..bf3961c6920c4c6ade0593b28f9eb1fd23ea8e0d 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -59,13 +59,19 @@ class tf_export(object):  # pylint: disable=invalid-name
 
     Args:
       *args: API names in dot delimited format.
-      **kwargs: Optional keyed arguments. Currently only supports 'overrides'
-        argument. overrides: List of symbols that this is overriding
-        (those overrided api exports will be removed). Note: passing overrides
-        has no effect on exporting a constant.
+      **kwargs: Optional keyed arguments.
+          overrides: List of symbols that this is overriding
+          (those overrided api exports will be removed). Note: passing overrides
+          has no effect on exporting a constant.
+          allow_multiple_exports: Allows exporting the same symbol multiple
+          times with multiple `tf_export` usages. Prefer however, to list all
+          of the exported names in a single `tf_export` usage when possible.
+
     """
     self._names = args
     self._overrides = kwargs.get('overrides', [])
+    self._allow_multiple_exports = kwargs.get(
+        'allow_multiple_exports', False)
 
   def __call__(self, func):
     """Calls this decorator.
@@ -77,7 +83,8 @@ class tf_export(object):  # pylint: disable=invalid-name
       The input function with _tf_api_names attribute set.
 
     Raises:
-      SymbolAlreadyExposedError: Raised when a symbol already has API names.
+      SymbolAlreadyExposedError: Raised when a symbol already has API names
+        and kwarg `allow_multiple_exports` not set.
     """
     # Undecorate overridden names
     for f in self._overrides:
@@ -90,16 +97,14 @@ class tf_export(object):  # pylint: disable=invalid-name
     # __dict__ instead of using hasattr to verify that subclasses have
     # their own _tf_api_names as opposed to just inheriting it.
     if '_tf_api_names' in undecorated_func.__dict__:
-      # pylint: disable=protected-access
-      raise SymbolAlreadyExposedError(
-          'Symbol %s is already exposed as %s.' %
-          (undecorated_func.__name__, undecorated_func._tf_api_names))
-      # pylint: enable=protected-access
-
-    # Complete the export by creating/overriding attribute
-    # pylint: disable=protected-access
-    undecorated_func._tf_api_names = self._names
-    # pylint: enable=protected-access
+      if self._allow_multiple_exports:
+        undecorated_func._tf_api_names += self._names  # pylint: disable=protected-access
+      else:
+        raise SymbolAlreadyExposedError(
+            'Symbol %s is already exposed as %s.' %
+            (undecorated_func.__name__, undecorated_func._tf_api_names))  # pylint: disable=protected-access
+    else:
+      undecorated_func._tf_api_names = self._names  # pylint: disable=protected-access
     return func
 
   def export_constant(self, module_name, name):
diff --git a/tensorflow/python/util/tf_export_test.py b/tensorflow/python/util/tf_export_test.py
index b9e26ecb33383f5aa936a6bc92acea6d91eb996e..ace3f054ba952f012aa5ca642e490b1f45f8ba1d 100644
--- a/tensorflow/python/util/tf_export_test.py
+++ b/tensorflow/python/util/tf_export_test.py
@@ -128,6 +128,13 @@ class ValidateExportTest(test.TestCase):
     with self.assertRaises(tf_export.SymbolAlreadyExposedError):
       export_decorator(_test_function)
 
+  def testEAllowMultipleExports(self):
+    _test_function._tf_api_names = ['name1', 'name2']
+    tf_export.tf_export('nameRed', 'nameBlue', allow_multiple_exports=True)(
+        _test_function)
+    self.assertEquals(['name1', 'name2', 'nameRed', 'nameBlue'],
+                      _test_function._tf_api_names)
+
   def testOverridesFunction(self):
     _test_function2._tf_api_names = ['abc']
 
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 663036de8a01c3937ae93f8d2bfa27f7add48e39..7872f521720de0db34377d6fe9a67137fb1af306 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -18,8 +18,11 @@ from __future__ import division
 from __future__ import print_function
 
 from collections import namedtuple
+import functools
 import inspect as _inspect
 
+import six
+
 from tensorflow.python.util import tf_decorator
 
 ArgSpec = _inspect.ArgSpec
@@ -43,16 +46,95 @@ def getargspec(object):  # pylint: disable=redefined-builtin
   """TFDecorator-aware replacement for inspect.getargspec.
 
   Args:
-    object: A callable, possibly decorated.
+    object: A callable (function or partial function), possibly decorated.
 
   Returns:
     The `ArgSpec` that describes the signature of the outermost decorator that
     changes the callable's signature. If the callable is not decorated,
     `inspect.getargspec()` will be called directly on the callable.
+
+  Raises:
+    ValueError: When callable's function signature can not be expressed with
+    ArgSpec.
   """
-  decorators, target = tf_decorator.unwrap(object)
-  return next((d.decorator_argspec for d in decorators
-               if d.decorator_argspec is not None), _inspect.getargspec(target))
+
+  def get_argspec_with_decorator(obj):
+    decorators, target = tf_decorator.unwrap(obj)
+    return next((d.decorator_argspec
+                 for d in decorators
+                 if d.decorator_argspec is not None),
+                _inspect.getargspec(target))
+
+  if not isinstance(object, functools.partial):
+    return get_argspec_with_decorator(object)
+
+  # When callable is a functools.partial object, we construct its ArgSpec with
+  # following strategy:
+  # - If callable partial contains default value for positional arguments (ie.
+  # object.args), then final ArgSpec doesn't contain those positional arguments.
+  # - If callable partial contains default value for keyword arguments (ie.
+  # object.keywords), then we merge them with wrapped target. Default values
+  # from callable partial takes precedence over those from wrapped target.
+  #
+  # However, there is a case where it is impossible to construct a valid
+  # ArgSpec. Python requires arguments that have no default values must be
+  # defined before those with default values. ArgSpec structure is only valid
+  # when this presumption holds true because default values are expressed as a
+  # tuple of values without keywords and they are always assumed to belong to
+  # last K arguments where K is number of default values present.
+  #
+  # Since functools.partial can give default value to any argument, this
+  # presumption may no longer hold in some cases. For example:
+  #
+  # def func(m, n):
+  #   return 2 * m + n
+  # partialed = functools.partial(func, m=1)
+  #
+  # This example will result in m having a default value but n doesn't. This is
+  # usually not allowed in Python and can not be expressed in ArgSpec correctly.
+  #
+  # Thus, we must detect cases like this by finding first argument with default
+  # value and ensures all following arguments also have default values. When
+  # this is not true, a ValueError is raised.
+
+  n_prune_args = len(object.args)
+  partial_keywords = object.keywords or {}
+
+  args, varargs, keywords, defaults = get_argspec_with_decorator(object.func)
+
+  # Pruning first n_prune_args arguments.
+  args = args[n_prune_args:]
+
+  # Partial function may give default value to any argument, therefore length
+  # of default value list must be len(args) to allow each argument to
+  # potentially be given a default value.
+  all_defaults = [None] * len(args)
+  if defaults:
+    all_defaults[-len(defaults):] = defaults
+
+  # Fill in default values provided by partial function in all_defaults.
+  for kw, default in six.iteritems(partial_keywords):
+    idx = args.index(kw)
+    all_defaults[idx] = default
+
+  # Find first argument with default value set.
+  first_default = next((idx for idx, x in enumerate(all_defaults) if x), None)
+
+  # If no default values are found, return ArgSpec with defaults=None.
+  if first_default is None:
+    return ArgSpec(args, varargs, keywords, None)
+
+  # Checks if all arguments have default value set after first one.
+  invalid_default_values = [
+      args[i] for i, j in enumerate(all_defaults) if not j and i > first_default
+  ]
+
+  if invalid_default_values:
+    raise ValueError('Some arguments %s do not have default value, but they '
+                     'are positioned after those with default values. This can '
+                     'not be expressed with ArgSpec.' % invalid_default_values)
+
+  return ArgSpec(args, varargs, keywords, tuple(all_defaults[first_default:]))
 
 
 def getfullargspec(obj):  # pylint: disable=redefined-builtin
@@ -116,7 +198,7 @@ def getcallargs(func, *positional, **named):
   it. If no attached decorators modify argspec, the final unwrapped target's
   argspec will be used.
   """
-  argspec = getargspec(func)
+  argspec = getfullargspec(func)
   call_args = named.copy()
   this = getattr(func, 'im_self', None) or getattr(func, '__self__', None)
   if ismethod(func) and this:
diff --git a/tensorflow/python/util/tf_inspect_test.py b/tensorflow/python/util/tf_inspect_test.py
index 129408449ebb45ac3a322f163a13b705cbb31f0c..325131c4f4759ea06d1266921880d3e3c45e5ecb 100644
--- a/tensorflow/python/util/tf_inspect_test.py
+++ b/tensorflow/python/util/tf_inspect_test.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import inspect
 
 from tensorflow.python.platform import test
@@ -109,6 +110,141 @@ class TfInspectTest(test.TestCase):
                                                outer_argspec)
     self.assertEqual(outer_argspec, tf_inspect.getargspec(outer_decorator))
 
+  def testGetArgSpecOnPartialPositionalArgumentOnly(self):
+    """Tests getargspec on partial function with only positional arguments."""
+
+    def func(m, n):
+      return 2 * m + n
+
+    partial_func = functools.partial(func, 7)
+    argspec = tf_inspect.ArgSpec(
+        args=['n'], varargs=None, keywords=None, defaults=None)
+
+    self.assertEqual(argspec, tf_inspect.getargspec(partial_func))
+
+  def testGetArgSpecOnPartialInvalidArgspec(self):
+    """Tests getargspec on partial function that doesn't have valid argspec."""
+
+    def func(m, n, l, k=4):
+      return 2 * m + l + n * k
+
+    partial_func = functools.partial(func, n=7)
+
+    exception_message = (r"Some arguments \['l'\] do not have default value, "
+                         "but they are positioned after those with default "
+                         "values. This can not be expressed with ArgSpec.")
+    with self.assertRaisesRegexp(ValueError, exception_message):
+      tf_inspect.getargspec(partial_func)
+
+  def testGetArgSpecOnPartialValidArgspec(self):
+    """Tests getargspec on partial function with valid argspec."""
+
+    def func(m, n, l, k=4):
+      return 2 * m + l + n * k
+
+    partial_func = functools.partial(func, n=7, l=2)
+    argspec = tf_inspect.ArgSpec(
+        args=['m', 'n', 'l', 'k'],
+        varargs=None,
+        keywords=None,
+        defaults=(7, 2, 4))
+
+    self.assertEqual(argspec, tf_inspect.getargspec(partial_func))
+
+  def testGetArgSpecOnPartialNoArgumentsLeft(self):
+    """Tests getargspec on partial function that prunes all arguments."""
+
+    def func(m, n):
+      return 2 * m + n
+
+    partial_func = functools.partial(func, 7, 10)
+    argspec = tf_inspect.ArgSpec(
+        args=[], varargs=None, keywords=None, defaults=None)
+
+    self.assertEqual(argspec, tf_inspect.getargspec(partial_func))
+
+  def testGetArgSpecOnPartialKeywordArgument(self):
+    """Tests getargspec on partial function that prunes some arguments."""
+
+    def func(m, n):
+      return 2 * m + n
+
+    partial_func = functools.partial(func, n=7)
+    argspec = tf_inspect.ArgSpec(
+        args=['m', 'n'], varargs=None, keywords=None, defaults=(7,))
+
+    self.assertEqual(argspec, tf_inspect.getargspec(partial_func))
+
+  def testGetArgSpecOnPartialKeywordArgumentWithDefaultValue(self):
+    """Tests getargspec on partial function that prunes argument by keyword."""
+
+    def func(m=1, n=2):
+      return 2 * m + n
+
+    partial_func = functools.partial(func, n=7)
+    argspec = tf_inspect.ArgSpec(
+        args=['m', 'n'], varargs=None, keywords=None, defaults=(1, 7))
+
+    self.assertEqual(argspec, tf_inspect.getargspec(partial_func))
+
+  def testGetArgSpecOnPartialWithVarargs(self):
+    """Tests getargspec on partial function with variable arguments."""
+
+    def func(m, *arg):
+      return m + len(arg)
+
+    partial_func = functools.partial(func, 7, 8)
+    argspec = tf_inspect.ArgSpec(
+        args=[], varargs='arg', keywords=None, defaults=None)
+
+    self.assertEqual(argspec, tf_inspect.getargspec(partial_func))
+
+  def testGetArgSpecOnPartialWithVarkwargs(self):
+    """Tests getargspec on partial function with variable keyword arguments."""
+
+    def func(m, n, **kwarg):
+      return m * n + len(kwarg)
+
+    partial_func = functools.partial(func, 7)
+    argspec = tf_inspect.ArgSpec(
+        args=['n'], varargs=None, keywords='kwarg', defaults=None)
+
+    self.assertEqual(argspec, tf_inspect.getargspec(partial_func))
+
+  def testGetArgSpecOnPartialWithDecorator(self):
+    """Tests getargspec on decorated partial function."""
+
+    @test_decorator('decorator')
+    def func(m=1, n=2):
+      return 2 * m + n
+
+    partial_func = functools.partial(func, n=7)
+    argspec = tf_inspect.ArgSpec(
+        args=['m', 'n'], varargs=None, keywords=None, defaults=(1, 7))
+
+    self.assertEqual(argspec, tf_inspect.getargspec(partial_func))
+
+  def testGetArgSpecOnPartialWithDecoratorThatChangesArgspec(self):
+    """Tests getargspec on partial function with decorated argspec."""
+
+    argspec = tf_inspect.ArgSpec(
+        args=['a', 'b', 'c'],
+        varargs=None,
+        keywords=None,
+        defaults=(1, 'hello'))
+    decorator = tf_decorator.TFDecorator('', test_undecorated_function, '',
+                                         argspec)
+    partial_argspec = tf_inspect.ArgSpec(
+        args=['a', 'b', 'c'],
+        varargs=None,
+        keywords=None,
+        defaults=(2, 1, 'hello'))
+    partial_with_decorator = functools.partial(decorator, a=2)
+
+    self.assertEqual(argspec, tf_inspect.getargspec(decorator))
+    self.assertEqual(partial_argspec,
+                     tf_inspect.getargspec(partial_with_decorator))
+
   def testGetDoc(self):
     self.assertEqual('Test Decorated Function With Defaults Docstring.',
                      tf_inspect.getdoc(test_decorated_function_with_defaults))
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index 9c8d50da7351d91d435719565d82a1dd3f19c043..8bfc0ec9584598965c32a630d933a992c85aa18b 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -14,8 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/python/util/util.h"
 
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/python/lib/core/safe_ptr.h"
 
 namespace tensorflow {
@@ -25,6 +30,7 @@ namespace {
 
 // Type object for collections.Sequence. This is set by RegisterSequenceClass.
 PyObject* CollectionsSequenceType = nullptr;
+PyTypeObject* SparseTensorValueType = nullptr;
 
 bool WarnedThatSetIsNotSequence = false;
 
@@ -135,6 +141,12 @@ class ValIterator {
   Py_ssize_t index_;
 };
 
+mutex g_type_to_sequence_map(LINKER_INITIALIZED);
+std::unordered_map<PyTypeObject*, bool>* IsTypeSequenceMap() {
+  static auto* const m = new std::unordered_map<PyTypeObject*, bool>;
+  return m;
+}
+
 // Returns 1 if `o` is considered a sequence for the purposes of Flatten().
 // Returns 0 otherwise.
 // Returns -1 if an error occurred.
@@ -155,64 +167,137 @@ int IsSequenceHelper(PyObject* o) {
             .c_str());
     return -1;
   }
+
+  // Try not to return to Python - see if the type has already been seen
+  // before.
+
+  // NOTE: It's not clear whether the lock is required (we should be holding the
+  // python GIL in this code already).
+  mutex_lock l(g_type_to_sequence_map);
+  auto* type_to_sequence_map = IsTypeSequenceMap();
+  auto* type = Py_TYPE(o);
+
+  auto it = type_to_sequence_map->find(type);
+  if (it != type_to_sequence_map->end()) {
+    return it->second;
+  }
+
   int is_instance = PyObject_IsInstance(o, CollectionsSequenceType);
+
+  // Don't cache a failed is_instance check.
   if (is_instance == -1) return -1;
-  return static_cast<int>(is_instance != 0 && !IsString(o));
+
+  bool is_sequence = static_cast<int>(is_instance != 0 && !IsString(o));
+
+  // NOTE: This is never decref'd, but we don't want the type to get deleted
+  // as long as it is in the map. This should not be too much of a
+  // leak, as there should only be a relatively small number of types in the
+  // map, and an even smaller number that are eligible for decref.
+  Py_INCREF(type);
+  type_to_sequence_map->insert({type, is_sequence});
+
+  return is_sequence;
 }
 
-bool FlattenHelper(PyObject* nested, PyObject* list) {
-  // if nested is not a sequence, append itself and exit
-  int is_seq = IsSequenceHelper(nested);
-  if (is_seq == -1) return false;
-  if (!is_seq) {
-    return PyList_Append(list, nested) != -1;
+bool IsSparseTensorValueType(PyObject* o) {
+  if (TF_PREDICT_FALSE(SparseTensorValueType == nullptr)) {
+    return false;
   }
 
-  // if nested if dictionary, sort it by key and recurse on each value
-  if (PyDict_Check(nested)) {
-    PyObject* keys = PyDict_Keys(nested);
-    if (PyList_Sort(keys) == -1) return false;
-    Py_ssize_t size = PyList_Size(keys);
-    for (Py_ssize_t i = 0; i < size; ++i) {
-      // We know that key and val will not be deleted because nested owns
-      // a reference to them and callers of flatten must not modify nested
-      // while the method is running.
-      PyObject* key = PyList_GET_ITEM(keys, i);
-      PyObject* val = PyDict_GetItem(nested, key);
-      if (Py_EnterRecursiveCall(" in flatten")) {
-        Py_DECREF(keys);
-        return false;
-      }
-      const bool success = FlattenHelper(val, list);
-      Py_LeaveRecursiveCall();
-      if (!success) {
-        Py_DECREF(keys);
-        return false;
-      }
-    }
-    Py_DECREF(keys);
-    return true;
+  return PyObject_TypeCheck(o, SparseTensorValueType) == 1;
+}
+
+int IsSequenceForDataHelper(PyObject* o) {
+  return IsSequenceHelper(o) == 1 && !PyList_Check(o) &&
+         !IsSparseTensorValueType(o);
+}
+
+bool GetNextValuesForDict(PyObject* nested,
+                          std::vector<Safe_PyObjectPtr>* next_values) {
+  std::vector<PyObject*> result;
+
+  PyObject* keys = PyDict_Keys(nested);
+  if (PyList_Sort(keys) == -1) return false;
+  Py_ssize_t size = PyList_Size(keys);
+  for (Py_ssize_t i = 0; i < size; ++i) {
+    // We know that key and item will not be deleted because nested owns
+    // a reference to them and callers of flatten must not modify nested
+    // while the method is running.
+    PyObject* key = PyList_GET_ITEM(keys, i);
+    PyObject* item = PyDict_GetItem(nested, key);
+    Py_INCREF(item);
+    next_values->emplace_back(item);
   }
+  Py_DECREF(keys);
+  return true;
+}
 
-  // iterate and recurse
+bool GetNextValuesForIterable(PyObject* nested,
+                              std::vector<Safe_PyObjectPtr>* next_values) {
   PyObject* item;
   PyObject* iterator = PyObject_GetIter(nested);
   while ((item = PyIter_Next(iterator)) != nullptr) {
+    next_values->emplace_back(item);
+  }
+  Py_DECREF(iterator);
+  return true;
+}
+
+// GetNextValues returns the values that the FlattenHelper function will recurse
+// over next.
+bool GetNextValues(PyObject* nested,
+                   std::vector<Safe_PyObjectPtr>* next_values) {
+  if (PyDict_Check(nested)) {
+    // if nested is dictionary, sort it by key and recurse on each value
+    return GetNextValuesForDict(nested, next_values);
+  }
+  // iterate and recurse
+  return GetNextValuesForIterable(nested, next_values);
+}
+
+// Similar to above, just specialized for the functions in the data pacakage.
+bool GetNextValuesForData(PyObject* nested,
+                          std::vector<Safe_PyObjectPtr>* next_values) {
+  if (PyDict_Check(nested)) {
+    // if nested is dictionary, sort it by key and recurse on each value
+    return GetNextValuesForDict(nested, next_values);
+  } else if (IsSparseTensorValueType(nested)) {
+    // if nested is a SparseTensorValue, just return itself as a single item
+    Py_INCREF(nested);
+    next_values->emplace_back(nested);
+    return true;
+  }
+  // iterate and recurse
+  return GetNextValuesForIterable(nested, next_values);
+}
+
+bool FlattenHelper(
+    PyObject* nested, PyObject* list,
+    const std::function<int(PyObject*)>& is_sequence_helper,
+    const std::function<bool(PyObject*, std::vector<Safe_PyObjectPtr>*)>&
+        next_values_getter) {
+  // if nested is not a sequence, append itself and exit
+  int is_seq = is_sequence_helper(nested);
+  if (is_seq == -1) return false;
+  if (!is_seq) {
+    return PyList_Append(list, nested) != -1;
+  }
+
+  std::vector<Safe_PyObjectPtr> next_values;
+  // Get the next values to recurse over.
+  if (!next_values_getter(nested, &next_values)) return false;
+
+  for (const auto& item : next_values) {
     if (Py_EnterRecursiveCall(" in flatten")) {
-      Py_DECREF(iterator);
-      Py_DECREF(item);
       return false;
     }
-    bool success = FlattenHelper(item, list);
+    const bool success =
+        FlattenHelper(item.get(), list, is_sequence_helper, next_values_getter);
     Py_LeaveRecursiveCall();
     if (!success) {
-      Py_DECREF(iterator);
-      Py_DECREF(item);
       return false;
     }
-    Py_DECREF(item);
   }
-  Py_DECREF(iterator);
   return true;
 }
 
@@ -351,7 +436,7 @@ bool AssertSameStructureHelper(PyObject* o1, PyObject* o2, bool check_types,
   }
 }
 
-}  // anonymous namespace
+}  // namespace
 
 void RegisterSequenceClass(PyObject* sequence_class) {
   if (!PyType_Check(sequence_class)) {
@@ -366,11 +451,38 @@ void RegisterSequenceClass(PyObject* sequence_class) {
   CollectionsSequenceType = sequence_class;
 }
 
+void RegisterSparseTensorValueClass(PyObject* sparse_tensor_value_class) {
+  if (!PyType_Check(sparse_tensor_value_class)) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        tensorflow::strings::StrCat(
+            "Expecting a class definition for `SparseTensorValue`. Got ",
+            Py_TYPE(sparse_tensor_value_class)->tp_name)
+            .c_str());
+    return;
+  }
+  SparseTensorValueType =
+      reinterpret_cast<PyTypeObject*>(sparse_tensor_value_class);
+}
+
 bool IsSequence(PyObject* o) { return IsSequenceHelper(o) == 1; }
 
 PyObject* Flatten(PyObject* nested) {
   PyObject* list = PyList_New(0);
-  if (FlattenHelper(nested, list)) {
+  if (FlattenHelper(nested, list, IsSequenceHelper, GetNextValues)) {
+    return list;
+  } else {
+    Py_DECREF(list);
+    return nullptr;
+  }
+}
+
+bool IsSequenceForData(PyObject* o) { return IsSequenceForDataHelper(o) == 1; }
+
+PyObject* FlattenForData(PyObject* nested) {
+  PyObject* list = PyList_New(0);
+  if (FlattenHelper(nested, list, IsSequenceForDataHelper,
+                    GetNextValuesForData)) {
     return list;
   } else {
     Py_DECREF(list);
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index 4bb80d8289e958074fa7beb133e95908283a5b7b..70efc10c9abe7c57da61311bb2eb7ae362a48e3d 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -118,6 +118,30 @@ PyObject* Flatten(PyObject* nested);
 // the type from the module. This approach also requires some trigger from
 // Python so that we know that Python interpreter had been initialzied.
 void RegisterSequenceClass(PyObject* sequence_class);
+// Similar to the above function, except for the
+// sparse_tensor.SparseTensorValue class.
+void RegisterSparseTensorValueClass(PyObject* sparse_tensor_value_class);
+
+// The tensorflow.python.data package has its own nest utility that follows very
+// slightly different semantics for its functions than the tensorflow.python
+// nest utility. Returns a true if its input is a collections.Sequence (except
+// strings).
+//
+// Main differences are (this is copied from nest.py in the
+// tensorflow.data.util):
+//
+// 1. It removes support for lists as a level of nesting in nested structures.
+// 2. It adds support for `SparseTensorValue` as an atomic element.
+
+// IsSequence specialized for the data package. Additional comments about
+// difference in functionality can be found in nest.py in tensorflow.data.util
+// and in the comments for Flatten above.
+bool IsSequenceForData(PyObject* o);
+
+// IsSequence specialized for the data package. Additional comments about
+// difference in functionality can be found in nest.py in tensorflow.data.util
+// and in the comments for Flatten above.
+PyObject* FlattenForData(PyObject* nested);
 
 }  // namespace swig
 }  // namespace tensorflow
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index b7f201b6fe6fd18af2bb833df2d08bfedb23a185..9f3b11b982bb0d52f903b09975cc7029fa8cb013 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -31,6 +31,9 @@ limitations under the License.
 %unignore tensorflow::swig::RegisterSequenceClass;
 %noexception tensorflow::swig::RegisterSequenceClass;
 
+%unignore tensorflow::swig::RegisterSparseTensorValueClass;
+%noexception tensorflow::swig::RegisterSparseTensorValueClass;
+
 %unignore tensorflow::swig::IsSequence;
 %noexception tensorflow::swig::IsSequence;
 
@@ -46,6 +49,12 @@ limitations under the License.
 %unignore tensorflow::swig::Flatten;
 %noexception tensorflow::swig::Flatten;
 
+%unignore tensorflow::swig::IsSequenceForData;
+%noexception tensorflow::swig::IsSequenceForData;
+
+%unignore tensorflow::swig::FlattenForData;
+%noexception tensorflow::swig::FlattenForData;
+
 %include "tensorflow/python/util/util.h"
 
 %unignoreall
diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h
index be0b0bf5fb20b2c0cfa99dee81e01d319387ef36..ea87744b225215ceb24b926f1ef7bace017cb2b8 100644
--- a/tensorflow/stream_executor/blas.h
+++ b/tensorflow/stream_executor/blas.h
@@ -1083,6 +1083,13 @@ class BlasSupport {
   // This is a batched version of DoBlasGemm.
   // The batched GEMM computes matrix product for each input/output in a, b,
   // and c, which contain batch_count DeviceMemory objects.
+  virtual bool DoBlasGemmBatched(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, float alpha,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb,
+      float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c,
+      int ldc, int batch_count, ScratchAllocator *scratch_allocator) = 0;
   virtual bool DoBlasGemmBatched(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
       uint64 n, uint64 k, float alpha,
@@ -1945,6 +1952,13 @@ class BlasSupport {
       DeviceMemory<std::complex<double>> *c, int ldc,                          \
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
       blas::ProfileResult *output_profile_result) override;                    \
+  bool DoBlasGemmBatched(                                                      \
+      Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
+      uint64 m, uint64 n, uint64 k, float alpha,                               \
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,         \
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb,         \
+      float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c,      \
+      int ldc, int batch_count, ScratchAllocator *scratch_allocator) override; \
   bool DoBlasGemmBatched(                                                      \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
       uint64 m, uint64 n, uint64 k, float alpha,                               \
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.cc b/tensorflow/stream_executor/cuda/cuda_activation.cc
index cf6b9e2c6e4b327c06ecce318f8e8809308f6f02..02371c3c3ab403e9b3303fbbafdef18c30196f4f 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.cc
+++ b/tensorflow/stream_executor/cuda/cuda_activation.cc
@@ -38,5 +38,11 @@ ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
   delete static_cast<ScopedActivateContext *>(driver_scoped_activate_context_);
 }
 
+ScopedActivateExecutorContext::ScopedActivateExecutorContext(
+    ScopedActivateExecutorContext &&other)
+    : driver_scoped_activate_context_(other.driver_scoped_activate_context_) {
+  other.driver_scoped_activate_context_ = nullptr;
+}
+
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.h b/tensorflow/stream_executor/cuda/cuda_activation.h
index 04ffaef3646bb3df03407f8e74e620455bb9e5cb..ef9807820fda493a9ab926ae0509beaafeebdf2e 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.h
+++ b/tensorflow/stream_executor/cuda/cuda_activation.h
@@ -44,10 +44,11 @@ class ScopedActivateExecutorContext {
   // fatal failure if it is not CUDA inside.
   explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
 
+  ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
+
   ~ScopedActivateExecutorContext();
 
  private:
-
   // The cuda.h-using datatype that we wrap.
   ScopedActivateContext* driver_scoped_activate_context_;
 
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 3c1353aee3178282cb9a62222ac0e415f08e6de8..08fe153b5909d36eae7848862932bb1359c29fe0 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -16,11 +16,7 @@ limitations under the License.
 #include "cuda/include/cublas_v2.h"
 #include "cuda/include/cuda.h"
 
-#if CUDA_VERSION >= 8000
 #define SE_CUDA_DATA_HALF CUDA_R_16F
-#else
-#define SE_CUDA_DATA_HALF CUBLAS_DATA_HALF
-#endif
 
 #include "tensorflow/stream_executor/cuda/cuda_blas.h"
 
@@ -45,10 +41,8 @@ limitations under the License.
 // approach when the issue is fixed.
 #if CUDA_VERSION < 9000
 #include "cuda/include/cuda_fp16.h"
-#if CUDA_VERSION >= 7050
 #define EIGEN_HAS_CUDA_FP16
 #endif
-#endif
 
 #include "third_party/eigen3/Eigen/Core"
 
@@ -292,6 +286,10 @@ STREAM_EXECUTOR_CUBLAS_WRAP(cublasGetMathMode)
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasSetMathMode)
 #endif
 
+#if CUDA_VERSION >= 9010
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasGemmBatchedEx)
+#endif
+
 }  // namespace wrap
 
 static string ToString(cublasStatus_t status) {
@@ -543,9 +541,7 @@ cublasSideMode_t CUDABlasSide(blas::Side side) {
 // blas::ComputationType to a cudaDataType_t.
 //
 // These are used to build the argument type and computation type args to
-// cublasGemmEx.  cublasGemmEx and cudaDataType_t are available only on
-// CUDA >= 8.0.
-#if CUDA_VERSION >= 8000
+// cublasGemmEx.
 template <typename T>
 struct CUDADataType;
 
@@ -620,15 +616,13 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) {
       return CUDA_C_64F;
   }
 }
-#endif
-
 }  // namespace
 
 template <typename FuncT, typename... Args>
 bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                                   bool pointer_mode_host, bool err_on_failure,
                                   bool use_tensor_op_math, Args... args) {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
 
   CHECK(blas_ != nullptr);
   if (!SetStream(stream)) {
@@ -2229,7 +2223,6 @@ bool CUDABlas::GetBlasGemmAlgorithms(
 // Note that when CUDA version and compute capability is not sufficient, we
 // still return the out_algorithms. Caller needs to make sure that in this case,
 // the returned vector is empty.
-#if CUDA_VERSION >= 8000
   for (cublasGemmAlgo_t algo : {
          CUBLAS_GEMM_DFALT, CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1,
              CUBLAS_GEMM_ALGO2, CUBLAS_GEMM_ALGO3, CUBLAS_GEMM_ALGO4,
@@ -2245,7 +2238,6 @@ bool CUDABlas::GetBlasGemmAlgorithms(
        }) {
     out_algorithms->push_back(algo);
   }
-#endif
   return true;
 }
 
@@ -2342,13 +2334,23 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
       computation_type, algorithm, output_profile_result);
 }
 
-template <typename T, typename FuncT>
+template <typename T>
+struct HalfAsFloat {
+  typedef T type;
+};
+
+template <>
+struct HalfAsFloat<Eigen::half> {
+  typedef float type;
+};
+
+template <typename T, typename Scalar, typename FuncT>
 port::Status CUDABlas::DoBlasGemmBatchedInternal(
     FuncT cublas_func, Stream *stream, blas::Transpose transa,
-    blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+    blas::Transpose transb, uint64 m, uint64 n, uint64 k, Scalar alpha,
     const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
     const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
-    T beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
+    Scalar beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
     int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
   std::vector<T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
   for (int i = 0; i < batch_count; ++i) {
@@ -2357,7 +2359,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
   }
 
-  typedef typename CUDAComplexT<T>::type CUDA_T;
+  typedef typename HalfAsFloat<typename CUDAComplexT<T>::type>::type CUDA_T;
 
   const size_t size = batch_count * sizeof(CUDA_T *);
 
@@ -2409,18 +2411,84 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
                         "CUDABlas::DoBlasGemmBatched");
   }
 
-  bool ok = DoBlasInternal(
-      cublas_func, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda,
-      const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-      const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count);
+  cudaDataType_t data_type = CUDADataType<T>::type;
 
-  if (ok) {
+#if CUDA_VERSION >= 9010
+  int cc_major, cc_minor;
+  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
+          &cc_major, &cc_minor) &&
+      cc_major >= 5) {
+    bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
+    cublasGemmAlgo_t algo =
+        (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
+    cudaDataType_t compute_type =
+        (data_type == CUDA_R_16F ? CUDA_R_32F : data_type);
+    const void **a_void_ptrs = reinterpret_cast<const void **>(
+        const_cast<const CUDA_T **>(CUDAMemory(a)));
+    const void **b_void_ptrs = reinterpret_cast<const void **>(
+        const_cast<const CUDA_T **>(CUDAMemory(b)));
+    void **c_void_ptrs =
+        reinterpret_cast<void **>(const_cast<CUDA_T **>(CUDAMemory(c)));
+    bool ok;
+    ok = DoBlasInternalImpl(
+        wrap::cublasGemmBatchedEx, stream, true /* = pointer_mode_host */,
+        true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
+        CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
+        b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
+        batch_count, compute_type, algo);
+    if (ok) {
+      return port::Status::OK();
+    }
+    return port::Status(port::error::INTERNAL,
+                        "failed BLAS call, see log for details");
+  }
+#endif
+  // either CUDA_VERSION < 9.1 or SM < 5.0
+  if (data_type != CUDA_R_16F) {
+    bool ok = DoBlasInternal(
+        cublas_func, stream, true /* = pointer_mode_host */,
+        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
+        CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda,
+        const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta),
+        const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count);
+    if (ok) {
+      return port::Status::OK();
+    }
+    return port::Status(port::error::INTERNAL,
+                        "failed BLAS call, see log for details");
+  } else {
+    // Fall back to a loop for fp16
+    for (int b = 0; b < batch_count; ++b) {
+      const DeviceMemory<T> &a_matrix = *a_ptrs_to_wrappers[b];
+      const DeviceMemory<T> &b_matrix = *b_ptrs_to_wrappers[b];
+      DeviceMemory<T> *c_matrix = c_ptrs_to_wrappers[b];
+      bool ok = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a_matrix,
+                           lda, b_matrix, ldb, beta, c_matrix, ldc);
+      if (!ok) {
+        return port::Status(port::error::INTERNAL,
+                            "failed BLAS call, see log for details");
+      }
+    }
     return port::Status::OK();
   }
-  return port::Status(port::error::INTERNAL,
-                      "failed BLAS call, see log for details");
+}
+
+bool CUDABlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &a_array, int lda,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &b_array, int ldb,
+    float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c_array,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  // Note: The func passed here (cublasSgemmBatched) is not actually called,
+  // due to special handling of fp16 inside DoBlasGemmBatchedInternal.
+  port::Status status = DoBlasGemmBatchedInternal(
+      wrap::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array,
+      lda, b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
 }
 
 bool CUDABlas::DoBlasGemmBatched(
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 12dc5e47fd1b9d2accca52e547cc271b94085a5a..42b3fde5b0816f7277cb5d08902af0145a0852aa 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -107,12 +107,12 @@ class CUDABlas : public blas::BlasSupport {
 
   // A helper function to implement DoBlasGemmBatched interfaces for generic
   // types.
-  template <typename T, typename FuncT>
+  template <typename T, typename Scalar, typename FuncT>
   port::Status DoBlasGemmBatchedInternal(
       FuncT cublas_func, Stream *stream, blas::Transpose transa,
-      blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+      blas::Transpose transb, uint64 m, uint64 n, uint64 k, Scalar alpha,
       const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
-      const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
+      const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, Scalar beta,
       const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
       int batch_count, ScratchAllocator *scratch_allocator);
 
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index feb529297e8fff99030cb14fd4f8a9298bbc5935..46e5deed8474dfa0c0ce6402bd6e5e2675491b31 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -76,35 +76,36 @@ string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
 port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
   std::vector<string> pieces = port::Split(value, '.');
   if (pieces.size() < 2 || pieces.size() > 4) {
-    return port::Status{
+    return port::Status(
         port::error::INVALID_ARGUMENT,
-        port::Printf("expected %%d.%%d, %%d.%%d.%%d, or %%d.%%d.%%d.%%d form for driver version; got \"%s\"",
-                     value.c_str())};
+        port::Printf("expected %%d.%%d, %%d.%%d.%%d, or %%d.%%d.%%d.%%d form "
+                     "for driver version; got \"%s\"",
+                     value.c_str()));
   }
 
   int major;
   int minor;
   int patch = 0;
   if (!port::safe_strto32(pieces[0], &major)) {
-    return port::Status{
+    return port::Status(
         port::error::INVALID_ARGUMENT,
         port::Printf("could not parse major version number \"%s\" as an "
                      "integer from string \"%s\"",
-                     pieces[0].c_str(), value.c_str())};
+                     pieces[0].c_str(), value.c_str()));
   }
   if (!port::safe_strto32(pieces[1], &minor)) {
-    return port::Status{
+    return port::Status(
         port::error::INVALID_ARGUMENT,
         port::Printf("could not parse minor version number \"%s\" as an "
                      "integer from string \"%s\"",
-                     pieces[1].c_str(), value.c_str())};
+                     pieces[1].c_str(), value.c_str()));
   }
   if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
-    return port::Status{
-      port::error::INVALID_ARGUMENT,
-      port::Printf("could not parse patch version number \"%s\" as an "
+    return port::Status(
+        port::error::INVALID_ARGUMENT,
+        port::Printf("could not parse patch version number \"%s\" as an "
                      "integer from string \"%s\"",
-                   pieces[2].c_str(), value.c_str())};
+                     pieces[2].c_str(), value.c_str()));
   }
 
   DriverVersion result{major, minor, patch};
@@ -204,9 +205,9 @@ void Diagnostician::LogDiagnosticInformation() {
 // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
 // driver-interfacing DSO version number. Returns it as a string.
 port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
-  port::StatusOr<DriverVersion> result{port::Status{
+  port::StatusOr<DriverVersion> result(port::Status(
       port::error::NOT_FOUND,
-      "was unable to find libcuda.so DSO loaded into this program"}};
+      "was unable to find libcuda.so DSO loaded into this program"));
 
 #if defined(__APPLE__)
     // OSX CUDA libraries have names like: libcuda_310.41.15_mercury.dylib
@@ -274,11 +275,11 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
   static const char *kDriverFilePrelude = "Kernel Module  ";
   size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
   if (offset == string::npos) {
-    return port::Status{
+    return port::Status(
         port::error::NOT_FOUND,
         port::StrCat("could not find kernel module information in "
                      "driver version file contents: \"",
-                     driver_version_file_contents, "\"")};
+                     driver_version_file_contents, "\""));
   }
 
   string version_and_rest = driver_version_file_contents.substr(
@@ -334,25 +335,24 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
     return StringToDriverVersion(version);
   }
   CFRelease(kext_infos);
-  auto status =
-    port::Status{port::error::INTERNAL,
-                 port::StrCat("failed to read driver bundle version: ",
-                              CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8))
-    };
+  auto status = port::Status(
+      port::error::INTERNAL,
+      port::StrCat(
+          "failed to read driver bundle version: ",
+          CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8)));
   return status;
 #elif defined(PLATFORM_WINDOWS)
   auto status =
-    port::Status{port::error::UNIMPLEMENTED,
-                 "kernel reported driver version not implemented on Windows"
-    };
+      port::Status(port::error::UNIMPLEMENTED,
+                   "kernel reported driver version not implemented on Windows");
   return status;
 #else
   FILE *driver_version_file = fopen(kDriverVersionPath, "r");
   if (driver_version_file == nullptr) {
-    return port::Status{
+    return port::Status(
         port::error::PERMISSION_DENIED,
         port::StrCat("could not open driver version path for reading: ",
-                     kDriverVersionPath)};
+                     kDriverVersionPath));
   }
 
   static const int kContentsSize = 1024;
@@ -371,11 +371,11 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
     return FindKernelModuleVersion(contents.begin());
   }
 
-  auto status =
-      port::Status{port::error::INTERNAL,
-                   port::StrCat("failed to read driver version file contents: ",
-                                kDriverVersionPath, "; ferror: ",
-                                ferror(driver_version_file))};
+  auto status = port::Status(
+      port::error::INTERNAL,
+      port::StrCat(
+          "failed to read driver version file contents: ", kDriverVersionPath,
+          "; ferror: ", ferror(driver_version_file)));
   fclose(driver_version_file);
   return status;
 #endif
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 773cac2c40c99125171f50838ca3fcac6fff4dd3..5ece80e5518d19e5ed596fe405d20b3360d2f0c0 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -46,8 +46,15 @@ limitations under the License.
 #include "cuda/include/cudnn.h"
 // clang-format on
 
+namespace stream_executor {
+namespace cuda {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin);
+
 namespace {
 
+static_assert(CUDNN_VERSION >= 6000, "cuDNN needs to be version 6.0 or higher");
+
 // Converts (via narrowing) a type T value to a type U, and checks that the
 // value has no value change due to the conversion.
 template <typename WideT, typename NarrowT>
@@ -58,20 +65,6 @@ NarrowT CheckedNarrowing(const WideT& wide) {
   return narrow;
 }
 
-}  // namespace
-
-namespace stream_executor {
-
-using dnn::BatchDescriptor;
-using dnn::FilterDescriptor;
-using dnn::ConvolutionDescriptor;
-using dnn::PoolingDescriptor;
-using dnn::NormalizeDescriptor;
-
-namespace cuda {
-
-PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin);
-
 string ToString(cudnnStatus_t status) {
   switch (status) {
     case CUDNN_STATUS_SUCCESS:
@@ -102,7 +95,6 @@ string ToString(cudnnStatus_t status) {
   }
 }
 
-#if CUDNN_VERSION >= 6000
 string ToString(libraryPropertyType type) {
   switch (type) {
     case MAJOR_VERSION:
@@ -116,7 +108,6 @@ string ToString(libraryPropertyType type) {
           "<unknown libraryPropertyType: ", static_cast<int>(type), ">");
   }
 }
-#endif
 
 template <typename T>
 cudnnDataType_t GetCudnnDataType();
@@ -136,226 +127,82 @@ cudnnDataType_t GetCudnnDataType<Eigen::half>() {
   return CUDNN_DATA_HALF;
 }
 
-namespace wrap {
-
-static port::ThreadPool* InitCudnnThreadpool() {
-  port::ThreadPool* cudnn_threadpool_;
-  port::ThreadOptions options;
-  // TBD(keveman): Conservatively setting the stack size and guard size to 2MB,
-  // until we can get some guarantees from NVIDIA on the minimum stack space
-  // they will work with.
-  options.stack_size = 2 * 1024 * 1024;
-  options.guard_size = 2 * 1024 * 1024;
-  cudnn_threadpool_ = new port::ThreadPool(port::Env::Default(), options,
-                                           "cudnn_threadpool", 1);
-  CHECK(cudnn_threadpool_);
-  return cudnn_threadpool_;
-}
-
-static mutex cudnn_threadpool_mu(LINKER_INITIALIZED);
-static port::ThreadPool* GetCudaThreadpool() {
-  mutex_lock lock(cudnn_threadpool_mu);
-  static port::ThreadPool* cudnn_threadpool = InitCudnnThreadpool();
-  return cudnn_threadpool;
-}
-
-#define STREAM_EXECUTOR_CUDNN_WRAP(__name)                         \
-  struct WrapperShim__##__name {                                   \
-    template <typename... Args>                                    \
-    cudnnStatus_t operator()(CUDAExecutor* parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};             \
-      cudnnStatus_t retval = ::__name(args...);                    \
-      return retval;                                               \
-    }                                                              \
-  } __name;
-
-#define STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM(__name)           \
-  struct WrapperShim__##__name {                                         \
-    template <typename... Args>                                          \
-    cudnnStatus_t operator()(CudnnSupport* dnn, Stream* s, Args... args) \
-        SHARED_LOCKS_REQUIRED(dnn->dnn_handle_mutex_) {                  \
-      CHECK_NOTNULL(s);                                                  \
-      CHECK_EQ(s, dnn->GetCurrentDnnStream())                            \
-          << "Stream is not set correctly!";                             \
-      cuda::ScopedActivateExecutorContext sac{dnn->GetParentExecutor()}; \
-      cudnnStatus_t retval = ::__name(args...);                          \
-      return retval;                                                     \
-    }                                                                    \
-  } __name;
-
-// Handles cudnnSetStream differently in order to add debug information.
-struct WrapperShim__cudnnSetStream {
-  cudnnStatus_t operator()(CudnnSupport* dnn, Stream* stream,
-                           cudnnHandle_t handle)
-      EXCLUSIVE_LOCKS_REQUIRED(dnn->dnn_handle_mutex_) {
-    dnn->SetCurrentDnnStream(stream);
-    cuda::ScopedActivateExecutorContext sac{dnn->GetParentExecutor()};
-    cudnnStatus_t retval = ::cudnnSetStream(handle, AsCUDAStreamValue(stream));
-    return retval;
-  }
-} cudnnSetStream;
-
-// clang-format off
-#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
-  __macro(cudnnGetConvolutionNdForwardOutputDim)          \
-  __macro(cudnnGetConvolutionForwardAlgorithm)            \
-  __macro(cudnnCreateTensorDescriptor)                    \
-  __macro(cudnnDestroyTensorDescriptor)                   \
-  __macro(cudnnCreateFilterDescriptor)                    \
-  __macro(cudnnSetPoolingNdDescriptor)                    \
-  __macro(cudnnSetLRNDescriptor)                          \
-  __macro(cudnnDestroyFilterDescriptor)                   \
-  __macro(cudnnCreateConvolutionDescriptor)               \
-  __macro(cudnnCreatePoolingDescriptor)                   \
-  __macro(cudnnDestroyPoolingDescriptor)                  \
-  __macro(cudnnCreateLRNDescriptor)                       \
-  __macro(cudnnDestroyLRNDescriptor)                      \
-  __macro(cudnnDestroyConvolutionDescriptor)              \
-  __macro(cudnnCreate)                                    \
-  __macro(cudnnDestroy)                                   \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize)        \
-  __macro(cudnnSetConvolutionNdDescriptor)                \
-  __macro(cudnnSetTensor4dDescriptor)                     \
-  __macro(cudnnSetTensorNdDescriptor)                     \
-  __macro(cudnnSetFilterNdDescriptor)
-
-// clang-format on
-CUDNN_DNN_ROUTINE_EACH(STREAM_EXECUTOR_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH
-
-// clang-format off
-#define CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(__macro)       \
-  __macro(cudnnBatchNormalizationBackward)                \
-  __macro(cudnnBatchNormalizationForwardInference)        \
-  __macro(cudnnBatchNormalizationForwardTraining)         \
-  __macro(cudnnActivationForward)                         \
-  __macro(cudnnConvolutionForward)                        \
-  __macro(cudnnConvolutionBackwardBias)                   \
-  __macro(cudnnTransformTensor)                           \
-  __macro(cudnnPoolingForward)                            \
-  __macro(cudnnPoolingBackward)                           \
-  __macro(cudnnLRNCrossChannelForward)                    \
-  __macro(cudnnLRNCrossChannelBackward)                   \
-  __macro(cudnnAddTensor)                                 \
-  __macro(cudnnConvolutionBackwardData)                   \
-  __macro(cudnnConvolutionBackwardFilter)
-
-// clang-format on
-CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(
-    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
-#undef CUDNN_DNN_ROUTINE_EACH_WITH_STREAM
-
-// APIs available after R3:
-#if CUDNN_VERSION >= 3000
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)              \
-  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
-  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
-  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
-  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(STREAM_EXECUTOR_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
-#endif
-
-// APIs in R3 but not in R5
-// clang-format off
-#if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 5000
-#define CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(__macro)        \
-  __macro(cudnnAddTensor_v3)                                  \
-  __macro(cudnnConvolutionBackwardData_v3)                    \
-  __macro(cudnnConvolutionBackwardFilter_v3)
-// clang-format on
-
-CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(
-    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
-#undef CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM
-#endif
-
-// APIs in R5
-// clang-format off
-#if CUDNN_VERSION >= 5000
-#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)                    \
-  __macro(cudnnCreateActivationDescriptor)                    \
-  __macro(cudnnSetActivationDescriptor)                       \
-  __macro(cudnnGetActivationDescriptor)                       \
-  __macro(cudnnDestroyActivationDescriptor)                   \
-  __macro(cudnnCreateDropoutDescriptor)                       \
-  __macro(cudnnDestroyDropoutDescriptor)                      \
-  __macro(cudnnSetDropoutDescriptor)                          \
-  __macro(cudnnDropoutGetStatesSize)                          \
-  __macro(cudnnCreateRNNDescriptor)                           \
-  __macro(cudnnDestroyRNNDescriptor)                          \
-  __macro(cudnnGetRNNParamsSize)                              \
-  __macro(cudnnGetRNNWorkspaceSize)                           \
-  __macro(cudnnGetRNNTrainingReserveSize)                     \
-  __macro(cudnnGetRNNLinLayerMatrixParams)                    \
-  __macro(cudnnGetRNNLinLayerBiasParams)                      \
-  __macro(cudnnSetRNNDescriptor)                              \
-  __macro(cudnnGetFilterNdDescriptor)
-
-// clang-format on
-CUDNN_DNN_ROUTINE_EACH_R5(STREAM_EXECUTOR_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R5
-
-// clang-format off
-#define CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(__macro)        \
-  __macro(cudnnRNNForwardInference)                           \
-  __macro(cudnnRNNForwardTraining)                            \
-  __macro(cudnnRNNBackwardData)                               \
-  __macro(cudnnRNNBackwardWeights)
+// RAII wrapper for all calls to cuDNN with a cuDNN handle argument.
+//
+// See CudnnAccess::GetHandle() for details.
+class CudnnHandle {
+ public:
+  // Takes ownership of the executor context and the lock to access cuDNN
+  // using handle.
+  CudnnHandle(cuda::ScopedActivateExecutorContext context, mutex_lock lock,
+              cudnnHandle_t handle)
+      : context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
 
-// clang-format on
-CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(
-    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
-#undef CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM
-#endif
+  // Returns cuDNN handle. To be passed directly to cuDNN APIs, don't keep
+  // a copy.
+  cudnnHandle_t handle() const { return handle_; }
 
-// APIs in R6
-// clang-format off
-#if CUDNN_VERSION >= 6000
-#define CUDNN_DNN_ROUTINE_EACH_R6(__macro)                    \
-  __macro(cudnnSetRNNDescriptor_v6)                           \
-  __macro(cudnnCreatePersistentRNNPlan)                       \
-  __macro(cudnnDestroyPersistentRNNPlan)                      \
-  __macro(cudnnSetPersistentRNNPlan)
+ private:
+  cuda::ScopedActivateExecutorContext context_;
+  mutex_lock lock_;
+  cudnnHandle_t handle_;  // Not owned.
+};
 
-// clang-format on
-CUDNN_DNN_ROUTINE_EACH_R6(STREAM_EXECUTOR_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R6
+}  // namespace
 
-// clang-format off
-#define CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(__macro)        \
-  __macro(cudnnConvolutionBiasActivationForward)
+// Wraps a cuDNN handle and provides access to it through CudnnHandle instances,
+// which also locks a mutex, acquires the CUDA context, and sets the stream
+// that cuDNN should use to enqueue any work.
+//
+// Note: CudnnSupport::cudnn_ should be the only instantiation of this class.
+class CudnnAccess {
+ public:
+  // Takes ownership of the handle.
+  explicit CudnnAccess(cudnnHandle_t handle) : handle_(handle) {}
 
-// clang-format on
-CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(
-    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
-#undef CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM
-#endif
+  ~CudnnAccess() {
+    mutex_lock lock(mutex_);
+    cudnnDestroy(handle_);
+  }
 
-// APIs in R7
-// clang-format off
-#if CUDNN_VERSION >= 7000
-#define CUDNN_DNN_ROUTINE_EACH_R7(__macro)                    \
-  __macro(cudnnSetConvolutionMathType)                        \
-  __macro(cudnnSetRNNMatrixMathType)                          \
-  __macro(cudnnSetConvolutionGroupCount)                      \
-  __macro(cudnnGetConvolutionGroupCount)
+  // Creates a CudnnHandle instance for stream.
+  //
+  // cuDNN API calls using the same handle instance need to be serialized across
+  // threads. This is guaranteed by CudnnHandle instances locking the mutex
+  // owned by this class.
+  //
+  // Most cuDNN APIs taking a handle perform work on a CUDA stream. The
+  // CudnnHandle instance acquires the executor's CUDA context and sets cuDNN to
+  // use the provided stream.
+  //
+  // The stream argument may be null, which translates to the legacy default
+  // stream. See
+  // https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html.
+  // The legacy default stream synchronizes with all other streams and it is
+  // therefore a bad idea (performance wise) to call any cuDNN APIs that
+  // enqueue work in the stream.
+  CudnnHandle GetHandle(CUDAExecutor* executor, Stream* stream) {
+    mutex_lock lock(mutex_);
+    cuda::ScopedActivateExecutorContext context(executor);
+    CUstream cu_stream = stream ? AsCUDAStreamValue(stream) : cudaStreamLegacy;
+    auto status = cudnnSetStream(handle_, cu_stream);
+    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Failed to set cuDNN stream.";
+    using my_mutex_lock = mutex_lock;
+    return CudnnHandle(std::move(context), std::move(lock), handle_);
+  }
 
-// clang-format on
-CUDNN_DNN_ROUTINE_EACH_R7(STREAM_EXECUTOR_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R7
-#endif
+ private:
+  // Guards the enqueueing of cuDNN operations via the handle_ below.
+  mutex mutex_;
 
-}  // namespace wrap
+  // cuDNN library handle.
+  cudnnHandle_t handle_ GUARDED_BY(mutex_);  // Owned.
+};
 
 namespace {
 
 cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
 
-cudnnHandle_t ToHandle(void* opaque_handle) {
-  return static_cast<cudnnHandle_t>(opaque_handle);
-}
-
 cudnnConvolutionFwdAlgo_t ToConvForwardAlgo(dnn::AlgorithmDesc algorithm) {
   cudnnConvolutionFwdAlgo_t algo =
       cudnnConvolutionFwdAlgo_t(algorithm.algo_id());
@@ -366,12 +213,8 @@ cudnnConvolutionFwdAlgo_t ToConvForwardAlgo(dnn::AlgorithmDesc algorithm) {
     case CUDNN_CONVOLUTION_FWD_ALGO_DIRECT:
     case CUDNN_CONVOLUTION_FWD_ALGO_FFT:
     case CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
-#if CUDNN_VERSION >= 5000
     case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
-#endif
-#if CUDNN_VERSION >= 5100
     case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
-#endif
       return algo;
     default:
       LOG(FATAL) << "Unsupported Cudnn convolution forward algorithm: "
@@ -388,12 +231,8 @@ cudnnConvolutionBwdDataAlgo_t ToConvBackwardDataAlgo(
     case CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
     case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
     case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
-#if CUDNN_VERSION >= 5000
     case CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
-#endif
-#if CUDNN_VERSION >= 5100
     case CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED:
-#endif
       return algo;
     default:
       LOG(FATAL)
@@ -411,12 +250,13 @@ cudnnConvolutionBwdFilterAlgo_t ToConvBackwardFilterAlgo(
     case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
     case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
     case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
-#if CUDNN_VERSION >= 5100
     // Based on cudnn.h, the following is not implemented.
     // case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD:
     case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED:
-#endif
       return algo;
+    // Produces incorrect results for some shapes. Disabled for now, see
+    // NVIDIA bug 2072856. TODO(csigg): Only disable for subset of shapes.
+    // case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING:
     default:
       LOG(FATAL)
           << "Unsupported Cudnn convolution backward algorithm for filter: "
@@ -424,7 +264,6 @@ cudnnConvolutionBwdFilterAlgo_t ToConvBackwardFilterAlgo(
   }
 }
 
-#if CUDNN_VERSION >= 6000
 port::Status GetCudnnProperty(libraryPropertyType type, int* value) {
   cudnnStatus_t status = cudnnGetProperty(type, value);
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -432,7 +271,7 @@ port::Status GetCudnnProperty(libraryPropertyType type, int* value) {
         port::StrCat("cudnnGetProperty failed for type: ", ToString(type),
                      " with status: ", ToString(status));
     LOG(ERROR) << error;
-    return port::Status{port::error::INTERNAL, error};
+    return port::Status(port::error::INTERNAL, error);
   }
   return port::Status::OK();
 }
@@ -453,37 +292,21 @@ cudnnRNNAlgo_t ToCudnnRNNAlgo(const dnn::AlgorithmDesc& algorithm) {
     }
   }
 }
-#endif
 
 port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
-#if CUDNN_VERSION >= 6000
   TF_RETURN_IF_ERROR(GetCudnnProperty(MAJOR_VERSION, &version->major_version));
   TF_RETURN_IF_ERROR(GetCudnnProperty(MINOR_VERSION, &version->minor_version));
   TF_RETURN_IF_ERROR(GetCudnnProperty(PATCH_LEVEL, &version->patch_level));
-#else
-  size_t loaded_version = ::cudnnGetVersion();
-  version->major_version = loaded_version / 1000;
-  version->minor_version = (loaded_version / 100) % 10;
-  version->patch_level = loaded_version % 100;
-#endif
   return port::Status::OK();
 }
 
 }  // namespace
 
-CudnnSupport::CudnnSupport(CUDAExecutor* parent)
-    : parent_(parent), dnn_handle_(nullptr), current_dnn_stream_(nullptr) {}
-
-CudnnSupport::~CudnnSupport() {
-  auto status = wrap::cudnnDestroy(parent_, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "could not destroy cudnn handle: " << ToString(status);
-  }
-}
+CudnnSupport::CudnnSupport(CUDAExecutor* parent) : parent_(parent) {}
 
 port::Status CudnnSupport::Init() {
-  auto status = wrap::cudnnCreate(
-      parent_, reinterpret_cast<cudnnHandle_t*>(&dnn_handle_));
+  cudnnHandle_t cudnn_handle = nullptr;
+  auto status = cudnnCreate(&cudnn_handle);
   if (status == CUDNN_STATUS_SUCCESS) {
     CudnnVersion source_version(CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
 
@@ -499,9 +322,10 @@ port::Status CudnnSupport::Init() {
           "from sources, make sure the library loaded at runtime is compatible "
           "with the version specified during compile configuration.");
       LOG(ERROR) << error;
-      return port::Status{port::error::INTERNAL, error};
+      return port::Status(port::error::INTERNAL, error);
     }
 
+    cudnn_.reset(new CudnnAccess(cudnn_handle));
     return port::Status::OK();
   }
 
@@ -525,9 +349,9 @@ port::Status CudnnSupport::Init() {
     }
   }
 
-  return port::Status{port::error::INTERNAL,
+  return port::Status(port::error::INTERNAL,
                       port::StrCat("cudnn library could not create a handle: ",
-                                   ToString(status))};
+                                   ToString(status)));
 }
 
 port::StatusOr<perftools::gputools::dnn::VersionInfo>
@@ -538,14 +362,15 @@ CudnnSupport::GetVersion() {
       version.major_version, version.minor_version, version.patch_level);
 }
 
+namespace {
+
 // Turns a BatchDescriptor structure into a cudnn tensor handle within a scope.
 class ScopedTensorDescriptor {
  public:
-  ScopedTensorDescriptor(CUDAExecutor* parent,
-                         const BatchDescriptor& batch_descriptor,
+  ScopedTensorDescriptor(const dnn::BatchDescriptor& batch_descriptor,
                          cudnnDataType_t elem_type)
-      : parent_(parent), handle_(nullptr) {
-    cudnnStatus_t status = wrap::cudnnCreateTensorDescriptor(parent_, &handle_);
+      : handle_(nullptr) {
+    cudnnStatus_t status = cudnnCreateTensorDescriptor(&handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(FATAL) << "could not create cudnn tensor descriptor: "
                  << ToString(status);
@@ -568,8 +393,8 @@ class ScopedTensorDescriptor {
                        &CheckedNarrowing<int64, int>);
         std::transform(dims64.cbegin(), dims64.cend(), dims.begin(),
                        &CheckedNarrowing<int64, int>);
-        status = wrap::cudnnSetTensorNdDescriptor(
-            parent_, handle_, elem_type, nd, dims.data(), strides.data());
+        status = cudnnSetTensorNdDescriptor(handle_, elem_type, nd, dims.data(),
+                                            strides.data());
 
         if (status != CUDNN_STATUS_SUCCESS) {
           LOG(FATAL) << "could not convert BatchDescriptor "
@@ -577,10 +402,9 @@ class ScopedTensorDescriptor {
                      << " to cudnn tensor descriptor: " << ToString(status);
         }
       } break;
-#if CUDNN_VERSION >= 6000
       case dnn::DataLayout::kBatchDepthYX4: {
-        status = wrap::cudnnSetTensor4dDescriptor(
-            parent_, handle_, CUDNN_TENSOR_NCHW_VECT_C, elem_type,
+        status = cudnnSetTensor4dDescriptor(
+            handle_, CUDNN_TENSOR_NCHW_VECT_C, elem_type,
             batch_descriptor.count(), batch_descriptor.feature_map_count(),
             batch_descriptor.height(), batch_descriptor.width());
         if (status != CUDNN_STATUS_SUCCESS) {
@@ -589,7 +413,6 @@ class ScopedTensorDescriptor {
                      << " to cudnn tensor descriptor: " << ToString(status);
         }
       } break;
-#endif
       default:
         LOG(FATAL) << "Unsupported tensor format "
                    << DataLayoutString(batch_descriptor.layout());
@@ -598,7 +421,7 @@ class ScopedTensorDescriptor {
   }
 
   ~ScopedTensorDescriptor() {
-    cudnnStatus_t status = wrap::cudnnDestroyTensorDescriptor(parent_, handle_);
+    cudnnStatus_t status = cudnnDestroyTensorDescriptor(handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(ERROR) << "could not destroy cudnn tensor descriptor: "
                  << ToString(status);
@@ -608,7 +431,6 @@ class ScopedTensorDescriptor {
   cudnnTensorDescriptor_t handle() const { return handle_; }
 
  private:
-  CUDAExecutor* parent_;            // Parent executor. Not owned.
   cudnnTensorDescriptor_t handle_;  // Owned.
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedTensorDescriptor);
@@ -617,18 +439,15 @@ class ScopedTensorDescriptor {
 // Turns a FilterDescriptor structure into a cudnn filter handle within a scope.
 class ScopedFilterDescriptor {
  public:
-  ScopedFilterDescriptor(CUDAExecutor* parent,
-                         const FilterDescriptor& filter_descriptor,
-                         const BatchDescriptor& batch_descriptor,
+  ScopedFilterDescriptor(const dnn::FilterDescriptor& filter_descriptor,
                          cudnnDataType_t elem_type)
-      : parent_(parent), handle_(nullptr) {
-    cudnnStatus_t status = wrap::cudnnCreateFilterDescriptor(parent_, &handle_);
+      : handle_(nullptr) {
+    cudnnStatus_t status = cudnnCreateFilterDescriptor(&handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(FATAL) << "could not create cudnn filter descriptor: "
                  << ToString(status);
     }
 
-#if CUDNN_VERSION >= 5000
     // TODO(b/23032134): Even if the filter layout is not supported,
     // cudnnSetFilter4DDescriptor_v4 will return CUDNN_STATUS_SUCCESS because it
     // does not take layout as an input. Maybe force cuDNN by giving wrong
@@ -638,17 +457,14 @@ class ScopedFilterDescriptor {
       case dnn::FilterLayout::kOutputInputYX:
         format = CUDNN_TENSOR_NCHW;
         break;
-#if CUDNN_VERSION >= 6000
       case dnn::FilterLayout::kOutputInputYX4:
         format = CUDNN_TENSOR_NCHW_VECT_C;
         break;
-#endif
       default:
         LOG(FATAL) << "Unsupported filter format "
                    << FilterLayoutString(filter_descriptor.layout());
         break;
     }
-#endif
 
     std::vector<int> dims(2 + filter_descriptor.ndims());
     dims[0] = filter_descriptor.output_feature_map_count();
@@ -656,11 +472,8 @@ class ScopedFilterDescriptor {
     const auto& spatial_dims = filter_descriptor.input_filter_dims();
     std::copy(spatial_dims.begin(), spatial_dims.end(), dims.begin() + 2);
 
-    status = wrap::cudnnSetFilterNdDescriptor(parent_, handle_, elem_type,
-#if CUDNN_VERSION >= 5000
-                                              format,
-#endif
-                                              dims.size(), dims.data());
+    status = cudnnSetFilterNdDescriptor(handle_, elem_type, format, dims.size(),
+                                        dims.data());
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(FATAL) << "could not set cudnn filter descriptor: "
                  << ToString(status);
@@ -668,7 +481,7 @@ class ScopedFilterDescriptor {
   }
 
   ~ScopedFilterDescriptor() {
-    cudnnStatus_t status = wrap::cudnnDestroyFilterDescriptor(parent_, handle_);
+    cudnnStatus_t status = cudnnDestroyFilterDescriptor(handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(ERROR) << "could not destroy cudnn filter descriptor: "
                  << ToString(status);
@@ -678,11 +491,7 @@ class ScopedFilterDescriptor {
   cudnnFilterDescriptor_t handle() const { return handle_; }
 
  private:
-  // Parent executor object. Not owned.
-  CUDAExecutor* parent_;
-
-  // cudnn filter descriptor this object creates. Owned.
-  cudnnFilterDescriptor_t handle_;
+  cudnnFilterDescriptor_t handle_;  // Owned.
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedFilterDescriptor);
 };
@@ -736,11 +545,10 @@ static bool BatchnormSpatialPersistentEnabled() {
 class ScopedConvolutionDescriptor {
  public:
   ScopedConvolutionDescriptor(
-      CUDAExecutor* parent, const ConvolutionDescriptor& convolution_descriptor,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
       cudnnDataType_t data_type)
-      : parent_(parent), handle_(nullptr) {
-    cudnnStatus_t status =
-        wrap::cudnnCreateConvolutionDescriptor(parent_, &handle_);
+      : handle_(nullptr) {
+    cudnnStatus_t status = cudnnCreateConvolutionDescriptor(&handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(FATAL) << "could not create cudnn convolution descriptor: "
                  << ToString(status);
@@ -766,9 +574,9 @@ class ScopedConvolutionDescriptor {
     std::transform(dilations64.cbegin(), dilations64.cend(), dilations.begin(),
                    &CheckedNarrowing<int64, int>);
 
-    status = wrap::cudnnSetConvolutionNdDescriptor(
-        parent_, handle_, convolution_descriptor.ndims(), padding.data(),
-        strides.data(), dilations.data(),
+    status = cudnnSetConvolutionNdDescriptor(
+        handle_, convolution_descriptor.ndims(), padding.data(), strides.data(),
+        dilations.data(),
         // NOTE(keveman): cuDNN supports convolution and cross correlation.
         // However, almost all the use cases do cross correlation, so just
         // hard coding it here.
@@ -785,8 +593,8 @@ class ScopedConvolutionDescriptor {
 #if CUDNN_MAJOR >= 7
     VLOG(2) << "Requesting grouped convolution: "
             << convolution_descriptor.group_count();
-    status = wrap::cudnnSetConvolutionGroupCount(
-        parent_, handle_, convolution_descriptor.group_count());
+    status = cudnnSetConvolutionGroupCount(
+        handle_, convolution_descriptor.group_count());
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(FATAL) << "could not set cudnn convolution group count: "
                  << ToString(status);
@@ -802,8 +610,7 @@ class ScopedConvolutionDescriptor {
     cudnnMathType_t math_type =
         (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
     if (TensorOpMathEnabled()) {
-      cudnnStatus_t status =
-          wrap::cudnnSetConvolutionMathType(parent_, handle_, math_type);
+      cudnnStatus_t status = cudnnSetConvolutionMathType(handle_, math_type);
       if (status != CUDNN_STATUS_SUCCESS) {
         LOG(FATAL) << "could not set cudnn convolution math type: "
                    << ToString(status);
@@ -813,8 +620,7 @@ class ScopedConvolutionDescriptor {
   }
 
   ~ScopedConvolutionDescriptor() {
-    cudnnStatus_t status =
-        wrap::cudnnDestroyConvolutionDescriptor(parent_, handle_);
+    cudnnStatus_t status = cudnnDestroyConvolutionDescriptor(handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(ERROR) << "could not destroy cudnn convolution descriptor: "
                  << ToString(status);
@@ -824,7 +630,6 @@ class ScopedConvolutionDescriptor {
   cudnnConvolutionDescriptor_t handle() const { return handle_; }
 
  private:
-  CUDAExecutor* parent_;                 // Parent executor. Not owned.
   cudnnConvolutionDescriptor_t handle_;  // Owned.
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedConvolutionDescriptor);
@@ -834,11 +639,10 @@ class ScopedConvolutionDescriptor {
 // within a scope.
 class ScopedPoolingDescriptor {
  public:
-  ScopedPoolingDescriptor(CUDAExecutor* parent,
-                          const PoolingDescriptor& pooling_descriptor)
-      : parent_(parent), handle_(nullptr) {
-    cudnnStatus_t status =
-        wrap::cudnnCreatePoolingDescriptor(parent_, &handle_);
+  explicit ScopedPoolingDescriptor(
+      const dnn::PoolingDescriptor& pooling_descriptor)
+      : handle_(nullptr) {
+    cudnnStatus_t status = cudnnCreatePoolingDescriptor(&handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(FATAL) << "could not create cudnn pooling descriptor: "
                  << ToString(status);
@@ -858,23 +662,20 @@ class ScopedPoolingDescriptor {
     std::transform(shape64.cbegin(), shape64.cend(), shape.begin(),
                    &CheckedNarrowing<int64, int>);
     bool propagate_nans = pooling_descriptor.propagate_nans();
-    status = wrap::cudnnSetPoolingNdDescriptor(
-        parent_, handle_,
+    status = cudnnSetPoolingNdDescriptor(
+        handle_,
         (pooling_descriptor.mode() == dnn::PoolingMode::kMaximum
              ? CUDNN_POOLING_MAX
              : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
-#if CUDNN_VERSION >= 5000
-        propagate_nans ? CUDNN_PROPAGATE_NAN : CUDNN_NOT_PROPAGATE_NAN,
-#endif
-        nd, shape.data(), padding.data(), strides.data());
+        propagate_nans ? CUDNN_PROPAGATE_NAN : CUDNN_NOT_PROPAGATE_NAN, nd,
+        shape.data(), padding.data(), strides.data());
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(FATAL) << "could not set cudnn pooling descriptor: "
                  << ToString(status);
     }
   }
   ~ScopedPoolingDescriptor() {
-    cudnnStatus_t status =
-        wrap::cudnnDestroyPoolingDescriptor(parent_, handle_);
+    cudnnStatus_t status = cudnnDestroyPoolingDescriptor(handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(ERROR) << "could not destroy cudnn pooling descriptor: "
                  << ToString(status);
@@ -884,7 +685,6 @@ class ScopedPoolingDescriptor {
   cudnnPoolingDescriptor_t handle() const { return handle_; }
 
  private:
-  CUDAExecutor* parent_;             // Parent executor. Not owned.
   cudnnPoolingDescriptor_t handle_;  // Owned.
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
@@ -893,10 +693,10 @@ class ScopedPoolingDescriptor {
 // Turns a NormalizeDescriptor structure into a cudnn LRN descriptor handle.
 class ScopedNormalizeDescriptor {
  public:
-  ScopedNormalizeDescriptor(CUDAExecutor* parent,
-                            const NormalizeDescriptor& normalize_descriptor)
-      : parent_(parent), handle_(nullptr) {
-    cudnnStatus_t status = wrap::cudnnCreateLRNDescriptor(parent_, &handle_);
+  explicit ScopedNormalizeDescriptor(
+      const dnn::NormalizeDescriptor& normalize_descriptor)
+      : handle_(nullptr) {
+    cudnnStatus_t status = cudnnCreateLRNDescriptor(&handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(FATAL) << "could not create cudnn LRN descriptor: "
                  << ToString(status);
@@ -922,15 +722,14 @@ class ScopedNormalizeDescriptor {
 
     double lrnBeta = normalize_descriptor.beta();
     double lrnK = normalize_descriptor.bias();
-    status = wrap::cudnnSetLRNDescriptor(parent_, handle_, lrnN, lrnAlpha,
-                                         lrnBeta, lrnK);
+    status = cudnnSetLRNDescriptor(handle_, lrnN, lrnAlpha, lrnBeta, lrnK);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(FATAL) << "could not set cudnn LRN descriptor: " << ToString(status);
     }
   }
 
   ~ScopedNormalizeDescriptor() {
-    cudnnStatus_t status = wrap::cudnnDestroyLRNDescriptor(parent_, handle_);
+    cudnnStatus_t status = cudnnDestroyLRNDescriptor(handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(ERROR) << "could not destroy cudnn LRN descriptor: "
                  << ToString(status);
@@ -940,24 +739,20 @@ class ScopedNormalizeDescriptor {
   cudnnLRNDescriptor_t handle() const { return handle_; }
 
  private:
-  CUDAExecutor* parent_;         // Parent executor. Not owned.
   cudnnLRNDescriptor_t handle_;  // Owned.
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedNormalizeDescriptor);
 };
 
-#if CUDNN_VERSION >= 5000
 // Turns a ActivationDescriptor structure into a cudnn activation
 // descriptor handle within a scope.
 class ScopedActivationDescriptor {
  public:
-  ScopedActivationDescriptor(CUDAExecutor* parent,
-                             dnn::ActivationMode activation_mode,
+  ScopedActivationDescriptor(dnn::ActivationMode activation_mode,
                              cudnnNanPropagation_t nan_propagation,
                              double value_max)
-      : parent_(parent), handle_(nullptr) {
-    cudnnStatus_t status =
-        wrap::cudnnCreateActivationDescriptor(parent_, &handle_);
+      : handle_(nullptr) {
+    cudnnStatus_t status = cudnnCreateActivationDescriptor(&handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(FATAL) << "could not create cudnn activation descriptor: "
                  << ToString(status);
@@ -988,8 +783,8 @@ class ScopedActivationDescriptor {
                    << static_cast<int>(activation_mode);
     }
 
-    status = wrap::cudnnSetActivationDescriptor(parent_, handle_, mode,
-                                                nan_propagation, relu_ceiling);
+    status = cudnnSetActivationDescriptor(handle_, mode, nan_propagation,
+                                          relu_ceiling);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(FATAL) << "could not set cudnn activation descriptor: "
                  << ToString(status);
@@ -997,8 +792,7 @@ class ScopedActivationDescriptor {
   }
 
   ~ScopedActivationDescriptor() {
-    cudnnStatus_t status =
-        wrap::cudnnDestroyActivationDescriptor(parent_, handle_);
+    cudnnStatus_t status = cudnnDestroyActivationDescriptor(handle_);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(ERROR) << "could not destroy cudnn activation descriptor: "
                  << ToString(status);
@@ -1008,14 +802,11 @@ class ScopedActivationDescriptor {
   cudnnActivationDescriptor_t handle() const { return handle_; }
 
  private:
-  CUDAExecutor* parent_;                // Parent executor. Not owned.
   cudnnActivationDescriptor_t handle_;  // Owned.
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivationDescriptor);
 };
-#endif
 
-namespace {
 cudnnDataType_t ToCudnnDataType(
     dnn::DataType data_type,
     dnn::DataLayout data_layout = dnn::DataLayout::kBatchDepthYX) {
@@ -1024,18 +815,14 @@ cudnnDataType_t ToCudnnDataType(
     case dnn::DataType::kDouble:
     case dnn::DataType::kHalf:
       return static_cast<cudnnDataType_t>(data_type);
-#if CUDNN_VERSION >= 6000
     case dnn::DataType::kInt8:
       return data_layout == dnn::DataLayout::kBatchDepthYX4 ? CUDNN_DATA_INT8x4
                                                             : CUDNN_DATA_INT8;
-#endif
     default:
       LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
   }
 }
 
-#if CUDNN_VERSION >= 5000
-
 cudnnRNNInputMode_t ToCudnnRnnInputMode(dnn::RnnInputMode input_mode) {
   switch (input_mode) {
     case dnn::RnnInputMode::kRnnLinearSkip:
@@ -1083,17 +870,11 @@ int CudnnDataTypeToByteSize(cudnnDataType_t data_type) {
   }
 }
 
-#endif  // CUDNN_VERSION
-
 template <typename Base>
 class MixinBase : public Base {};
 template <>
 class MixinBase<void> {};
 
-}  // namespace
-
-#if CUDNN_VERSION >= 5000
-
 #define CUDNN_RETURN_IF_FAIL(STATUS, ...)                                \
   if (!SE_PREDICT_TRUE((STATUS) == CUDNN_STATUS_SUCCESS)) {              \
     string error_msg = port::StrCat(ToString(STATUS), " ", __VA_ARGS__); \
@@ -1102,6 +883,7 @@ class MixinBase<void> {};
     return;                                                              \
   }
 
+// TODO(csigg): Remove inheritance for code reuse.
 template <typename Base>
 class CudnnDescriptorCommon : public MixinBase<Base> {
  public:
@@ -1115,12 +897,11 @@ class CudnnDescriptorCommon : public MixinBase<Base> {
 
 class CudnnDropoutDescriptor : public CudnnDescriptorCommon<void> {
  public:
-  CudnnDropoutDescriptor(CUDAExecutor* parent, cudnnHandle_t cudnn_handle,
-                         float dropout, uint64 seed,
+  CudnnDropoutDescriptor(const CudnnHandle& cudnn, float dropout, uint64 seed,
                          ScratchAllocator* state_allocator)
-      : parent_(parent), handle_(nullptr) {
+      : handle_(nullptr) {
     cudnnStatus_t status;
-    status = wrap::cudnnCreateDropoutDescriptor(parent_, &handle_);
+    status = cudnnCreateDropoutDescriptor(&handle_);
     CUDNN_RETURN_IF_FAIL(status, "Failed to create dropout descriptor");
 
     if (dropout == 0.f) {
@@ -1130,8 +911,7 @@ class CudnnDropoutDescriptor : public CudnnDescriptorCommon<void> {
     DeviceMemory<uint8> state_memory;
     if (state_allocator) {
       size_t state_sizes_in_bytes = 0;
-      status = wrap::cudnnDropoutGetStatesSize(parent_, cudnn_handle,
-                                               &state_sizes_in_bytes);
+      status = cudnnDropoutGetStatesSize(cudnn.handle(), &state_sizes_in_bytes);
       CUDNN_RETURN_IF_FAIL(status, "Failed to query dropout state sizes");
 
       auto allocated =
@@ -1146,9 +926,9 @@ class CudnnDropoutDescriptor : public CudnnDescriptorCommon<void> {
         return;
       }
     }
-    status = wrap::cudnnSetDropoutDescriptor(parent_, handle_, cudnn_handle,
-                                             dropout, state_memory.opaque(),
-                                             state_memory.size(), seed);
+    status = cudnnSetDropoutDescriptor(handle_, cudnn.handle(), dropout,
+                                       state_memory.opaque(),
+                                       state_memory.size(), seed);
     CUDNN_RETURN_IF_FAIL(
         status, port::StrCat(
                     "Failed to set dropout descriptor with state memory size: ",
@@ -1156,11 +936,9 @@ class CudnnDropoutDescriptor : public CudnnDescriptorCommon<void> {
   }
 
   ~CudnnDropoutDescriptor() {
-    if (handle_) {
-      cudnnStatus_t status =
-          wrap::cudnnDestroyDropoutDescriptor(parent_, handle_);
-      CUDNN_RETURN_IF_FAIL(status, "Failed to destroy Cudnn dropout handle: ");
-    }
+    cudnnStatus_t status = cudnnDestroyDropoutDescriptor(handle_);
+    // TODO(csigg): This is a no-op (error is not reported). Same below.
+    CUDNN_RETURN_IF_FAIL(status, "Failed to destroy Cudnn dropout handle: ");
   }
 
   cudnnDropoutDescriptor_t handle() const {
@@ -1169,8 +947,7 @@ class CudnnDropoutDescriptor : public CudnnDescriptorCommon<void> {
   }
 
  private:
-  CUDAExecutor* parent_;
-  cudnnDropoutDescriptor_t handle_;
+  cudnnDropoutDescriptor_t handle_;  // Owned.
   float dropout_;
   uint64 seed_;
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnDropoutDescriptor);
@@ -1180,10 +957,10 @@ class CudnnRnnParamsDescriptor : public CudnnDescriptorCommon<void> {
  public:
   typedef dnn::RnnDescriptor::ParamsRegion ParamsRegion;
   typedef dnn::RnnDescriptor::ParamsRegions ParamsRegions;
-  CudnnRnnParamsDescriptor(CUDAExecutor* parent, cudnnHandle_t cudnn_handle,
+  CudnnRnnParamsDescriptor(const CudnnHandle& cudnn,
                            const CudnnRnnDescriptor& rnn_desc);
   ~CudnnRnnParamsDescriptor() {
-    cudnnStatus_t status = wrap::cudnnDestroyFilterDescriptor(parent_, handle_);
+    cudnnStatus_t status = cudnnDestroyFilterDescriptor(handle_);
     CUDNN_RETURN_IF_FAIL(status, "Failed to destroy RNN filter descriptor");
   }
   cudnnFilterDescriptor_t handle() const {
@@ -1202,7 +979,6 @@ class CudnnRnnParamsDescriptor : public CudnnDescriptorCommon<void> {
 
  private:
   int GetRegionCountPerLayer() const;
-  CUDAExecutor* parent_;
   cudnnFilterDescriptor_t handle_;
   const CudnnRnnDescriptor* rnn_desc_;
   int64 params_size_in_bytes_;
@@ -1211,26 +987,25 @@ class CudnnRnnParamsDescriptor : public CudnnDescriptorCommon<void> {
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnParamsDescriptor);
 };
 
+}  // namespace
+
 class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
  public:
-  CudnnRnnDescriptor(CUDAExecutor* parent, cudnnHandle_t cudnn_handle,
-                     int num_layers, int hidden_size, int input_size,
-                     int batch_size, cudnnRNNInputMode_t input_mode,
+  CudnnRnnDescriptor(const CudnnHandle& cudnn, int num_layers, int hidden_size,
+                     int input_size, int batch_size,
+                     cudnnRNNInputMode_t input_mode,
                      cudnnDirectionMode_t direction_mode,
                      cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type,
                      cudnnDataType_t compute_type,
                      const dnn::AlgorithmConfig& algorithm_config,
                      float dropout, uint64 seed,
                      ScratchAllocator* state_allocator)
-      : parent_(parent),
-        rnn_desc_(nullptr),
+      : rnn_desc_(nullptr),
         num_layers_(num_layers),
         hidden_size_(hidden_size),
         input_size_(input_size),
         batch_size_(batch_size),
-#if CUDNN_VERSION >= 6000
         rnn_plan_(nullptr),
-#endif
         input_mode_(input_mode),
         direction_mode_(direction_mode),
         rnn_mode_(rnn_mode),
@@ -1238,21 +1013,20 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
         compute_type_(compute_type),
         algorithm_config_(algorithm_config) {
     // Create the dropout handle.
-    cudnn_dropout_desc_.reset(new CudnnDropoutDescriptor(
-        parent, cudnn_handle, dropout, seed, state_allocator));
+    cudnn_dropout_desc_.reset(
+        new CudnnDropoutDescriptor(cudnn, dropout, seed, state_allocator));
     if (!cudnn_dropout_desc_->ok()) {
       SetFailure(cudnn_dropout_desc_->Status());
       return;
     }
 
     // Create the RNN handle
-    cudnnStatus_t status = wrap::cudnnCreateRNNDescriptor(parent_, &rnn_desc_);
+    cudnnStatus_t status = cudnnCreateRNNDescriptor(&rnn_desc_);
     CUDNN_RETURN_IF_FAIL(status, "Unable to create RNN descriptor");
-#if CUDNN_VERSION >= 6000
     // TODO: allow the user to choose an algorithm.
     rnn_algo_ = ToCudnnRNNAlgo(algorithm_config_.algorithm());
-    status = wrap::cudnnSetRNNDescriptor_v6(
-        parent, cudnn_handle, /*rnnDesc=*/rnn_desc_, /*hiddenSize=*/hidden_size,
+    status = cudnnSetRNNDescriptor_v6(
+        cudnn.handle(), /*rnnDesc=*/rnn_desc_, /*hiddenSize=*/hidden_size,
         /*numLayers=*/num_layers, /*dropoutDesc=*/dropout_handle(),
         /*inputMode=*/input_mode, /*direction=*/direction_mode,
         /*mode=*/rnn_mode, /*algo=*/rnn_algo_, /*dataType=*/compute_type);
@@ -1264,26 +1038,15 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
 
     if (rnn_algo_ == CUDNN_RNN_ALGO_PERSIST_DYNAMIC) {
       CHECK_GE(batch_size_, 0);
-      status = wrap::cudnnCreatePersistentRNNPlan(
-          parent, rnn_desc_, batch_size_, data_type_, &rnn_plan_);
+      status = cudnnCreatePersistentRNNPlan(rnn_desc_, batch_size_, data_type_,
+                                            &rnn_plan_);
       CUDNN_RETURN_IF_FAIL(status, "Unable to create persistent RNN plan.");
-      status = wrap::cudnnSetPersistentRNNPlan(parent, rnn_desc_, rnn_plan_);
+      status = cudnnSetPersistentRNNPlan(rnn_desc_, rnn_plan_);
       CUDNN_RETURN_IF_FAIL(status, "Unable to update persistent RNN plan.");
     }
-#else
-    CHECK(algorithm_config_.is_default())
-        << "Non-default algorithm not supported for CUDA version < 6.0";
-    status = wrap::cudnnSetRNNDescriptor(
-        parent, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
-        num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
-        input_mode /*inputMode*/, direction_mode /*direction*/,
-        rnn_mode /*mode*/, compute_type /*dataType*/);
-    CUDNN_RETURN_IF_FAIL(status, "Unable to update RNN descriptor");
-#endif
 
     // Create the params handle.
-    cudnn_params_desc_.reset(
-        new CudnnRnnParamsDescriptor(parent, cudnn_handle, *this));
+    cudnn_params_desc_.reset(new CudnnRnnParamsDescriptor(cudnn, *this));
     if (!cudnn_params_desc_->ok()) {
       SetFailure(cudnn_params_desc_->Status());
       return;
@@ -1293,13 +1056,11 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   ~CudnnRnnDescriptor() override {
     if (rnn_desc_) {
       cudnnStatus_t status;
-#if CUDNN_VERSION >= 6000
       if (rnn_algo_ == CUDNN_RNN_ALGO_PERSIST_DYNAMIC && rnn_plan_) {
-        status = wrap::cudnnDestroyPersistentRNNPlan(parent_, rnn_plan_);
+        status = cudnnDestroyPersistentRNNPlan(rnn_plan_);
         CUDNN_RETURN_IF_FAIL(status, "Unable to destroy persistent RNN plan.");
       }
-#endif
-      status = wrap::cudnnDestroyRNNDescriptor(parent_, rnn_desc_);
+      status = cudnnDestroyRNNDescriptor(rnn_desc_);
       CUDNN_RETURN_IF_FAIL(status, "Unable to destroy RNN descriptor");
     }
   }
@@ -1308,11 +1069,9 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
     cudnnMathType_t math_type =
         (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
     if (RnnTensorOpMathEnabled()) {
-      cudnnStatus_t status =
-          wrap::cudnnSetRNNMatrixMathType(parent_, rnn_desc_, math_type);
+      cudnnStatus_t status = cudnnSetRNNMatrixMathType(rnn_desc_, math_type);
       if (status != CUDNN_STATUS_SUCCESS) {
-        LOG(FATAL) << "could not set cudnn RNN math type: "
-                   << ToString(status);
+        LOG(FATAL) << "could not set cudnn RNN math type: " << ToString(status);
       }
     }
 #endif
@@ -1354,7 +1113,6 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   }
 
  private:
-  CUDAExecutor* parent_;
   cudnnRNNDescriptor_t rnn_desc_;
   int num_layers_;
   int hidden_size_;
@@ -1362,10 +1120,8 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   // batch_size_ is set to -1 when not using CUDNN_RNN_ALGO_PERSIST_DYNAMIC
   // algorithm.
   int batch_size_;
-#if CUDNN_VERSION >= 6000
   cudnnRNNAlgo_t rnn_algo_;
   cudnnPersistentRNNPlan_t rnn_plan_;
-#endif
   cudnnRNNInputMode_t input_mode_;
   cudnnDirectionMode_t direction_mode_;
   cudnnRNNMode_t rnn_mode_;
@@ -1377,44 +1133,42 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnDescriptor);
 };
 
+namespace {
+
 CudnnRnnParamsDescriptor::CudnnRnnParamsDescriptor(
-    CUDAExecutor* parent, cudnnHandle_t cudnn_handle,
-    const CudnnRnnDescriptor& rnn_desc)
-    : parent_(parent),
-      handle_(nullptr),
-      rnn_desc_(&rnn_desc),
-      params_size_in_bytes_(0) {
+    const CudnnHandle& cudnn, const CudnnRnnDescriptor& rnn_desc)
+    : handle_(nullptr), rnn_desc_(&rnn_desc), params_size_in_bytes_(0) {
   cudnnTensorDescriptor_t input_desc = nullptr;
   {
     // Query the params size.
-    auto status = wrap::cudnnCreateTensorDescriptor(parent, &input_desc);
+    auto status = cudnnCreateTensorDescriptor(&input_desc);
     CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to create tensor descriptor");
     int dims[] = {1, rnn_desc.input_size(), 1};
     int strides[] = {dims[1] * dims[2], dims[2], 1};
-    status = wrap::cudnnSetTensorNdDescriptor(
-        parent, input_desc /*tensorDesc*/, rnn_desc.data_type() /*dataType*/,
-        sizeof(dims) / sizeof(dims[0]) /*nbDims*/, dims /*dimA*/,
-        strides /*strideA*/);
+    status = cudnnSetTensorNdDescriptor(
+        /*tensorDesc=*/input_desc, /*dataType=*/rnn_desc.data_type(),
+        /*nbDims=*/sizeof(dims) / sizeof(dims[0]), /*dimA=*/dims,
+        /*strideA=*/strides);
     CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to set tensor descriptor");
 
     size_t params_size = 0;
-    status = wrap::cudnnGetRNNParamsSize(
-        parent, cudnn_handle /*handle*/, rnn_desc.handle() /*rnnDesc*/,
-        input_desc /*xDesc*/, &params_size /*sizeInBytes*/,
-        rnn_desc.data_type() /*dataType*/);
+    status = cudnnGetRNNParamsSize(
+        /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+        /*xDesc=*/input_desc, /*sizeInBytes=*/&params_size,
+        /*dataType=*/rnn_desc.data_type());
     CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to get RNN parameter size");
     params_size_in_bytes_ = static_cast<int64>(params_size);
   }
 
   {
     // Create the params descriptor.
-    auto status = wrap::cudnnCreateFilterDescriptor(parent, &handle_);
+    auto status = cudnnCreateFilterDescriptor(&handle_);
     CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to create RNN filter descriptor");
     int dims[] = {static_cast<int>(params_size_in_bytes_), 1, 1};
-    status = wrap::cudnnSetFilterNdDescriptor(
-        parent, handle_ /*filterDesc*/, rnn_desc.data_type() /*dataType*/,
-        CUDNN_TENSOR_NCHW /*format*/, sizeof(dims) / sizeof(dims[0]) /*nbDims*/,
-        dims /*filterDimA*/);
+    status = cudnnSetFilterNdDescriptor(
+        /*filterDesc=*/handle_, /*dataType=*/rnn_desc.data_type(),
+        /*format=*/CUDNN_TENSOR_NCHW, /*nbDims=*/sizeof(dims) / sizeof(dims[0]),
+        /*filterDimA=*/dims);
     CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to update RNN filter descriptor");
   }
 
@@ -1422,8 +1176,7 @@ CudnnRnnParamsDescriptor::CudnnRnnParamsDescriptor(
     // Create the weights and biases into the params buffer
     int region_count_per_layer = GetRegionCountPerLayer();
     cudnnFilterDescriptor_t region_desc_handle = nullptr;
-    auto status =
-        wrap::cudnnCreateFilterDescriptor(parent, &region_desc_handle);
+    auto status = cudnnCreateFilterDescriptor(&region_desc_handle);
     CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to create filter descriptor");
     const int layer_count = rnn_desc.direction_mode() == CUDNN_UNIDIRECTIONAL
                                 ? rnn_desc.num_layers()
@@ -1433,21 +1186,21 @@ CudnnRnnParamsDescriptor::CudnnRnnParamsDescriptor(
         for (int type = 0; type < 2; type++) {
           void* offset = nullptr;
           if (type == 0) {
-            status = wrap::cudnnGetRNNLinLayerMatrixParams(
-                parent, cudnn_handle /*handle*/, rnn_desc.handle() /*rnnDesc*/,
-                layer /*layer*/, input_desc /*xDesc*/, handle_ /*wDesc*/,
-                nullptr /*w*/, region /*linLayerID*/,
-                region_desc_handle /*linLayerMatDesc*/,
-                &offset /*linLayerMat*/);
+            status = cudnnGetRNNLinLayerMatrixParams(
+                /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+                /*layer=*/layer, /*xDesc=*/input_desc, /*wDesc=*/handle_,
+                /*w=*/nullptr, /*linLayerID=*/region,
+                /*linLayerMatDesc=*/region_desc_handle,
+                /*linLayerMat=*/&offset);
             CUDNN_RETURN_IF_FAIL(
                 status, "Cudnn fails to call cudnnGetRNNLinLayerMatrixParams");
           } else {
-            status = wrap::cudnnGetRNNLinLayerBiasParams(
-                parent, cudnn_handle /*rnnDesc*/, rnn_desc.handle() /*rnnDesc*/,
-                layer /*layer*/, input_desc /*xDesc*/, handle_ /*wDesc*/,
-                nullptr /*w*/, region /*linLayerID*/,
-                region_desc_handle /*linLayerBiasDesc*/,
-                &offset /*linLayerBias*/);
+            status = cudnnGetRNNLinLayerBiasParams(
+                /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+                /*layer=*/layer, /*xDesc=*/input_desc, /*wDesc=*/handle_,
+                /*w=*/nullptr, /*linLayerID=*/region,
+                /*linLayerBiasDesc=*/region_desc_handle,
+                /*linLayerBias=*/&offset);
             CUDNN_RETURN_IF_FAIL(
                 status, "Cudnn fails to call cudnnGetRNNLinLayerBiasParams");
           }
@@ -1455,15 +1208,15 @@ CudnnRnnParamsDescriptor::CudnnRnnParamsDescriptor(
           cudnnDataType_t data_type;
           cudnnTensorFormat_t tensor_format;
           int n_dims;
-          status = wrap::cudnnGetFilterNdDescriptor(
-              parent, region_desc_handle /*filterDesc*/,
-              sizeof(dims) / sizeof(dims[0]) /*nbDimsRequested*/,
-              &data_type /*dataType*/, &tensor_format /*format*/,
-              &n_dims /*nbDims*/, dims /*filterDimA*/);
+          status = cudnnGetFilterNdDescriptor(
+              /*filterDesc=*/region_desc_handle,
+              /*nbDimsRequested=*/sizeof(dims) / sizeof(dims[0]),
+              /*dataType=*/&data_type, /*format=*/&tensor_format,
+              /*nbDims=*/&n_dims, /*filterDimA=*/dims);
           CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to get filter description");
           int64 size = dims[0] * dims[1] * dims[2] *
                        CudnnDataTypeToByteSize(rnn_desc.data_type());
-          auto region = ParamsRegion{reinterpret_cast<int64>(offset), size};
+          ParamsRegion region = {reinterpret_cast<int64>(offset), size};
           if (type == 0) {
             weights_.push_back(region);
           } else {
@@ -1472,13 +1225,13 @@ CudnnRnnParamsDescriptor::CudnnRnnParamsDescriptor(
         }
       }
     }
-    status = wrap::cudnnDestroyFilterDescriptor(parent, region_desc_handle);
+    status = cudnnDestroyFilterDescriptor(region_desc_handle);
     CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to destroy filter descriptor");
   }
 
   {
     // Release the dummy input tensor descriptor.
-    auto status = wrap::cudnnDestroyTensorDescriptor(parent, input_desc);
+    auto status = cudnnDestroyTensorDescriptor(input_desc);
     CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to destroy tensor descriptor");
   }
 }
@@ -1498,6 +1251,8 @@ int CudnnRnnParamsDescriptor::GetRegionCountPerLayer() const {
   }
 }
 
+}  // namespace
+
 class CudnnRnnSequenceTensorDescriptor
     : public CudnnDescriptorCommon<dnn::RnnSequenceTensorDescriptor> {
  public:
@@ -1517,14 +1272,14 @@ class CudnnRnnSequenceTensorDescriptor
       SetFailure(port::Status(port::error::UNKNOWN, error_msg));
       return;
     }
-    cudnnStatus_t status = wrap::cudnnCreateTensorDescriptor(parent, &handle);
+    cudnnStatus_t status = cudnnCreateTensorDescriptor(&handle);
     CUDNN_RETURN_IF_FAIL(status, "Failed to create tensor descriptor");
     int dims[] = {batch_size, data_size, 1};
     int strides[] = {dims[1] * dims[2], dims[2], 1};
-    status = wrap::cudnnSetTensorNdDescriptor(
-        parent, handle /*tensorDesc*/, data_type /*dataType*/,
-        sizeof(dims) / sizeof(dims[0]) /*nbDims*/, dims /*dimA*/,
-        strides /*strideA*/);
+    status = cudnnSetTensorNdDescriptor(
+        /*tensorDesc=*/handle, /*dataType=*/data_type,
+        /*nbDims=*/sizeof(dims) / sizeof(dims[0]), /*dimA=*/dims,
+        /*strideA=*/strides);
     CUDNN_RETURN_IF_FAIL(status, "Failed to update tensor descriptor");
     // Replicate handle across the number of steps.
     handles_.assign(seq_length, handle);
@@ -1532,8 +1287,7 @@ class CudnnRnnSequenceTensorDescriptor
 
   ~CudnnRnnSequenceTensorDescriptor() override {
     // Only the first one needs to be destroyed. All others are the same.
-    cudnnStatus_t status =
-        wrap::cudnnDestroyTensorDescriptor(parent_, handles_[0]);
+    cudnnStatus_t status = cudnnDestroyTensorDescriptor(handles_[0]);
     CUDNN_RETURN_IF_FAIL(status,
                          "Failed to destroy sequence tensor descriptor");
   }
@@ -1570,21 +1324,20 @@ class CudnnRnnStateTensorDescriptor
         batch_size_(batch_size),
         data_size_(data_size),
         data_type_(data_type) {
-    cudnnStatus_t status = wrap::cudnnCreateTensorDescriptor(parent, &handle_);
+    cudnnStatus_t status = cudnnCreateTensorDescriptor(&handle_);
     CUDNN_RETURN_IF_FAIL(status, "Failed to create tensor descriptor");
     int dims[] = {num_layers, batch_size, data_size};
     int strides[] = {dims[1] * dims[2], dims[2], 1};
-    status = wrap::cudnnSetTensorNdDescriptor(
-        parent, handle_ /*tensorDesc*/, data_type /*dataType*/,
-        sizeof(dims) / sizeof(dims[0]) /*nbDims*/, dims /*dimA*/,
-        strides /*strideA*/);
+    status = cudnnSetTensorNdDescriptor(
+        /*tensorDesc=*/handle_, /*dataType=*/data_type,
+        /*nbDims=*/sizeof(dims) / sizeof(dims[0]), /*dimA=*/dims,
+        /*strideA=*/strides);
     CUDNN_RETURN_IF_FAIL(status, "Failed to update tensor descriptor");
   }
 
   ~CudnnRnnStateTensorDescriptor() override {
     if (!handle_) {
-      cudnnStatus_t status =
-          wrap::cudnnDestroyTensorDescriptor(parent_, handle_);
+      cudnnStatus_t status = cudnnDestroyTensorDescriptor(handle_);
       CUDNN_RETURN_IF_FAIL(status, "Unable to destroy RNN state tensor");
     }
   }
@@ -1679,14 +1432,14 @@ bool ExtractAndCheckRnnForward(
   return true;
 }
 
-bool CheckRNNParameterSize(CUDAExecutor* parent, cudnnHandle_t cudnn_handle,
+bool CheckRNNParameterSize(const CudnnHandle& cudnn,
                            const CudnnRnnDescriptor& rnn_desc,
                            const CudnnRnnSequenceTensorDescriptor& input_desc) {
   size_t params_size_in_bytes = 0;
-  cudnnStatus_t status = wrap::cudnnGetRNNParamsSize(
-      parent, cudnn_handle /*handle*/, rnn_desc.handle() /*rnnDesc*/,
-      input_desc.handles()[0] /*xDesc*/, &params_size_in_bytes /*sizeInBytes*/,
-      rnn_desc.data_type() /*dataType*/);
+  cudnnStatus_t status = cudnnGetRNNParamsSize(
+      /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+      /*xDesc=*/input_desc.handles()[0], /*sizeInBytes=*/&params_size_in_bytes,
+      /*dataType=*/rnn_desc.data_type());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "Unable to check RNN param size: " << ToString(status);
     return false;
@@ -1695,18 +1448,17 @@ bool CheckRNNParameterSize(CUDAExecutor* parent, cudnnHandle_t cudnn_handle,
          rnn_desc.ParamsSizeInBytes();
 }
 
-bool CreateRnnWorkspace(Stream* stream, CUDAExecutor* parent,
-                        cudnnHandle_t cudnn_handle,
+bool CreateRnnWorkspace(Stream* stream, const CudnnHandle& cudnn,
                         const CudnnRnnDescriptor& rnn_desc,
                         const CudnnRnnSequenceTensorDescriptor& input_desc,
                         ScratchAllocator* workspace_allocator,
                         DeviceMemory<uint8>* workspace) {
   // Query the workspace size.
   size_t workspace_size_in_bytes = 0;
-  cudnnStatus_t status = wrap::cudnnGetRNNWorkspaceSize(
-      parent, cudnn_handle /*handle*/, rnn_desc.handle() /*rnnDesc*/,
-      input_desc.seq_length() /*seqLength*/, input_desc.handles() /*xDesc*/,
-      &workspace_size_in_bytes /*sizeInBytes*/);
+  cudnnStatus_t status = cudnnGetRNNWorkspaceSize(
+      /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+      /*seqLength=*/input_desc.seq_length(), /*xDesc=*/input_desc.handles(),
+      /*sizeInBytes=*/&workspace_size_in_bytes);
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "Unable to query workspace size: " << ToString(status);
     return false;
@@ -1758,25 +1510,18 @@ bool CudnnSupport::DoRnnForwardImpl(
     return false;
   }
 
-  // check params size
-  mutex_lock lock{dnn_handle_mutex_};
-  auto set_stream_status =
-      wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (set_stream_status != CUDNN_STATUS_SUCCESS) {
-    LOG(FATAL) << "failed to set stream for cudnn handle: "
-               << ToString(set_stream_status);
-  }
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
 
-  if (!CheckRNNParameterSize(parent_, ToHandle(dnn_handle_), rnn_desc,
-                             input_desc)) {
+  // check params size
+  if (!CheckRNNParameterSize(cudnn, rnn_desc, input_desc)) {
     LOG(ERROR) << "Invalid parameters";
     return false;
   }
 
   // create the workspace
   DeviceMemory<uint8> workspace;
-  if (!CreateRnnWorkspace(stream, parent_, ToHandle(dnn_handle_), rnn_desc,
-                          input_desc, workspace_allocator, &workspace)) {
+  if (!CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
+                          workspace_allocator, &workspace)) {
     LOG(ERROR) << "Unable to create rnn workspace";
     return false;
   }
@@ -1786,11 +1531,10 @@ bool CudnnSupport::DoRnnForwardImpl(
   DeviceMemory<uint8> reserve_space;
   if (is_training) {
     size_t reserve_space_size_in_bytes = 0;
-    cudnnStatus_t status = wrap::cudnnGetRNNTrainingReserveSize(
-        parent_, ToHandle(dnn_handle_) /*handle*/,
-        rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
-        input_desc.handles() /*xDesc*/,
-        &reserve_space_size_in_bytes /*sizeInBytes*/);
+    cudnnStatus_t status = cudnnGetRNNTrainingReserveSize(
+        /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+        /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
+        /*sizeInBytes=*/&reserve_space_size_in_bytes);
     if (status != CUDNN_STATUS_SUCCESS) {
       LOG(ERROR) << "Unable to query reserve space size: " << ToString(status);
       return false;
@@ -1825,33 +1569,31 @@ bool CudnnSupport::DoRnnForwardImpl(
   // make the forward call
   cudnnStatus_t status;
   if (!is_training) {
-    status = wrap::cudnnRNNForwardInference(
-        this, stream, ToHandle(dnn_handle_) /*handle*/,
-        rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
-        input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
-        input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
-        input_c_desc.handle() /*cxDesc*/, input_c_data.opaque() /*cx*/,
-        rnn_desc.params_handle() /*wDesc*/, params.opaque() /*w*/,
-        output_desc.handles() /*yDesc*/, output_data->opaque() /*y*/,
-        output_h_desc.handle() /*hyDesc*/, output_h_data->opaque() /*hy*/,
-        output_c_desc.handle() /*cyDesc*/, output_c_data->opaque() /*cy*/,
-        workspace.opaque() /*workspace*/,
-        workspace.size() /*workSpaceSizeInBytes*/);
+    status = cudnnRNNForwardInference(
+        /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+        /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
+        /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
+        /*hx=*/input_h_data.opaque(), /*cxDesc=*/input_c_desc.handle(),
+        /*cx=*/input_c_data.opaque(), /*wDesc=*/rnn_desc.params_handle(),
+        /*w=*/params.opaque(), /*yDesc=*/output_desc.handles(),
+        /*y=*/output_data->opaque(), /*hyDesc=*/output_h_desc.handle(),
+        /*hy=*/output_h_data->opaque(), /*cyDesc=*/output_c_desc.handle(),
+        /*cy=*/output_c_data->opaque(), /*workspace=*/workspace.opaque(),
+        /*workSpaceSizeInBytes=*/workspace.size());
   } else {
-    status = wrap::cudnnRNNForwardTraining(
-        this, stream, ToHandle(dnn_handle_) /*handle*/,
-        rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
-        input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
-        input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
-        input_c_desc.handle() /*cxDesc*/, input_c_data.opaque() /*cx*/,
-        rnn_desc.params_handle() /*wDesc*/, params.opaque() /*w*/,
-        output_desc.handles() /*yDesc*/, output_data->opaque() /*y*/,
-        output_h_desc.handle() /*hyDesc*/, output_h_data->opaque() /*hy*/,
-        output_c_desc.handle() /*cyDesc*/, output_c_data->opaque() /*cy*/,
-        workspace.opaque() /*workspace*/,
-        workspace.size() /*workSpaceSizeInBytes*/,
-        reserve_space.opaque() /*reserveSpace*/,
-        reserve_space.size() /*reserveSpaceSizeInBytes*/);
+    status = cudnnRNNForwardTraining(
+        /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+        /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
+        /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
+        /*hx=*/input_h_data.opaque(), /*cxDesc=*/input_c_desc.handle(),
+        /*cx=*/input_c_data.opaque(), /*wDesc=*/rnn_desc.params_handle(),
+        /*w=*/params.opaque(), /*yDesc=*/output_desc.handles(),
+        /*y=*/output_data->opaque(), /*hyDesc=*/output_h_desc.handle(),
+        /*hy=*/output_h_data->opaque(), /*cyDesc=*/output_c_desc.handle(),
+        /*cy=*/output_c_data->opaque(), /*workspace=*/workspace.opaque(),
+        /*workSpaceSizeInBytes=*/workspace.size(),
+        /*reserveSpace=*/reserve_space.opaque(),
+        /*reserveSpaceSizeInBytes=*/reserve_space.size());
   }
   if (is_profiling) {
     if (!timer->Stop(AsCUDAStream(stream))) {
@@ -1914,25 +1656,18 @@ bool CudnnSupport::DoRnnBackwardImpl(
     return false;
   }
 
-  // check params size
-  mutex_lock lock{dnn_handle_mutex_};
-  auto set_stream_status =
-      wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (set_stream_status != CUDNN_STATUS_SUCCESS) {
-    LOG(FATAL) << "failed to set stream for cudnn handle: "
-               << ToString(set_stream_status);
-  }
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
 
-  if (!CheckRNNParameterSize(parent_, ToHandle(dnn_handle_), rnn_desc,
-                             input_desc)) {
+  // check params size
+  if (!CheckRNNParameterSize(cudnn, rnn_desc, input_desc)) {
     LOG(ERROR) << "Invalid parameters";
     return false;
   }
 
   // create the workspace
   DeviceMemory<uint8> workspace;
-  if (!CreateRnnWorkspace(stream, parent_, ToHandle(dnn_handle_), rnn_desc,
-                          input_desc, workspace_allocator, &workspace)) {
+  if (!CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
+                          workspace_allocator, &workspace)) {
     LOG(ERROR) << "Unable to create rnn workspace";
     return false;
   }
@@ -1952,26 +1687,25 @@ bool CudnnSupport::DoRnnBackwardImpl(
     }
   }
   // make the backward data call
-  cudnnStatus_t status = wrap::cudnnRNNBackwardData(
-      this, stream, ToHandle(dnn_handle_) /*handle*/,
-      rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
-      output_desc.handles() /*yDesc*/, output_data.opaque() /*y*/,
-      output_desc.handles() /*dyDesc*/, output_backprop_data.opaque() /*dy*/,
-      output_h_desc.handle() /*dhyDesc*/,
-      output_h_backprop_data.opaque() /*dhy*/,
-      output_c_desc.handle() /*dcyDesc*/,
-      output_c_backprop_data.opaque() /*dcy*/,
-      rnn_desc.params_handle() /*wDesc*/, params.opaque() /*w*/,
-      input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
-      input_c_desc.handle() /*cxDesc*/, input_c_data.opaque() /*cx*/,
-      input_desc.handles() /*dxDesc*/, input_backprop_data->opaque() /*dx*/,
-      input_h_desc.handle() /*dhxDesc*/,
-      input_h_backprop_data->opaque() /*dhx*/,
-      input_c_desc.handle() /*dcxDesc*/,
-      input_c_backprop_data->opaque() /*dcx*/, workspace.opaque() /*workspace*/,
-      workspace.size() /*workSpaceSizeInBytes*/,
-      reserve_space_data->opaque() /*reserveSpace*/,
-      reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
+  cudnnStatus_t status = cudnnRNNBackwardData(
+      /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+      /*seqLength=*/model_dims.seq_length, /*yDesc=*/output_desc.handles(),
+      /*y=*/output_data.opaque(), /*dyDesc=*/output_desc.handles(),
+      /*dy=*/output_backprop_data.opaque(), /*dhyDesc=*/output_h_desc.handle(),
+      /*dhy=*/output_h_backprop_data.opaque(),
+      /*dcyDesc=*/output_c_desc.handle(),
+      /*dcy=*/output_c_backprop_data.opaque(),
+      /*wDesc=*/rnn_desc.params_handle(), /*w=*/params.opaque(),
+      /*hxDesc=*/input_h_desc.handle(), /*hx=*/input_h_data.opaque(),
+      /*cxDesc=*/input_c_desc.handle(), /*cx=*/input_c_data.opaque(),
+      /*dxDesc=*/input_desc.handles(), /*dx=*/input_backprop_data->opaque(),
+      /*dhxDesc=*/input_h_desc.handle(),
+      /*dhx=*/input_h_backprop_data->opaque(),
+      /*dcxDesc=*/input_c_desc.handle(),
+      /*dcx=*/input_c_backprop_data->opaque(), /*workspace=*/workspace.opaque(),
+      /*workSpaceSizeInBytes=*/workspace.size(),
+      /*reserveSpace=*/reserve_space_data->opaque(),
+      /*reserveSpaceSizeInBytes=*/reserve_space_data->size());
 
   if (status != CUDNN_STATUS_SUCCESS) {
     if (is_profiling) {
@@ -1985,18 +1719,17 @@ bool CudnnSupport::DoRnnBackwardImpl(
     // Clear the dw to zeros.
     stream->ThenMemZero(params_backprop_data, params_backprop_data->size());
     // make the backward weight call
-    status = wrap::cudnnRNNBackwardWeights(
-        this, stream, ToHandle(dnn_handle_) /*handle*/,
-        rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
-        input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
-        input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
-        output_desc.handles() /*yDesc*/, output_data.opaque() /*y*/,
-        workspace.opaque() /*workspace*/,
-        workspace.size() /*workSpaceSizeInBytes*/,
-        rnn_desc.params_handle() /*dwDesc*/,
-        params_backprop_data->opaque() /*dw*/,
-        reserve_space_data->opaque() /*reserveSpace*/,
-        reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
+    status = cudnnRNNBackwardWeights(
+        /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+        /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
+        /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
+        /*hx=*/input_h_data.opaque(), /*yDesc=*/output_desc.handles(),
+        /*y=*/output_data.opaque(), /*workspace=*/workspace.opaque(),
+        /*workSpaceSizeInBytes=*/workspace.size(),
+        /*dwDesc=*/rnn_desc.params_handle(),
+        /*dw=*/params_backprop_data->opaque(),
+        /*reserveSpace=*/reserve_space_data->opaque(),
+        /*reserveSpaceSizeInBytes=*/reserve_space_data->size());
     if (status != CUDNN_STATUS_SUCCESS) {
       if (is_profiling) {
         timer->Stop(AsCUDAStream(stream));
@@ -2019,8 +1752,6 @@ bool CudnnSupport::DoRnnBackwardImpl(
   return true;
 }
 
-#endif  // CUDNN_VERSION
-
 port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
 CudnnSupport::createRnnDescriptor(
     int num_layers, int hidden_size, int input_size, int batch_size,
@@ -2028,33 +1759,26 @@ CudnnSupport::createRnnDescriptor(
     dnn::RnnMode rnn_mode, dnn::DataType data_type,
     const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
     ScratchAllocator* state_allocator) {
-#if CUDNN_VERSION >= 5000
-  mutex_lock lock{dnn_handle_mutex_};
+  // Setting up a cudnnRNNDescriptor requires a cuDNN handle, but because it's
+  // not enqueueing anything into a stream, we pass in the null stream.
+  auto cudnn = cudnn_->GetHandle(parent_, /*stream=*/nullptr);
   std::unique_ptr<CudnnRnnDescriptor> rnn_desc(new CudnnRnnDescriptor(
-      parent_, ToHandle(dnn_handle_), num_layers, hidden_size, input_size,
-      batch_size, ToCudnnRnnInputMode(input_mode),
-      ToCudnnRnnDirectionMode(direction_mode), ToCudnnRnnMode(rnn_mode),
-      ToCudnnDataType(data_type), GetRnnComputeType(data_type),
-      algorithm_config, dropout, seed, state_allocator));
+      cudnn, num_layers, hidden_size, input_size, batch_size,
+      ToCudnnRnnInputMode(input_mode), ToCudnnRnnDirectionMode(direction_mode),
+      ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type),
+      GetRnnComputeType(data_type), algorithm_config, dropout, seed,
+      state_allocator));
   if (!rnn_desc->ok()) {
     return rnn_desc->Status();
   }
   return port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>(
       std::move(rnn_desc));
-#else
-  string error_msg =
-      port::StrCat("createRnnDescriptor needs at least Cudnn 5.0 to work. ",
-                   "Current Cudnn version: ", CUDNN_VERSION, ". ");
-  LOG(ERROR) << error_msg;
-  return port::Status{port::error::UNIMPLEMENTED, error_msg};
-#endif  // CUDNN_VERSION
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
 CudnnSupport::createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
                                                 int data_size,
                                                 dnn::DataType data_type) {
-#if CUDNN_VERSION >= 5000
   std::unique_ptr<CudnnRnnSequenceTensorDescriptor> seq_desc(
       new CudnnRnnSequenceTensorDescriptor(parent_, seq_length, batch_size,
                                            data_size,
@@ -2064,20 +1788,12 @@ CudnnSupport::createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
   }
   return port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>(
       std::move(seq_desc));
-#else
-  string error_msg = port::StrCat(
-      "createRnnSequenceTensorDescriptor needs at least Cudnn 5.0 to work. ",
-      "Current Cudnn version: ", CUDNN_VERSION, ". ");
-  LOG(ERROR) << error_msg;
-  return port::Status{port::error::UNIMPLEMENTED, error_msg};
-#endif  // CUDNN_VERSION
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
 CudnnSupport::createRnnStateTensorDescriptor(int num_layer, int batch_size,
                                              int data_size,
                                              dnn::DataType data_type) {
-#if CUDNN_VERSION >= 5000
   std::unique_ptr<CudnnRnnStateTensorDescriptor> state_desc(
       new CudnnRnnStateTensorDescriptor(parent_, num_layer, batch_size,
                                         data_size, ToCudnnDataType(data_type)));
@@ -2086,13 +1802,6 @@ CudnnSupport::createRnnStateTensorDescriptor(int num_layer, int batch_size,
   }
   return port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>(
       std::move(state_desc));
-#else
-  string error_msg = port::StrCat(
-      "createRnnStateTensorDescriptor needs at least Cudnn 5.0 to work. ",
-      "Current Cudnn version: ", CUDNN_VERSION, ". ");
-  LOG(ERROR) << error_msg;
-  return port::Status{port::error::UNIMPLEMENTED, error_msg};
-#endif  // CUDNN_VERSION
 }
 
 bool CudnnSupport::DoRnnForward(
@@ -2113,7 +1822,6 @@ bool CudnnSupport::DoRnnForward(
     ScratchAllocator* reserve_space_allocator,
     ScratchAllocator* workspace_allocator,
     dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
   const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -2135,9 +1843,6 @@ bool CudnnSupport::DoRnnForward(
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
       output_c_data, is_training, reserve_space_allocator, workspace_allocator,
       output_profile_result);
-#else
-  return false;
-#endif  // CUDNN_VERSION
 }
 
 bool CudnnSupport::DoRnnForward(
@@ -2157,7 +1862,6 @@ bool CudnnSupport::DoRnnForward(
     ScratchAllocator* reserve_space_allocator,
     ScratchAllocator* workspace_allocator,
     dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
   const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -2179,9 +1883,6 @@ bool CudnnSupport::DoRnnForward(
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
       output_c_data, is_training, reserve_space_allocator, workspace_allocator,
       output_profile_result);
-#else
-  return false;
-#endif  // CUDNN_VERSION
 }
 
 bool CudnnSupport::DoRnnForward(
@@ -2202,7 +1903,6 @@ bool CudnnSupport::DoRnnForward(
     ScratchAllocator* reserve_space_allocator,
     ScratchAllocator* workspace_allocator,
     dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
   const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -2224,9 +1924,6 @@ bool CudnnSupport::DoRnnForward(
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
       output_c_data, is_training, reserve_space_allocator, workspace_allocator,
       output_profile_result);
-#else
-  return false;
-#endif  // CUDNN_VERSION
 }
 
 bool CudnnSupport::DoRnnBackward(
@@ -2254,7 +1951,6 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<uint8>* reserve_space_data,
     ScratchAllocator* workspace_allocator,
     dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
   const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -2278,9 +1974,6 @@ bool CudnnSupport::DoRnnBackward(
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
       workspace_allocator, output_profile_result);
-#else
-  return false;
-#endif  // CUDNN_VERSION
 }
 
 bool CudnnSupport::DoRnnBackward(
@@ -2307,7 +2000,6 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<uint8>* reserve_space_data,
     ScratchAllocator* workspace_allocator,
     dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
   const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -2331,9 +2023,6 @@ bool CudnnSupport::DoRnnBackward(
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
       workspace_allocator, output_profile_result);
-#else
-  return false;
-#endif  // CUDNN_VERSION
 }
 
 bool CudnnSupport::DoRnnBackward(
@@ -2361,7 +2050,6 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<uint8>* reserve_space_data,
     ScratchAllocator* workspace_allocator,
     dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
   const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
@@ -2385,43 +2073,31 @@ bool CudnnSupport::DoRnnBackward(
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
       workspace_allocator, output_profile_result);
-#else
-  return false;
-#endif  // CUDNN_VERSION
 }
 
 namespace {
 
 inline cudnnConvolutionFwdAlgo_t GetCudnnConvolutionForwardAlgo(
-    Stream* stream, CUDAExecutor* parent, void* dnn_handle,
-    const ScopedTensorDescriptor& input_nd,
+    const CudnnHandle& cudnn, const ScopedTensorDescriptor& input_nd,
     const ScopedFilterDescriptor& filter,
     const ScopedConvolutionDescriptor& conv,
     const ScopedTensorDescriptor& output_nd, bool specify_workspace_limit,
-    ScratchAllocator* scratch_allocator) {
+    size_t memory_limit_bytes) {
   cudnnConvolutionFwdPreference_t preference =
       specify_workspace_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
                               : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-  auto memory_limit_bytes =
-      scratch_allocator == nullptr
-          ? 0
-          : scratch_allocator->GetMemoryLimitInBytes(stream);
-  if (memory_limit_bytes < 0) {
-    memory_limit_bytes = 0;
-  }
 
   cudnnConvolutionFwdAlgo_t algo_to_use;
-  auto status = wrap::cudnnGetConvolutionForwardAlgorithm(
-      parent, ToHandle(dnn_handle), input_nd.handle(), filter.handle(),
-      conv.handle(), output_nd.handle(), preference, memory_limit_bytes,
-      &algo_to_use);
+  auto status = cudnnGetConvolutionForwardAlgorithm(
+      cudnn.handle(), input_nd.handle(), filter.handle(), conv.handle(),
+      output_nd.handle(), preference, memory_limit_bytes, &algo_to_use);
   CHECK_EQ(status, CUDNN_STATUS_SUCCESS)
       << "Unable to find a suitable algorithm for doing forward convolution";
   return algo_to_use;
 }
 
 dnn::AlgorithmDesc GetCudnnConvolutionForwardAlgorithm(
-    Stream* stream, CUDAExecutor* parent, void* dnn_handle,
+    Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmConfig& algorithm_config, bool is_profiling,
     const ScopedTensorDescriptor& input_nd,
     const ScopedFilterDescriptor& filter,
@@ -2432,19 +2108,29 @@ dnn::AlgorithmDesc GetCudnnConvolutionForwardAlgorithm(
   bool use_tensor_ops;
   if (algorithm_config.algorithm().is_default()) {
     use_tensor_ops = true;
+
+    auto memory_limit_bytes =
+        scratch_allocator == nullptr
+            ? 0
+            : scratch_allocator->GetMemoryLimitInBytes(stream);
+    if (memory_limit_bytes < 0) {
+      memory_limit_bytes = 0;
+    }
+
     algo = GetCudnnConvolutionForwardAlgo(
-        stream, parent, dnn_handle, input_nd, filter, conv, output_nd,
+        cudnn, input_nd, filter, conv, output_nd,
         /*specify_workspace_limit=*/scratch_allocator != nullptr,
-        scratch_allocator);
+        memory_limit_bytes);
   } else {
     use_tensor_ops = algorithm_config.algorithm().tensor_ops_enabled();
     algo = ToConvForwardAlgo(algorithm_config.algorithm());
   }
   size_t size_in_bytes;
-  auto status = wrap::cudnnGetConvolutionForwardWorkspaceSize(
-      parent, ToHandle(dnn_handle), /*srcDesc=*/input_nd.handle(),
-      /*filterDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
-      /*destDesc=*/output_nd.handle(), /*algo=*/algo,
+  auto status = cudnnGetConvolutionForwardWorkspaceSize(
+      cudnn.handle(),
+      /*xDesc=*/input_nd.handle(),
+      /*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
+      /*yDesc=*/output_nd.handle(), /*algo=*/algo,
       /*sizeInBytes=*/&size_in_bytes);
   int64 size_in_bytes_int64 = size_in_bytes;
   if (TF_PREDICT_FALSE(status != CUDNN_STATUS_SUCCESS)) {
@@ -2484,8 +2170,8 @@ dnn::AlgorithmDesc GetCudnnConvolutionForwardAlgorithm(
       if (algorithm_config.algorithm_no_scratch().is_default()) {
         use_tensor_ops = true;
         algo = GetCudnnConvolutionForwardAlgo(
-            stream, parent, dnn_handle, input_nd, filter, conv, output_nd,
-            /*specify_workspace_limit=*/false, nullptr);
+            cudnn, input_nd, filter, conv, output_nd,
+            /*specify_workspace_limit=*/false, 0);
       } else {
         use_tensor_ops = algorithm_config.algorithm().tensor_ops_enabled();
         algo = ToConvForwardAlgo(algorithm_config.algorithm_no_scratch());
@@ -2521,16 +2207,14 @@ class CudnnEnvVar {
 };
 
 // A helper struct to decide whether to enable the FFT_TILING algorithms for
-// forward convolution. Before cudnn v5.1 it works fine but since cudnn v5.1
-// it is turned off due to memory corruption caused by some shapes with this
-// algorithm.
-// Before NVIDIA fixes the memory corruption bug, users can explicitly
-// enable the algorithm through an env-var "TF_ENABLE_FFT_TILING_FORWARD=1".
+// forward convolution. It is disabled for cuDNN < 7 due to memory corruption
+// caused by some shapes with this algorithm. Users can explicitly enable the
+// algorithm through an env-var "TF_ENABLE_FFT_TILING_FORWARD=1".
 struct FftTilingForward {
   static constexpr const char* kName = "TF_ENABLE_FFT_TILING_FORWARD";
-  // TODO(yangzihao): turn the default to True when the memory corruption bug
-  // is fixed.
-  static constexpr bool kDefaultFlag = CUDNN_VERSION < 5100;
+  // TODO(csigg): Enabling this algo causes XLA test failures, for example in
+  // platforms/xla/tests/internal:convolution_test_gpu. See b/80018418.
+  static constexpr bool kDefaultFlag = false;  // CUDNN_VERSION >= 7000;
 };
 
 // A helper struct to decide whether to enable the WINOGRAD_NONFUSED algorithms.
@@ -2539,10 +2223,9 @@ struct FftTilingForward {
 // https://github.com/tensorflow/tensorflow/pull/4901
 struct WinogradNonfused {
   static constexpr const char* kName = "TF_ENABLE_WINOGRAD_NONFUSED";
-  // NVIDIA has fixed winograd nonfused bug for cudnn v>=7.
-  // For cudnn v>=5.1, we have a workaround and for any lower version, we
-  // disable it by default.
-  static constexpr bool kDefaultFlag = CUDNN_VERSION >= 5100;
+  // NVIDIA has fixed winograd nonfused bug for cudnn v>=7. For older versions,
+  // we have a workaround.
+  static constexpr bool kDefaultFlag = true;
 };
 
 // A helper struct to decide whether to use FP32 as the internal compute type
@@ -2614,32 +2297,28 @@ cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
       LOG(FATAL) << "Invalid RNN data type: " << static_cast<int>(data_type);
   }
 }
+
 }  // namespace
 
 template <class T>
 bool CudnnSupport::DoConvolveImpl(
-    Stream* stream, const BatchDescriptor& batch_descriptor,
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<T>& input_data,
-    const FilterDescriptor& filter_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<T>& filter_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
-  ScopedTensorDescriptor input_nd{parent_, batch_descriptor, cudnn_type};
-  ScopedTensorDescriptor output_nd{parent_, output_descriptor, cudnn_type};
-  ScopedFilterDescriptor filter{parent_, filter_descriptor, batch_descriptor,
-                                cudnn_type};
-  ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
-                                   GetConvComputeType<T>()};
-
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
-  }
+  ScopedTensorDescriptor input_nd(input_descriptor, cudnn_type);
+  ScopedTensorDescriptor output_nd(output_descriptor, cudnn_type);
+  ScopedFilterDescriptor filter(filter_descriptor, cudnn_type);
+  ScopedConvolutionDescriptor conv(convolution_descriptor,
+                                   GetConvComputeType<T>());
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
   // Alpha is the scaling factor for input.
   float falpha = 1.0;
   double dalpha = 1.0;
@@ -2660,42 +2339,41 @@ bool CudnnSupport::DoConvolveImpl(
   //   GetCudnnConvolutionForwardAlgorithm().
   if (algorithm_config.algorithm().is_default()) {
     // With the default algorithm, use Cudnn's heuristics.
-    auto get_algorithm =
-        [&](bool specify_limit) SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) {
-          cudnnConvolutionFwdPreference_t preference =
-              specify_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
-                            : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-
-          auto memory_limit_bytes =
-              scratch_allocator == nullptr
-                  ? 0
-                  : scratch_allocator->GetMemoryLimitInBytes(stream);
-          if (memory_limit_bytes < 0) {
-            memory_limit_bytes = 0;
-          }
+    auto get_algorithm = [&](bool specify_limit) {
+      cudnnConvolutionFwdPreference_t preference =
+          specify_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
+                        : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
 
-          cudnnConvolutionFwdAlgo_t algo_to_use;
-          status = wrap::cudnnGetConvolutionForwardAlgorithm(
-              parent_, ToHandle(dnn_handle_), input_nd.handle(),
-              filter.handle(), conv.handle(), output_nd.handle(),
-              /*preference=*/preference,
-              /*memoryLimitInBytes=*/memory_limit_bytes,
-              /*algo=*/&algo_to_use);
-          CHECK_EQ(status, CUDNN_STATUS_SUCCESS)
-              << "Unable to find a suitable "
-                 "algorithm for doing forward "
-                 "convolution";
-          return algo_to_use;
-        };
+      auto memory_limit_bytes =
+          scratch_allocator == nullptr
+              ? 0
+              : scratch_allocator->GetMemoryLimitInBytes(stream);
+      if (memory_limit_bytes < 0) {
+        memory_limit_bytes = 0;
+      }
+
+      cudnnConvolutionFwdAlgo_t algo_to_use;
+      auto status = cudnnGetConvolutionForwardAlgorithm(
+          cudnn.handle(), input_nd.handle(), filter.handle(), conv.handle(),
+          output_nd.handle(),
+          /*preference=*/preference,
+          /*memoryLimitInBytes=*/memory_limit_bytes,
+          /*algo=*/&algo_to_use);
+      CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Unable to find a suitable "
+                                                "algorithm for doing forward "
+                                                "convolution";
+      return algo_to_use;
+    };
 
     algo = get_algorithm(/*specify_limit=*/scratch_allocator != nullptr);
     use_tensor_ops = true;
     if (scratch_allocator != nullptr) {
       size_t size_in_bytes;
-      status = wrap::cudnnGetConvolutionForwardWorkspaceSize(
-          parent_, ToHandle(dnn_handle_), /*srcDesc=*/input_nd.handle(),
-          /*filterDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
-          /*destDesc=*/output_nd.handle(), /*algo=*/algo,
+      auto status = cudnnGetConvolutionForwardWorkspaceSize(
+          cudnn.handle(),
+          /*xDesc=*/input_nd.handle(),
+          /*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
+          /*yDesc=*/output_nd.handle(), /*algo=*/algo,
           /*sizeInBytes=*/&size_in_bytes);
       int64 size_in_bytes_int64 = size_in_bytes;
       if (status == CUDNN_STATUS_SUCCESS && size_in_bytes_int64 != 0) {
@@ -2727,10 +2405,11 @@ bool CudnnSupport::DoConvolveImpl(
     use_tensor_ops = algotype.tensor_ops_enabled();
     conv.set_use_tensor_op_math(use_tensor_ops);
     size_t size_in_bytes;
-    status = wrap::cudnnGetConvolutionForwardWorkspaceSize(
-        parent_, ToHandle(dnn_handle_), /*srcDesc=*/input_nd.handle(),
-        /*filterDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
-        /*destDesc=*/output_nd.handle(), /*algo=*/algo,
+    auto status = cudnnGetConvolutionForwardWorkspaceSize(
+        cudnn.handle(),
+        /*xDesc=*/input_nd.handle(),
+        /*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
+        /*yDesc=*/output_nd.handle(), /*algo=*/algo,
         /*sizeInBytes=*/&size_in_bytes);
     if (status != CUDNN_STATUS_SUCCESS) {
       if (is_profiling) {
@@ -2785,8 +2464,8 @@ bool CudnnSupport::DoConvolveImpl(
       return false;
     }
   }
-  status = wrap::cudnnConvolutionForward(
-      this, stream, ToHandle(dnn_handle_),
+  auto status = cudnnConvolutionForward(
+      cudnn.handle(),
       /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
       /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
       /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
@@ -2835,35 +2514,22 @@ bool CudnnSupport::DoFusedConvolveImpl(
     DeviceMemory<Type>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION < 6000
-  LOG(ERROR) << "cudnnConvolutionBiasActivationForward() is only "
-                "supported for cuDNN version >= 6";
-  return false;
-#else
-  ScopedTensorDescriptor conv_input_nd{
-      parent_, conv_input_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_data_type)};
-  ScopedTensorDescriptor output_nd{
-      parent_, output_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_data_type)};
-  ScopedFilterDescriptor filter{parent_, filter_descriptor,
-                                conv_input_descriptor,
-                                static_cast<cudnnDataType_t>(cudnn_data_type)};
-  ScopedTensorDescriptor bias_nd{parent_, bias_descriptor, CUDNN_DATA_FLOAT};
-  ScopedConvolutionDescriptor conv{
-      parent_, convolution_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_compute_type)};
-
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  CHECK(status == CUDNN_STATUS_SUCCESS)
-      << "failed to set stream for cudnn handle: " << ToString(status);
-
+  ScopedTensorDescriptor conv_input_nd(
+      conv_input_descriptor, static_cast<cudnnDataType_t>(cudnn_data_type));
+  ScopedTensorDescriptor output_nd(
+      output_descriptor, static_cast<cudnnDataType_t>(cudnn_data_type));
+  ScopedFilterDescriptor filter(filter_descriptor,
+                                static_cast<cudnnDataType_t>(cudnn_data_type));
+  ScopedTensorDescriptor bias_nd(bias_descriptor, CUDNN_DATA_FLOAT);
+  ScopedConvolutionDescriptor conv(
+      convolution_descriptor, static_cast<cudnnDataType_t>(cudnn_compute_type));
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
   const bool is_profiling = output_profile_result != nullptr;
   DeviceMemory<uint8> scratch;
   dnn::AlgorithmDesc algotype = GetCudnnConvolutionForwardAlgorithm(
-      stream, parent_, dnn_handle_, algorithm_config, is_profiling,
-      conv_input_nd, filter, conv, output_nd, scratch_allocator, &scratch);
+      stream, cudnn, algorithm_config, is_profiling, conv_input_nd, filter,
+      conv, output_nd, scratch_allocator, &scratch);
   if (algotype.is_default()) {
     if (!is_profiling) {
       LOG(ERROR) << "No suitable algorithm found";
@@ -2897,9 +2563,8 @@ bool CudnnSupport::DoFusedConvolveImpl(
   // activation descriptor. Note that this will change the nan propagation
   // behavior from separate conv, bias, and relu (which by default is
   // CUDNN_PROPAGATE_NAN.
-  ScopedActivationDescriptor activation_desc{parent_, activation_mode,
-                                             CUDNN_NOT_PROPAGATE_NAN,
-                                             output_descriptor.value_max()};
+  ScopedActivationDescriptor activation_desc(
+      activation_mode, CUDNN_NOT_PROPAGATE_NAN, output_descriptor.value_max());
   auto side_input_data_ptr = (side_input_scale == 0) ? output_data->opaque()
                                                      : side_input_data.opaque();
 
@@ -2920,8 +2585,9 @@ bool CudnnSupport::DoFusedConvolveImpl(
           << "\noutput_nd.handle() = " << output_nd.handle()
           << "\noutput_data->opaque() = " << output_data->opaque();
 
-  status = wrap::cudnnConvolutionBiasActivationForward(
-      this, stream, ToHandle(dnn_handle_), /*alpha1=*/&conv_input_scale,
+  auto status = cudnnConvolutionBiasActivationForward(
+      cudnn.handle(),
+      /*alpha1=*/&conv_input_scale,
       /*srcDesc=*/conv_input_nd.handle(), /*srcData=*/conv_input_data.opaque(),
       /*filterDesc=*/filter.handle(), /*filterData=*/filter_data.opaque(),
       /*convDesc=*/conv.handle(), algo, /*workSpace=*/scratch.opaque(),
@@ -2954,7 +2620,6 @@ bool CudnnSupport::DoFusedConvolveImpl(
   }
 
   return true;
-#endif  // CUDNN_VERSION < 6000
 }
 
 bool CudnnSupport::GetConvolveAlgorithms(
@@ -2967,19 +2632,15 @@ bool CudnnSupport::GetConvolveAlgorithms(
     CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
     CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
     CUDNN_CONVOLUTION_FWD_ALGO_FFT,
-#if CUDNN_VERSION >= 5000
     CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
-#endif
     // clang-format on
   };
   if (CudnnEnvVar<FftTilingForward>::IsEnabled()) {
     algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING);
   }
-#if CUDNN_VERSION >= 5100
   if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
     algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED);
   }
-#endif
 
   out_algorithms->clear();
   for (auto i : algo_types) {
@@ -2994,13 +2655,11 @@ bool CudnnSupport::GetConvolveAlgorithms(
 bool CudnnSupport::GetRnnAlgorithms(
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
   std::vector<dnn::AlgorithmDesc::Index> algo_types = {
-  // clang-format off
-#if CUDNN_VERSION >= 6000
+      // clang-format off
     CUDNN_RNN_ALGO_STANDARD,
     CUDNN_RNN_ALGO_PERSIST_STATIC,
     CUDNN_RNN_ALGO_PERSIST_DYNAMIC,
-#endif
-    // clang-format on
+      // clang-format on
   };
 
   out_algorithms->clear();
@@ -3019,21 +2678,17 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
   std::vector<dnn::AlgorithmDesc::Index> algo_types = {
-    // clang-format off
+      // clang-format off
     CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
     CUDNN_CONVOLUTION_BWD_DATA_ALGO_1,
     CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT,
     CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING,
-#if CUDNN_VERSION >= 5000
     CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD,
-#endif
-    // clang-format on
+      // clang-format on
   };
-#if CUDNN_VERSION >= 5100
   if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
     algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED);
   }
-#endif
 
   out_algorithms->clear();
   for (auto i : algo_types) {
@@ -3056,13 +2711,15 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
       CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
       // Based on cudnn.h, the following is not implemented.
       // CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
+
+      // Produces incorrect results for some shapes. Disabled for now, see
+      // NVIDIA bug 2072856. TODO(csigg): Only disable for subset of shapes.
+      // CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING,
       // clang-format on
   };
-#if CUDNN_VERSION >= 5100
   if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
     algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED);
   }
-#endif
 
   out_algorithms->clear();
   for (auto i : algo_types) {
@@ -3125,17 +2782,9 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
     DeviceMemory<U>* saved_mean, DeviceMemory<U>* saved_inv_var,
     bool is_training, std::function<const DeviceMemory<U>&()> var_to_inv_var,
     std::function<void()> inv_var_to_var) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
-  ScopedTensorDescriptor x_descriptor{parent_, x_desc,
-                                      ToCudnnDataType(input_data_type)};
-  ScopedTensorDescriptor scale_offset_descriptor{
-      parent_, scale_offset_desc, ToCudnnDataType(scale_data_type)};
+  ScopedTensorDescriptor x_descriptor(x_desc, ToCudnnDataType(input_data_type));
+  ScopedTensorDescriptor scale_offset_descriptor(
+      scale_offset_desc, ToCudnnDataType(scale_data_type));
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
 #if CUDNN_VERSION >= 7000
   if (BatchnormSpatialPersistentEnabled() && is_training) {
@@ -3144,7 +2793,9 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
 #endif
   float one = 1.0;
   float zero = 0.0;
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
 
+  auto status = CUDNN_STATUS_SUCCESS;
   if (is_training) {
     CHECK_EQ(batch_mean->is_null(), batch_var->is_null())
         << "batch_mean and batch_var must both be null or both be non-null";
@@ -3161,28 +2812,19 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
       batch_var_opaque = nullptr;
     }
 
-    status = wrap::cudnnBatchNormalizationForwardTraining(
-        this, stream, ToHandle(dnn_handle_), mode, &one, &zero,
-        x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
-        scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(), 1.0,
-        batch_mean_opaque, batch_var_opaque, epsilon, saved_mean->opaque(),
+    status = cudnnBatchNormalizationForwardTraining(
+        cudnn.handle(), mode, &one, &zero, x_descriptor.handle(), x.opaque(),
+        x_descriptor.handle(), y->opaque(), scale_offset_descriptor.handle(),
+        scale.opaque(), offset.opaque(), 1.0, batch_mean_opaque,
+        batch_var_opaque, epsilon, saved_mean->opaque(),
         saved_inv_var->opaque());
-#if CUDNN_VERSION < 5000
-    CHECK(inv_var_to_var);
-    inv_var_to_var();
-#endif
   } else {
-#if CUDNN_VERSION < 5000
-    CHECK(var_to_inv_var);
-    const void* maybe_inv_var = var_to_inv_var().opaque();
-#else
     const void* maybe_inv_var = estimated_variance.opaque();
-#endif
-    status = wrap::cudnnBatchNormalizationForwardInference(
-        this, stream, ToHandle(dnn_handle_), mode, &one, &zero,
-        x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
-        scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(),
-        estimated_mean.opaque(), maybe_inv_var, epsilon);
+    status = cudnnBatchNormalizationForwardInference(
+        cudnn.handle(), mode, &one, &zero, x_descriptor.handle(), x.opaque(),
+        x_descriptor.handle(), y->opaque(), scale_offset_descriptor.handle(),
+        scale.opaque(), offset.opaque(), estimated_mean.opaque(), maybe_inv_var,
+        epsilon);
   }
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to enqueue forward batch normalization on stream: "
@@ -3229,18 +2871,10 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
     const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
     DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
     DeviceMemory<U>* offset_backprop) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
-  ScopedTensorDescriptor x_descriptor{
-      parent_, x_desc, static_cast<cudnnDataType_t>(cudnn_input_type)};
-  ScopedTensorDescriptor scale_offset_descriptor{
-      parent_, scale_offset_desc,
-      static_cast<cudnnDataType_t>(cudnn_scale_type)};
+  ScopedTensorDescriptor x_descriptor(
+      x_desc, static_cast<cudnnDataType_t>(cudnn_input_type));
+  ScopedTensorDescriptor scale_offset_descriptor(
+      scale_offset_desc, static_cast<cudnnDataType_t>(cudnn_scale_type));
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
 #if CUDNN_VERSION >= 7000
   if (BatchnormSpatialPersistentEnabled()) {
@@ -3250,10 +2884,12 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
   float one = 1.0;
   float zero = 0.0;
 
-  status = wrap::cudnnBatchNormalizationBackward(
-      this, stream, ToHandle(dnn_handle_), mode, &one, &zero, &one, &zero,
-      x_descriptor.handle(), x.opaque(), x_descriptor.handle(),
-      y_backprop.opaque(), x_descriptor.handle(), x_backprop->opaque(),
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+
+  auto status = cudnnBatchNormalizationBackward(
+      cudnn.handle(), mode, &one, &zero, &one, &zero, x_descriptor.handle(),
+      x.opaque(), x_descriptor.handle(), y_backprop.opaque(),
+      x_descriptor.handle(), x_backprop->opaque(),
       scale_offset_descriptor.handle(), scale.opaque(),
       scale_backprop->opaque(), offset_backprop->opaque(), epsilon,
       mean.opaque(), inv_var.opaque());
@@ -3266,13 +2902,13 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
 }
 
 bool CudnnSupport::DoConvolve(
-    Stream* stream, const BatchDescriptor& batch_descriptor,
+    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
     const DeviceMemory<float>& input_data,
-    const FilterDescriptor& filter_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<float>& filter_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const BatchDescriptor& output_descriptor, DeviceMemory<float>* output_data,
-    ScratchAllocator* scratch_allocator,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveImpl<float>(
@@ -3282,13 +2918,13 @@ bool CudnnSupport::DoConvolve(
 }
 
 bool CudnnSupport::DoConvolve(
-    Stream* stream, const BatchDescriptor& batch_descriptor,
+    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
     const DeviceMemory<double>& input_data,
-    const FilterDescriptor& filter_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<double>& filter_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const BatchDescriptor& output_descriptor, DeviceMemory<double>* output_data,
-    ScratchAllocator* scratch_allocator,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveImpl<double>(
@@ -3298,12 +2934,12 @@ bool CudnnSupport::DoConvolve(
 }
 
 bool CudnnSupport::DoConvolve(
-    Stream* stream, const BatchDescriptor& batch_descriptor,
+    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
     const DeviceMemory<Eigen::half>& input_data,
-    const FilterDescriptor& filter_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<Eigen::half>& filter_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const BatchDescriptor& output_descriptor,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& output_descriptor,
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
@@ -3393,11 +3029,6 @@ bool CudnnSupport::DoFusedConvolve(
     DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION < 6000
-  LOG(WARNING) << "cudnnConvolutionBiasActivationForward() is only "
-                  "supported for cuDNN version >= 6";
-  return false;
-#else
   int cc_major, cc_minor;
   stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
                                                                    &cc_minor);
@@ -3413,13 +3044,23 @@ bool CudnnSupport::DoFusedConvolve(
       side_input_scale, bias_descriptor, biases, activation_mode,
       output_descriptor, output_data, scratch_allocator, algorithm_config,
       output_profile_result);
-#endif
 }
 
-template<class T>
-DeviceMemory<T> CudnnSupport::MaybeTransformLayout(
-    Stream* stream,
-    BatchDescriptor* output_descriptor,
+namespace {
+// NOTE(keveman): Temporary data layout transformation until cuDNN supports
+// kBatchYXDepth for backward pass. This function allocates temporary memory,
+// lays out the source data into the temporary but in the kBatchDepthXY
+// layout, and returns the temporary memory. The caller is responsible for
+// deallocating the temporary. Since the allocation is done using Stream's
+// AllocateTemporaryMemory, a later BlockHostUntilDone could be used for
+// deallocation.
+//
+// transform_scratch is populated with a legitimate temporary allocation iff
+// the original output data needs to be transformed.
+template <class T>
+DeviceMemory<T> MaybeTransformLayout(
+    Stream* stream, const CudnnHandle& cudnn,
+    dnn::BatchDescriptor* output_descriptor,
     DeviceMemory<T> backward_output_data,
     std::unique_ptr<TemporaryDeviceMemory<T>>* transform_scratch) {
   if (output_descriptor->layout() == dnn::DataLayout::kBatchDepthYX) {
@@ -3429,19 +3070,18 @@ DeviceMemory<T> CudnnSupport::MaybeTransformLayout(
   *transform_scratch =
       stream->AllocateTemporaryArray<T>(backward_output_data.ElementCount())
           .ConsumeValueOrDie();
-  BatchDescriptor transformed_output_descriptor;
+  dnn::BatchDescriptor transformed_output_descriptor;
   transformed_output_descriptor.CloneFrom(*output_descriptor);
   transformed_output_descriptor.set_layout(dnn::DataLayout::kBatchDepthYX);
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
-  ScopedTensorDescriptor orig_out_back_nd{parent_, *output_descriptor,
-                                          cudnn_type};
-  ScopedTensorDescriptor transformed_out_back_nd{
-      parent_, transformed_output_descriptor, cudnn_type};
+  ScopedTensorDescriptor orig_out_back_nd(*output_descriptor, cudnn_type);
+  ScopedTensorDescriptor transformed_out_back_nd(transformed_output_descriptor,
+                                                 cudnn_type);
 
   float alpha = 1.0f;
   float beta = 0.0f;
-  auto status = wrap::cudnnTransformTensor(
-      this, stream, ToHandle(dnn_handle_), &alpha, orig_out_back_nd.handle(),
+  auto status = cudnnTransformTensor(
+      cudnn.handle(), &alpha, orig_out_back_nd.handle(),
       backward_output_data.opaque(), &beta, transformed_out_back_nd.handle(),
       (*transform_scratch)->mutable_device_memory()->opaque());
 
@@ -3451,6 +3091,7 @@ DeviceMemory<T> CudnnSupport::MaybeTransformLayout(
   output_descriptor->set_layout(dnn::DataLayout::kBatchDepthYX);
   return (*transform_scratch)->device_memory();
 }
+}  // namespace
 
 bool CudnnSupport::DoTransformTensor(Stream* stream,
                                      const dnn::BatchDescriptor& input_desc,
@@ -3459,21 +3100,15 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
                                      const dnn::BatchDescriptor& output_desc,
                                      dnn::DataType output_type, float scale,
                                      DeviceMemoryBase* output_data) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
-  }
-
   float beta = 0.0f;
   ScopedTensorDescriptor input_tensor_desc(
-      parent_, input_desc, ToCudnnDataType(input_type, input_desc.layout()));
+      input_desc, ToCudnnDataType(input_type, input_desc.layout()));
   ScopedTensorDescriptor output_tensor_desc(
-      parent_, output_desc, ToCudnnDataType(output_type, output_desc.layout()));
-  status = wrap::cudnnTransformTensor(
-      this, stream, ToHandle(dnn_handle_), &scale, input_tensor_desc.handle(),
-      input_data.opaque(), &beta, output_tensor_desc.handle(),
-      output_data->opaque());
+      output_desc, ToCudnnDataType(output_type, output_desc.layout()));
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = cudnnTransformTensor(
+      cudnn.handle(), &scale, input_tensor_desc.handle(), input_data.opaque(),
+      &beta, output_tensor_desc.handle(), output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "Could not transform a tensor with layout "
                << input_desc.ToString() << " and data type "
@@ -3487,22 +3122,15 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
 
 template <class T>
 bool CudnnSupport::DoConvolveBackwardDataImpl(
-    Stream* stream,
-    const FilterDescriptor& filter_descriptor,
+    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<T>& filter_data,
-    const BatchDescriptor& output_descriptor_in,
+    const dnn::BatchDescriptor& output_descriptor_in,
     DeviceMemory<T> backward_output_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const BatchDescriptor& input_descriptor,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& input_descriptor,
     DeviceMemory<T>* backward_input_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
-  }
-
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
   // Alpha is the scaling factor for input.
   float falpha = 1.0;
@@ -3515,19 +3143,21 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
   void* beta = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dbeta)
                                                : static_cast<void*>(&fbeta);
 
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+
   // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass.
-  BatchDescriptor output_descriptor;
+  dnn::BatchDescriptor output_descriptor;
   output_descriptor.CloneFrom(output_descriptor_in);
   std::unique_ptr<TemporaryDeviceMemory<T>> transform_scratch;
-  backward_output_data = MaybeTransformLayout(
-      stream, &output_descriptor, backward_output_data, &transform_scratch);
+  backward_output_data =
+      MaybeTransformLayout(stream, cudnn, &output_descriptor,
+                           backward_output_data, &transform_scratch);
 
-  ScopedTensorDescriptor out_back_nd{parent_, output_descriptor, cudnn_type};
-  ScopedTensorDescriptor in_back_nd{parent_, input_descriptor, cudnn_type};
-  ScopedFilterDescriptor filter{parent_, filter_descriptor, input_descriptor,
-                                cudnn_type};
-  ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
-                                   GetConvComputeType<T>()};
+  ScopedTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
+  ScopedTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
+  ScopedFilterDescriptor filter(filter_descriptor, cudnn_type);
+  ScopedConvolutionDescriptor conv(convolution_descriptor,
+                                   GetConvComputeType<T>());
 
   const bool is_profiling = output_profile_result != nullptr;
   cudnnConvolutionBwdDataAlgo_t algo;
@@ -3535,8 +3165,8 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
 
   if (algorithm_config.algorithm().is_default()) {
     // With the default algorithm, use Cudnn's heuristics.
-    auto get_algorithm = [&](bool specify_limit) SHARED_LOCKS_REQUIRED(
-        dnn_handle_mutex_) -> cudnnConvolutionBwdDataAlgo_t {
+    auto get_algorithm =
+        [&](bool specify_limit) -> cudnnConvolutionBwdDataAlgo_t {
       cudnnConvolutionBwdDataPreference_t preference =
           specify_limit ? CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
                         : CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
@@ -3549,8 +3179,8 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
         memory_limit_bytes = 0;
       }
       cudnnConvolutionBwdDataAlgo_t algo_to_use;
-      cudnnStatus_t status = wrap::cudnnGetConvolutionBackwardDataAlgorithm(
-          parent_, ToHandle(dnn_handle_),
+      cudnnStatus_t status = cudnnGetConvolutionBackwardDataAlgorithm(
+          cudnn.handle(),
           /*filterDesc=*/filter.handle(),
           /*diffDesc=*/out_back_nd.handle(),
           /*convDesc=*/conv.handle(),
@@ -3568,8 +3198,8 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
 
     if (scratch_allocator != nullptr) {
       size_t size_in_bytes;
-      status = wrap::cudnnGetConvolutionBackwardDataWorkspaceSize(
-          parent_, ToHandle(dnn_handle_),
+      auto status = cudnnGetConvolutionBackwardDataWorkspaceSize(
+          cudnn.handle(),
           /*filterDesc=*/filter.handle(),
           /*diffDesc=*/out_back_nd.handle(),
           /*convDesc=*/conv.handle(),
@@ -3605,8 +3235,8 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
     algo = ToConvBackwardDataAlgo(algotype);
     conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
     size_t size_in_bytes;
-    status = wrap::cudnnGetConvolutionBackwardDataWorkspaceSize(
-        parent_, ToHandle(dnn_handle_),
+    auto status = cudnnGetConvolutionBackwardDataWorkspaceSize(
+        cudnn.handle(),
         /*filterDesc=*/filter.handle(),
         /*diffDesc=*/out_back_nd.handle(),
         /*convDesc=*/conv.handle(),
@@ -3662,24 +3292,20 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
     timer->Start(AsCUDAStream(stream));
   }
 
-#if CUDNN_VERSION >= 5000
-  status = wrap::cudnnConvolutionBackwardData(
-#else
-  status = wrap::cudnnConvolutionBackwardData_v3(
-#endif
-      this, stream, ToHandle(dnn_handle_),
-      /*alpha=*/alpha,
-      /*filterDesc=*/filter.handle(),
-      /*filterData=*/filter_data.opaque(),
-      /*diffDesc=*/out_back_nd.handle(),
-      /*diffData=*/backward_output_data.opaque(),
-      /*convDesc=*/conv.handle(),
-      /*algo=*/algo,
-      /*workSpace=*/scratch.opaque(),
-      /*workSpaceSizeInBytes=*/scratch.size(),
-      /*beta=*/beta,
-      /*gradDesc=*/in_back_nd.handle(),
-      /*gradData=*/backward_input_data->opaque());
+  auto status =
+      cudnnConvolutionBackwardData(cudnn.handle(),
+                                   /*alpha=*/alpha,
+                                   /*wDesc=*/filter.handle(),
+                                   /*w=*/filter_data.opaque(),
+                                   /*dyDesc=*/out_back_nd.handle(),
+                                   /*dy=*/backward_output_data.opaque(),
+                                   /*convDesc=*/conv.handle(),
+                                   /*algo=*/algo,
+                                   /*workSpace=*/scratch.opaque(),
+                                   /*workSpaceSizeInBytes=*/scratch.size(),
+                                   /*beta=*/beta,
+                                   /*dxDesc=*/in_back_nd.handle(),
+                                   /*dx=*/backward_input_data->opaque());
   if (is_profiling) {
     timer->Stop(AsCUDAStream(stream));
     if (status == CUDNN_STATUS_SUCCESS) {
@@ -3703,12 +3329,12 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
 }
 
 bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const FilterDescriptor& filter_descriptor,
+    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<double>& filter_data,
-    const BatchDescriptor& output_descriptor,
+    const dnn::BatchDescriptor& output_descriptor,
     DeviceMemory<double> backward_output_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const BatchDescriptor& input_descriptor,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& input_descriptor,
     DeviceMemory<double>* backward_input_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
@@ -3721,12 +3347,12 @@ bool CudnnSupport::DoConvolveBackwardData(
 }
 
 bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const FilterDescriptor& filter_descriptor,
+    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<float>& filter_data,
-    const BatchDescriptor& output_descriptor,
+    const dnn::BatchDescriptor& output_descriptor,
     DeviceMemory<float> backward_output_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const BatchDescriptor& input_descriptor,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& input_descriptor,
     DeviceMemory<float>* backward_input_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
@@ -3739,12 +3365,12 @@ bool CudnnSupport::DoConvolveBackwardData(
 }
 
 bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const FilterDescriptor& filter_descriptor,
+    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<Eigen::half>& filter_data,
-    const BatchDescriptor& output_descriptor,
+    const dnn::BatchDescriptor& output_descriptor,
     DeviceMemory<Eigen::half> backward_output_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const BatchDescriptor& input_descriptor,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& input_descriptor,
     DeviceMemory<Eigen::half>* backward_input_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
@@ -3767,12 +3393,6 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
     DeviceMemory<T>* backward_filter_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
-  }
-
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
   // Alpha is the scaling factor for input.
   float falpha = 1.0;
@@ -3785,19 +3405,21 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
   void* beta = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dbeta)
                                                : static_cast<void*>(&fbeta);
 
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+
   // TBD(keveman): remove once cuDNN supports kBatchYXDepth for backward pass.
-  BatchDescriptor output_descriptor;
+  dnn::BatchDescriptor output_descriptor;
   output_descriptor.CloneFrom(output_descriptor_in);
   std::unique_ptr<TemporaryDeviceMemory<T>> transform_scratch;
-  backward_output_data = MaybeTransformLayout(
-      stream, &output_descriptor, backward_output_data, &transform_scratch);
+  backward_output_data =
+      MaybeTransformLayout(stream, cudnn, &output_descriptor,
+                           backward_output_data, &transform_scratch);
 
-  ScopedTensorDescriptor out_back_nd{parent_, output_descriptor, cudnn_type};
-  ScopedTensorDescriptor input_nd{parent_, input_descriptor, cudnn_type};
-  ScopedFilterDescriptor filter{parent_, filter_descriptor, input_descriptor,
-                                cudnn_type};
-  ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
-                                   GetConvComputeType<T>()};
+  ScopedTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
+  ScopedTensorDescriptor input_nd(input_descriptor, cudnn_type);
+  ScopedFilterDescriptor filter(filter_descriptor, cudnn_type);
+  ScopedConvolutionDescriptor conv(convolution_descriptor,
+                                   GetConvComputeType<T>());
 
   const bool is_profiling = output_profile_result != nullptr;
   cudnnConvolutionBwdFilterAlgo_t algo;
@@ -3809,8 +3431,7 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
     // Lambda that retrieves the algorithm.
     // specify_limit will occur when we have a scratch allocator and it succeeds
     // in allocating; otherwise, we'll fall back to the "no workspace" version.
-    auto get_algorithm = [&](bool specify_limit) SHARED_LOCKS_REQUIRED(
-        dnn_handle_mutex_) {
+    auto get_algorithm = [&](bool specify_limit) {
       cudnnConvolutionBwdFilterPreference_t preference =
           specify_limit ? CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
                         : CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
@@ -3824,8 +3445,8 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
       }
 
       cudnnConvolutionBwdFilterAlgo_t algo_to_use;
-      cudnnStatus_t status = wrap::cudnnGetConvolutionBackwardFilterAlgorithm(
-          parent_, ToHandle(dnn_handle_),
+      cudnnStatus_t status = cudnnGetConvolutionBackwardFilterAlgorithm(
+          cudnn.handle(),
           /*srcDesc=*/input_nd.handle(),
           /*diffDesc=*/out_back_nd.handle(),
           /*convDesc=*/conv.handle(),
@@ -3843,9 +3464,10 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
 
     if (scratch_allocator != nullptr) {
       size_t size_in_bytes;
-      status = wrap::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-          parent_, ToHandle(dnn_handle_), /*srcDesc=*/input_nd.handle(),
-          /*diffDesc=*/out_back_nd.handle(), /*convDesc=*/conv.handle(),
+      auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize(
+          cudnn.handle(),
+          /*xDesc=*/input_nd.handle(),
+          /*dyDesc=*/out_back_nd.handle(), /*convDesc=*/conv.handle(),
           /*gradDesc=*/filter.handle(), /*algo=*/algo,
           /*sizeInBytes=*/&size_in_bytes);
       int64 size_in_bytes_int64 = size_in_bytes;
@@ -3878,9 +3500,10 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
     conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
 
     size_t size_in_bytes;
-    status = wrap::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-        parent_, ToHandle(dnn_handle_), /*srcDesc=*/input_nd.handle(),
-        /*diffDesc=*/out_back_nd.handle(), /*convDesc=*/conv.handle(),
+    auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize(
+        cudnn.handle(),
+        /*xDesc=*/input_nd.handle(),
+        /*dyDesc=*/out_back_nd.handle(), /*convDesc=*/conv.handle(),
         /*gradDesc=*/filter.handle(), /*algo=*/algo,
         /*sizeInBytes=*/&size_in_bytes);
     if (status != CUDNN_STATUS_SUCCESS) {
@@ -3933,12 +3556,9 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
     timer->Start(AsCUDAStream(stream));
   }
 
-#if CUDNN_VERSION >= 5000
-  status = wrap::cudnnConvolutionBackwardFilter(
-#else
-  status = wrap::cudnnConvolutionBackwardFilter_v3(
-#endif
-      this, stream, ToHandle(dnn_handle_), /*alpha=*/alpha,
+  auto status = cudnnConvolutionBackwardFilter(
+      cudnn.handle(),
+      /*alpha=*/alpha,
       /*srcDesc=*/input_nd.handle(),
       /*srcData=*/input_data.opaque(),
       /*diffDesc=*/out_back_nd.handle(),
@@ -4033,25 +3653,19 @@ bool CudnnSupport::DoConvolveBackwardBiasImpl(
     const DeviceMemory<T>& input_data,
     const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<T>* backward_bias_data) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
-  }
-
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
-  ScopedTensorDescriptor input_nd{parent_, input_descriptor, cudnn_type};
-  ScopedTensorDescriptor bias_nd{parent_, bias_descriptor, cudnn_type};
+  ScopedTensorDescriptor input_nd(input_descriptor, cudnn_type);
+  ScopedTensorDescriptor bias_nd(bias_descriptor, cudnn_type);
 
   // Alpha is the scaling factor for input.
   float alpha = 1.0;
   // Beta is the scaling factor for output.
   float beta = 0.0;
 
-  status = wrap::cudnnConvolutionBackwardBias(
-      this, stream, ToHandle(dnn_handle_), &alpha, input_nd.handle(),
-      input_data.opaque(), &beta, bias_nd.handle(),
-      backward_bias_data->opaque());
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = cudnnConvolutionBackwardBias(
+      cudnn.handle(), &alpha, input_nd.handle(), input_data.opaque(), &beta,
+      bias_nd.handle(), backward_bias_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to enqueue backward convolution on stream: "
                << ToString(status);
@@ -4061,27 +3675,27 @@ bool CudnnSupport::DoConvolveBackwardBiasImpl(
 }
 
 bool CudnnSupport::DoConvolveBackwardBias(
-    Stream* stream, const BatchDescriptor& input_descriptor,
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<double>& input_data,
-    const BatchDescriptor& bias_descriptor,
+    const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<double>* backward_bias_data) {
   return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
                                     bias_descriptor, backward_bias_data);
 }
 
 bool CudnnSupport::DoConvolveBackwardBias(
-    Stream* stream, const BatchDescriptor& input_descriptor,
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<float>& input_data,
-    const BatchDescriptor& bias_descriptor,
+    const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<float>* backward_bias_data) {
   return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
                                     bias_descriptor, backward_bias_data);
 }
 
 bool CudnnSupport::DoConvolveBackwardBias(
-    Stream* stream, const BatchDescriptor& input_descriptor,
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<Eigen::half>& input_data,
-    const BatchDescriptor& bias_descriptor,
+    const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<Eigen::half>* backward_bias_data) {
   return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
                                     bias_descriptor, backward_bias_data);
@@ -4227,17 +3841,15 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
                              const DeviceMemory<float>& biases,
                              const dnn::BatchDescriptor& dimensions,
                              DeviceMemory<float>* output_data) {
-  ScopedTensorDescriptor input_descriptor{parent_, dimensions,
-                                          CUDNN_DATA_FLOAT};
+  ScopedTensorDescriptor input_descriptor(dimensions, CUDNN_DATA_FLOAT);
 
-  BatchDescriptor bias_dimensions;
+  dnn::BatchDescriptor bias_dimensions;
   bias_dimensions.set_count(1)
       .set_feature_map_count(dimensions.feature_map_count())
       .set_height(1)
       .set_width(1)
       .set_layout(dnn::DataLayout::kBatchYXDepth);
-  ScopedTensorDescriptor bias_descriptor{parent_, bias_dimensions,
-                                         CUDNN_DATA_FLOAT};
+  ScopedTensorDescriptor bias_descriptor(bias_dimensions, CUDNN_DATA_FLOAT);
 
   // cudnnAddTensor after R3 is in-place, so we need to copy input_data to
   // output_data before doing the addition, unless the input and
@@ -4253,23 +3865,14 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
     }
   }
 
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
   const float alpha = 1.0f;
   const float beta = 1.0f;
 
-#if CUDNN_VERSION >= 5000
-  status = wrap::cudnnAddTensor(
-#else
-  status = wrap::cudnnAddTensor_v3(
-#endif
-      this, stream, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
-      biases.opaque(), &beta, input_descriptor.handle(), output_data->opaque());
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+
+  auto status = cudnnAddTensor(
+      cudnn.handle(), &alpha, bias_descriptor.handle(), biases.opaque(), &beta,
+      input_descriptor.handle(), output_data->opaque());
 
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "stream " << stream << " could not enqueue bias addition.";
@@ -4285,59 +3888,19 @@ bool CudnnSupport::DoActivate(Stream* stream,
                               const DeviceMemory<float>& input_data,
                               DeviceMemory<float>* output_data,
                               uint64 options) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
-#if CUDNN_VERSION >= 5000
-  ScopedActivationDescriptor activation_desc{
-      parent_, activation_mode, CUDNN_PROPAGATE_NAN, dimensions.value_max()};
-#else
-  cudnnActivationMode_t mode;
-  switch (activation_mode) {
-    case dnn::ActivationMode::kRelu6:
-      // TODO(leary) should probably do a post-pass to clip at 6?
-      LOG(WARNING) << "user requested Relu6, but providing Relu instead";
-      mode = CUDNN_ACTIVATION_RELU;
-      break;
-    case dnn::ActivationMode::kReluX:
-      // TODO(broune) should probably do a post-pass to clip at X?
-      LOG(WARNING) << "user requested ReluX, but providing Relu instead";
-      mode = CUDNN_ACTIVATION_RELU;
-      break;
-    case dnn::ActivationMode::kRelu:
-      mode = CUDNN_ACTIVATION_RELU;
-      break;
-    case dnn::ActivationMode::kSigmoid:
-      mode = CUDNN_ACTIVATION_SIGMOID;
-      break;
-    case dnn::ActivationMode::kTanh:
-      mode = CUDNN_ACTIVATION_TANH;
-      break;
-    default:
-      LOG(ERROR) << "unrecognized activation mode: "
-                 << static_cast<int>(activation_mode);
-      return false;
-  }
-#endif
+  ScopedActivationDescriptor activation_desc(
+      activation_mode, CUDNN_PROPAGATE_NAN, dimensions.value_max());
 
-  ScopedTensorDescriptor input_nd{parent_, dimensions, CUDNN_DATA_FLOAT};
+  ScopedTensorDescriptor input_nd(dimensions, CUDNN_DATA_FLOAT);
   // Alpha is the input scaling factor.
   float alpha = 1.0;
   // Beta is the output scaling factor.
   float beta = 0.0;
-  status = wrap::cudnnActivationForward(
-      this, stream, ToHandle(dnn_handle_),
-#if CUDNN_VERSION >= 5000
-      activation_desc.handle(),
-#else
-      mode,
-#endif
-      &alpha, input_nd.handle(), input_data.opaque(), &beta, input_nd.handle(),
-      output_data->opaque());
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = cudnnActivationForward(
+      cudnn.handle(), activation_desc.handle(), &alpha, input_nd.handle(),
+      input_data.opaque(), &beta, input_nd.handle(), output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "stream " << stream
                << " could not enqueue activation: " << ToString(status);
@@ -4353,26 +3916,19 @@ bool CudnnSupport::DoPoolForward(
     const DeviceMemory<double>& input_data,
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<double>* output_data) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
   // Alpha is the scaling factor for input.
   double alpha = 1.0;
   // Beta is the scaling factor for output.
   double beta = 0.0;
 
-  ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_DOUBLE};
-  ScopedTensorDescriptor dest_desc{parent_, output_dimensions,
-                                   CUDNN_DATA_DOUBLE};
-  ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
-  status = wrap::cudnnPoolingForward(
-      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
-      src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
-      output_data->opaque());
+  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_DOUBLE);
+  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_DOUBLE);
+  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = cudnnPoolingForward(
+      cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+      input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to enqueue forward pooling on stream: "
                << ToString(status);
@@ -4387,26 +3943,19 @@ bool CudnnSupport::DoPoolForward(
     const DeviceMemory<float>& input_data,
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<float>* output_data) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
   // Alpha is the scaling factor for input.
   float alpha = 1.0;
   // Beta is the scaling factor for output.
   float beta = 0.0;
 
-  ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_FLOAT};
-  ScopedTensorDescriptor dest_desc{parent_, output_dimensions,
-                                   CUDNN_DATA_FLOAT};
-  ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
-  status = wrap::cudnnPoolingForward(
-      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
-      src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
-      output_data->opaque());
+  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_FLOAT);
+  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_FLOAT);
+  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = cudnnPoolingForward(
+      cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+      input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to enqueue forward pooling on stream: "
                << ToString(status);
@@ -4421,25 +3970,18 @@ bool CudnnSupport::DoPoolForward(
     const DeviceMemory<Eigen::half>& input_data,
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<Eigen::half>* output_data) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
   // Alpha is the scaling factor for input.
   float alpha = 1.0;
   // Beta is the scaling factor for output.
   float beta = 0.0;
 
-  ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_HALF};
-  ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
-  ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
-  status = wrap::cudnnPoolingForward(
-      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
-      src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
-      output_data->opaque());
+  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_HALF);
+  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
+  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = cudnnPoolingForward(
+      cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+      input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to enqueue forward pooling on stream: "
                << ToString(status);
@@ -4456,27 +3998,21 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<double>& output_data,
     const DeviceMemory<double>& input_diff_data,
     DeviceMemory<double>* output_diff_data) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
   // Alpha is the scaling factor for input.
   double alpha = 1.0;
   // Beta is the scaling factor for output.
   double beta = 0.0;
 
-  ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_DOUBLE};
-  ScopedTensorDescriptor dest_desc{parent_, output_dimensions,
-                                   CUDNN_DATA_DOUBLE};
-  ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
-  status = wrap::cudnnPoolingBackward(
-      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
-      dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
-      input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
-      src_desc.handle(), output_diff_data->opaque());
+  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_DOUBLE);
+  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_DOUBLE);
+  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = cudnnPoolingBackward(
+      cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
+      output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
+      src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(),
+      output_diff_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to enqueue backward pooling on stream: "
                << ToString(status);
@@ -4493,27 +4029,21 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<float>& output_data,
     const DeviceMemory<float>& input_diff_data,
     DeviceMemory<float>* output_diff_data) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
   // Alpha is the scaling factor for input.
   float alpha = 1.0;
   // Beta is the scaling factor for output.
   float beta = 0.0;
 
-  ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_FLOAT};
-  ScopedTensorDescriptor dest_desc{parent_, output_dimensions,
-                                   CUDNN_DATA_FLOAT};
-  ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
-  status = wrap::cudnnPoolingBackward(
-      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
-      dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
-      input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
-      src_desc.handle(), output_diff_data->opaque());
+  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_FLOAT);
+  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_FLOAT);
+  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = cudnnPoolingBackward(
+      cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
+      output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
+      src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(),
+      output_diff_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to enqueue backward pooling on stream: "
                << ToString(status);
@@ -4530,26 +4060,21 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<Eigen::half>& output_data,
     const DeviceMemory<Eigen::half>& input_diff_data,
     DeviceMemory<Eigen::half>* output_diff_data) {
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
   // Alpha is the scaling factor for input.
   float alpha = 1.0;
   // Beta is the scaling factor for output.
   float beta = 0.0;
 
-  ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_HALF};
-  ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
-  ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
-  status = wrap::cudnnPoolingBackward(
-      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
-      dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
-      input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
-      src_desc.handle(), output_diff_data->opaque());
+  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_HALF);
+  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
+  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = cudnnPoolingBackward(
+      cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
+      output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
+      src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(),
+      output_diff_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to enqueue backward pooling on stream: "
                << ToString(status);
@@ -4571,7 +4096,7 @@ bool CudnnSupport::DoNormalizeWithDimensions(
     const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
   // Check for unsupported modes.
   if (normalize_descriptor.wrap_around()) {
-    LOG(ERROR) << "CUDA LRN does not support wrap-around mode";
+    LOG(ERROR) << "CUDA LRN does not support cudnn-around mode";
     return false;
   }
   if (normalize_descriptor.segment_size()) {
@@ -4579,26 +4104,21 @@ bool CudnnSupport::DoNormalizeWithDimensions(
     return false;
   }
 
-  // Launch the normalization.
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
-  ScopedTensorDescriptor dims{parent_, dimensions, CUDNN_DATA_FLOAT};
-  ScopedNormalizeDescriptor normalize{parent_, normalize_descriptor};
+  ScopedTensorDescriptor dims(dimensions, CUDNN_DATA_FLOAT);
+  ScopedNormalizeDescriptor normalize(normalize_descriptor);
 
   // Alpha is the scaling factor for input.
   float alpha = 1.0f;
   // Beta is the scaling factor for output.
   float beta = 0.0f;
 
-  status = wrap::cudnnLRNCrossChannelForward(
-      this, stream, ToHandle(dnn_handle_), normalize.handle(),
-      CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, dims.handle(), input_data.opaque(),
-      &beta, dims.handle(), output_data->opaque());
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+
+  // Launch the normalization.
+  auto status = cudnnLRNCrossChannelForward(
+      cudnn.handle(), normalize.handle(), CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha,
+      dims.handle(), input_data.opaque(), &beta, dims.handle(),
+      output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to run cudnnLRNCrossChannelForward";
     return false;
@@ -4614,7 +4134,7 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
     DeviceMemory<float>* raw_variable_gradient) {
   // Check for unsupported modes.
   if (normalize_descriptor.wrap_around()) {
-    LOG(ERROR) << "CUDA LRN does not support wrap-around mode";
+    LOG(ERROR) << "CUDA LRN does not support cudnn-around mode";
     return false;
   }
   if (normalize_descriptor.segment_size()) {
@@ -4622,23 +4142,16 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
     return false;
   }
 
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
-    return false;
-  }
-
-  ScopedTensorDescriptor dims{parent_, dimensions, CUDNN_DATA_FLOAT};
-  ScopedNormalizeDescriptor normalize{parent_, normalize_descriptor};
+  ScopedTensorDescriptor dims(dimensions, CUDNN_DATA_FLOAT);
+  ScopedNormalizeDescriptor normalize(normalize_descriptor);
 
   float alpha = 1.0f;
   float beta = 0.0f;
 
-  status = wrap::cudnnLRNCrossChannelBackward(
-      this, stream, ToHandle(dnn_handle_), normalize.handle(),
-      CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, dims.handle(),
-      normalized_data.opaque(), dims.handle(),
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = cudnnLRNCrossChannelBackward(
+      cudnn.handle(), normalize.handle(), CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha,
+      dims.handle(), normalized_data.opaque(), dims.handle(),
       normalized_variable_gradient.opaque(), dims.handle(), raw_data.opaque(),
       &beta, dims.handle(), raw_variable_gradient->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4750,21 +4263,18 @@ bool CudnnSupport::DoMemcpyH2DQuantized(
 }
 
 bool CudnnSupport::DeriveOutputBatchDescriptor(
-    const BatchDescriptor& batch_descriptor,
-    const FilterDescriptor& filter_descriptor,
+    const dnn::BatchDescriptor& batch_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     dnn::BatchDescriptor* output_batch_descriptor) {
-  ScopedTensorDescriptor input_nd{parent_, batch_descriptor, CUDNN_DATA_FLOAT};
-  ScopedFilterDescriptor filter{parent_, filter_descriptor, batch_descriptor,
-                                CUDNN_DATA_FLOAT};
-  ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
-                                   CUDNN_DATA_FLOAT};
+  ScopedTensorDescriptor input_nd(batch_descriptor, CUDNN_DATA_FLOAT);
+  ScopedFilterDescriptor filter(filter_descriptor, CUDNN_DATA_FLOAT);
+  ScopedConvolutionDescriptor conv(convolution_descriptor, CUDNN_DATA_FLOAT);
 
   int dn = batch_descriptor.ndims() + 2;
   std::vector<int> dims(dn);  // in BDYX
-  auto status = wrap::cudnnGetConvolutionNdForwardOutputDim(
-      parent_, conv.handle(), input_nd.handle(), filter.handle(), dn,
-      dims.data());
+  auto status = cudnnGetConvolutionNdForwardOutputDim(
+      conv.handle(), input_nd.handle(), filter.handle(), dn, dims.data());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "could not get output tensor for convolution: "
                << ToString(status);
@@ -4793,9 +4303,8 @@ void initialize_cudnn() {
             cuda::CUDAExecutor* cuda_executor =
                 dynamic_cast<cuda::CUDAExecutor*>(parent);
             if (cuda_executor == nullptr) {
-              LOG(ERROR)
-                  << "Attempting to initialize an instance of the cuBLAS "
-                  << "support library with a non-CUDA StreamExecutor";
+              LOG(ERROR) << "Attempting to initialize an instance of the cuDNN "
+                         << "support library with a non-CUDA StreamExecutor";
               return nullptr;
             }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index dfe2779949239fdd81c01bcd5784ec8cfb0bdb31..e2de3c62d81ae56c28fd4b888c74435ceecc6b22 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -19,6 +19,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
 
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/dnn.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
@@ -42,7 +43,6 @@ extern const PluginId kCuDnnPlugin;
 class CudnnSupport : public dnn::DnnSupport {
  public:
   explicit CudnnSupport(CUDAExecutor* parent);
-  ~CudnnSupport() override;
 
   port::Status Init() override;
   port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
@@ -624,54 +624,11 @@ class CudnnSupport : public dnn::DnnSupport {
                          dnn::DataType output_type, float scale,
                          DeviceMemoryBase* output_data) override;
 
-  const Stream* GetCurrentDnnStream() const
-      SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) {
-    return current_dnn_stream_;
-  }
-
-  void SetCurrentDnnStream(Stream* stream)
-      EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_) {
-    current_dnn_stream_ = stream;
-  }
-
-  CUDAExecutor* GetParentExecutor() { return parent_; }
-
-  // Guards the enqueueing of DNN operations via the dnn_handle_ below, and
-  // access to current_dnn_stream_.
-  //
-  // This is a public member because we need to add thread safety annotations in
-  // the cudnn wrapper functions in the cc file, which need to access this
-  // mutex (the annotations require C++ permission checks).
-  mutex dnn_handle_mutex_;
-
  private:
   CUDAExecutor* parent_;  // Parent executor object. Not owned.
 
-  // cudnn library handle. cudnnHandle_t type is not present in this header to
-  // prevent third-party library header inclusions from leaking outside the
-  // single cuda_dnn translation unit.
-  void* dnn_handle_ GUARDED_BY(dnn_handle_mutex_);
-
-  // The current cudnn stream that is set by cudnnSetStream().
-  Stream* current_dnn_stream_ GUARDED_BY(dnn_handle_mutex_);
-
-  // NOTE(keveman): Temporary data layout transformation until cuDNN supports
-  // kBatchYXDepth for backward pass. This function allocates temporary memory,
-  // lays out the source data into the temporary but in the kBatchDepthXY
-  // layout, and returns the temporary memory. The caller is responsible for
-  // deallocating the temporary. Since the allocation is done using Stream's
-  // AllocateTemporaryMemory, a later BlockHostUntilDone could be used for
-  // deallocation.
-  //
-  // transform_scratch is populated with a legitimate temporary allocation iff
-  // the original output data needs to be transformed.
-  template<class T>
-  DeviceMemory<T> MaybeTransformLayout(
-      Stream* stream,
-      dnn::BatchDescriptor* output_descriptor,
-      DeviceMemory<T> backward_output_data,
-      std::unique_ptr<TemporaryDeviceMemory<T>>* transform_scratch)
-      EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_);
+  // Provides access to the cuDNN handle.
+  std::unique_ptr<class CudnnAccess> cudnn_;
 
   template <class T, class U>
   bool DoBatchNormalizationForwardImpl(
@@ -700,7 +657,7 @@ class CudnnSupport : public dnn::DnnSupport {
 
   template <class T>
   bool DoConvolveImpl(Stream* stream,
-                      const dnn::BatchDescriptor& batch_descriptor,
+                      const dnn::BatchDescriptor& input_descriptor,
                       const DeviceMemory<T>& input_data,
                       const dnn::FilterDescriptor& filter_descriptor,
                       const DeviceMemory<T>& filter_data,
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 71cab145b9bb5a8320df8fb78241dd0d37d54239..09e9f9f7582904e7bdccb54abafa1f99f6e3e2a8 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -21,21 +21,22 @@ limitations under the License.
 #include <set>
 #include <utility>
 
+#include "cuda/include/cuda_runtime.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/lib/casts.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/human_readable.h"
+#include "tensorflow/stream_executor/lib/inlined_vector.h"
 #include "tensorflow/stream_executor/lib/notification.h"
-#include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/lib/inlined_vector.h"
 
 bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
 bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
@@ -62,14 +63,14 @@ class CreatedContexts {
  public:
   // Returns whether context is a member of the live set.
   static bool Has(CUcontext context) {
-    tf_shared_lock lock{mu_};
+    tf_shared_lock lock(mu_);
     return Live()->find(context) != Live()->end();
   }
 
   // Adds context to the live set.
   static CudaContext* Add(CUcontext context) {
     CHECK(context != nullptr);
-    mutex_lock lock{mu_};
+    mutex_lock lock(mu_);
     auto cuda_context = new CudaContext(context, next_id_++);
     Live()->insert(
         std::make_pair(context, std::unique_ptr<CudaContext>(cuda_context)));
@@ -79,7 +80,7 @@ class CreatedContexts {
   // Removes context from the live set.
   static void Remove(CUcontext context) {
     CHECK(context != nullptr);
-    mutex_lock lock{mu_};
+    mutex_lock lock(mu_);
     auto it = Live()->find(context);
     CHECK(it != Live()->end()) << context;
     Live()->erase(it);
@@ -204,11 +205,11 @@ string ToString(CUresult result) {
     case 719:
       return "CUDA_ERROR_LAUNCH_FAILED";
 
-    OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE)
-    OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED)
-    OSTREAM_CUDA_ERROR(NOT_PERMITTED)
-    OSTREAM_CUDA_ERROR(NOT_SUPPORTED)
-    OSTREAM_CUDA_ERROR(UNKNOWN)  // Unknown internal error to CUDA.
+      OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE)
+      OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED)
+      OSTREAM_CUDA_ERROR(NOT_PERMITTED)
+      OSTREAM_CUDA_ERROR(NOT_SUPPORTED)
+      OSTREAM_CUDA_ERROR(UNKNOWN)  // Unknown internal error to CUDA.
     default:
       return port::StrCat("CUresult(", static_cast<int>(result), ")");
   }
@@ -396,8 +397,8 @@ static port::Status InternalInit() {
 
   LOG(ERROR) << "failed call to cuInit: " << ToString(res);
   Diagnostician::LogDiagnosticInformation();
-  return port::Status{port::error::ABORTED,
-                      port::StrCat("failed call to cuInit: ", ToString(res))};
+  return port::Status(port::error::ABORTED,
+                      port::StrCat("failed call to cuInit: ", ToString(res)));
 }
 
 }  // namespace
@@ -425,9 +426,9 @@ static port::Status InternalInit() {
     return port::Status::OK();
   }
 
-  return port::Status{
+  return port::Status(
       port::error::INTERNAL,
-      port::StrCat("failed call to cuDeviceGet: ", ToString(res))};
+      port::StrCat("failed call to cuDeviceGet: ", ToString(res)));
 }
 
 /* static */ bool CUDADriver::GetDeviceName(CUdevice device,
@@ -470,7 +471,8 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
 }
 
 /* static */ port::Status CUDADriver::CreateContext(
-    CUdevice device, DeviceOptions device_options, CudaContext** context) {
+    CUdevice device, const DeviceOptions &device_options,
+    CudaContext **context) {
   *context = nullptr;
 
   int flags = 0;
@@ -481,62 +483,45 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   CUresult res;
   CUcontext former_context;
   CUcontext new_context;
-  {
-    // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
-    // context creation: see http://b/13248943
 
-#if CUDA_VERSION >= 7000
-    {
-      unsigned int former_primary_context_flags;
-      int former_primary_context_is_active;
-      CHECK_EQ(CUDA_SUCCESS,
-               cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
-                                          &former_primary_context_is_active));
-      if (former_primary_context_flags != flags) {
-        if (former_primary_context_is_active) {
-          LOG(ERROR)
-              << "The primary context is active and has a different flag set ("
-              << former_primary_context_flags << ") than the desired flag set ("
-              << flags << ").";
-        } else {
-          CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
-        }
-      }
+  unsigned int former_primary_context_flags;
+  int former_primary_context_is_active;
+  CHECK_EQ(CUDA_SUCCESS,
+           cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
+                                      &former_primary_context_is_active));
+  if (former_primary_context_flags != flags) {
+    if (former_primary_context_is_active) {
+      LOG(ERROR)
+          << "The primary context is active and has a different flag set ("
+          << former_primary_context_flags << ") than the desired flag set ("
+          << flags << ").";
+    } else {
+      CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
     }
+  }
 
-    former_context = CUDADriver::CurrentContextOrDie();
-    res = cuDevicePrimaryCtxRetain(&new_context, device);
-    if (former_context != nullptr) {
-      CUdevice former_device;
-      if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
-        if (former_device == device) {
-          if (former_context == new_context) {
-            VLOG(2) << "The primary context " << former_context
-                    << " for device " << device
-                    << " exists before initializing the StreamExecutor.";
-          } else {
-            LOG(WARNING)
-                << "A non-primary context " << former_context << " for device "
-                << device
-                << " exists before initializing the StreamExecutor. The "
-                << "primary context is now " << new_context << ". We "
-                << "haven't verified StreamExecutor works with that.";
-          }
+  former_context = CUDADriver::CurrentContextOrDie();
+  res = cuDevicePrimaryCtxRetain(&new_context, device);
+  if (former_context != nullptr) {
+    CUdevice former_device;
+    if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
+      if (former_device == device) {
+        if (former_context == new_context) {
+          VLOG(2) << "The primary context " << former_context << " for device "
+                  << device
+                  << " exists before initializing the StreamExecutor.";
+        } else {
+          LOG(WARNING) << "A non-primary context " << former_context
+                       << " for device " << device
+                       << " exists before initializing the StreamExecutor. The "
+                       << "primary context is now " << new_context << ". We "
+                       << "haven't verified StreamExecutor works with that.";
         }
-      } else {
-        LOG(ERROR) << "Failed to get the device of the current context "
-                   << former_context;
       }
+    } else {
+      LOG(ERROR) << "Failed to get the device of the current context "
+                 << former_context;
     }
-#else
-    former_context = CurrentContext();
-    if (former_context != nullptr) {
-      LOG(WARNING)
-          << "creating context when one is currently active; existing: "
-          << former_context;
-    }
-    res = cuCtxCreate(&new_context, flags, device);
-#endif
   }
   CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));
 
@@ -548,11 +533,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
     return port::Status::OK();
   }
 
-#if CUDA_VERSION >= 7000
   string message = "failed call to cuDevicePrimaryCtxRetain: " + ToString(res);
-#else
-  string message = "failed call to cuCtxCreate: " + ToString(res);
-#endif
   if (res == CUDA_ERROR_OUT_OF_MEMORY) {
     uint64 total_memory;
     if (GetDeviceTotalMemory(device, &total_memory)) {
@@ -562,14 +543,13 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
     }
   }
 
-  return port::Status{port::error::INTERNAL, message};
+  return port::Status(port::error::INTERNAL, message);
 }
 
 /* static */ void CUDADriver::DestroyContext(CudaContext* context) {
   if (context == nullptr) {
     return;
   }
-#if CUDA_VERSION >= 7000
   CUcontext former_context = CurrentContext();
   CUresult res = cuCtxSetCurrent(context->context());
   CUdevice device;
@@ -577,9 +557,6 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   cuCtxSetCurrent(former_context);
 
   res = cuDevicePrimaryCtxRelease(device);
-#else
-  CUresult res = cuCtxDestroy(context->context());
-#endif
 
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
@@ -615,7 +592,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
 /* static */ port::StatusOr<CUsharedconfig>
 CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   CUsharedconfig shared_mem_config;
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config);
   if (result != CUDA_SUCCESS) {
     CUdevice device;
@@ -623,16 +600,16 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     LOG(ERROR) << "failed to get CUDA device shared memory config. "
                << "Context device ID: " << device
                << ", result: " << ToString(result);
-    return port::Status{
+    return port::Status(
         port::error::INTERNAL,
-        port::StrCat("failed to get shared memory config: ", ToString(result))};
+        port::StrCat("failed to get shared memory config: ", ToString(result)));
   }
   return shared_mem_config;
 }
 
 /* static */ port::Status CUDADriver::ContextSetSharedMemConfig(
     CudaContext* context, CUsharedconfig shared_mem_config) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult result = cuCtxSetSharedMemConfig(shared_mem_config);
   if (result != CUDA_SUCCESS) {
     CUdevice device;
@@ -641,9 +618,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                << "Context device ID: " << device
                << ", config: " << shared_mem_config
                << ", result: " << ToString(result);
-    return port::Status{
+    return port::Status(
         port::error::INTERNAL,
-        port::StrCat("failed to set shared memory config: ", ToString(result))};
+        port::StrCat("failed to set shared memory config: ", ToString(result)));
   }
   return port::Status::OK();
 }
@@ -654,7 +631,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     unsigned int block_dim_y, unsigned int block_dim_z,
     unsigned int shared_mem_bytes, CUstream stream, void **kernel_params,
     void **extra) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
           << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
           << " bdx: " << block_dim_x << " bdy: " << block_dim_y
@@ -674,11 +651,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 /* static */ port::Status CUDADriver::LoadCubin(CudaContext* context,
                                                 const char *cubin_bytes,
                                                 CUmodule *module) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult result = cuModuleLoadFatBinary(module, cubin_bytes);
   if (result != CUDA_SUCCESS) {
-    return port::Status{port::error::INTERNAL,
-                        "failed to load in-memory CUBIN: " + ToString(result)};
+    return port::Status(port::error::INTERNAL,
+                        "failed to load in-memory CUBIN: " + ToString(result));
   }
 
   return port::Status::OK();
@@ -691,7 +668,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   bool ret = true;
   GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
                                  &notification]() {
-    ScopedActivateContext activation{context};
+    ScopedActivateContext activation(context);
     void *ptx_data = const_cast<char *>(ptx_contents);
     static const unsigned int kLogBufferBytesLimit = 1024;
     unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
@@ -757,7 +734,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 /* static */ bool CUDADriver::SynchronousMemsetUint8(CudaContext* context,
                                                      CUdeviceptr location,
                                                      uint8 value, size_t size) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuMemsetD8(location, value, size);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
@@ -770,7 +747,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                                                       CUdeviceptr location,
                                                       uint32 value,
                                                       size_t uint32_count) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuMemsetD32(location, value, uint32_count);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
@@ -784,7 +761,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                                                       uint8 value,
                                                       size_t uint32_count,
                                                       CUstream stream) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuMemsetD8Async(location, value, uint32_count, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
@@ -799,7 +776,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                                                        uint32 value,
                                                        size_t uint32_count,
                                                        CUstream stream) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuMemsetD32Async(location, value, uint32_count, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
@@ -877,9 +854,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     return device;
   }
 
-  return port::Status{
+  return port::Status(
       port::error::INTERNAL,
-      port::StrCat("failed to get device for context: ", ToString(result))};
+      port::StrCat("failed to get device for context: ", ToString(result)));
 }
 
 /* static */ bool CUDADriver::CreateStream(CudaContext *context,
@@ -937,7 +914,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 
 /* static */ void CUDADriver::DeviceDeallocate(CudaContext* context,
                                                void *location) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
   CUresult res = cuMemFree(pointer);
   if (res != CUDA_SUCCESS) {
@@ -948,9 +925,40 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
+/* static */ void *CUDADriver::UnifiedMemoryAllocate(CudaContext *context,
+                                                     uint64 bytes) {
+  ScopedActivateContext activation(context);
+  CUdeviceptr result = 0;
+  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
+  CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to alloc " << bytes
+               << " bytes unified memory; result: " << ToString(res);
+    return nullptr;
+  }
+  void *ptr = reinterpret_cast<void *>(result);
+  VLOG(2) << "allocated " << ptr << " for context " << context << " of "
+          << bytes << " bytes in unified memory";
+  return ptr;
+}
+
+/* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context,
+                                                      void *location) {
+  ScopedActivateContext activation(context);
+  CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
+  CUresult res = cuMemFree(pointer);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to free unified memory at " << location
+               << "; result: " << ToString(res);
+  } else {
+    VLOG(2) << "deallocated unified memory at " << location << " for context "
+            << context;
+  }
+}
+
 /* static */ void *CUDADriver::HostAllocate(CudaContext *context,
                                             uint64 bytes) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   void *host_mem = nullptr;
   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
   CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
@@ -963,7 +971,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 
 /* static */ void CUDADriver::HostDeallocate(CudaContext* context,
                                              void *location) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuMemFreeHost(location);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "error deallocating host memory at " << location << ": "
@@ -973,7 +981,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 
 /* static */ bool CUDADriver::HostRegister(CudaContext* context, void *location,
                                            uint64 bytes) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
   CUresult res =
       cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
@@ -987,7 +995,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 
 /* static */ bool CUDADriver::HostUnregister(CudaContext* context,
                                              void *location) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuMemHostUnregister(location);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "error unregistering host memory at " << location << ": "
@@ -1000,8 +1008,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 /* static */ port::Status CUDADriver::DestroyEvent(CudaContext* context,
                                                    CUevent *event) {
   if (*event == nullptr) {
-    return port::Status{port::error::INVALID_ARGUMENT,
-                        "input event cannot be null"};
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "input event cannot be null");
   }
 
   ScopedActivateContext activated{context};
@@ -1013,15 +1021,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
       return port::Status::OK();
     case CUDA_ERROR_DEINITIALIZED:
     case CUDA_ERROR_NOT_INITIALIZED:
-      return port::Status{
+      return port::Status(
           port::error::FAILED_PRECONDITION,
           port::Printf("error destroying CUDA event in context %p: %s", context,
-                       ToString(res).c_str())};
+                       ToString(res).c_str()));
     default:
-      return port::Status{
+      return port::Status(
           port::error::INTERNAL,
           port::Printf("error destroying CUDA event in context %p: %s", context,
-                       ToString(res).c_str())};
+                       ToString(res).c_str()));
   }
 }
 
@@ -1035,15 +1043,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
       return port::Status::OK();
     case CUDA_ERROR_DEINITIALIZED:
     case CUDA_ERROR_NOT_INITIALIZED:
-      return port::Status{
+      return port::Status(
           port::error::FAILED_PRECONDITION,
           port::Printf("error recording CUDA event on stream %p: %s", stream,
-                       ToString(res).c_str())};
+                       ToString(res).c_str()));
     default:
-      return port::Status{
+      return port::Status(
           port::error::INVALID_ARGUMENT,
           port::Printf("error recording CUDA event on stream %p: %s", stream,
-                       ToString(res).c_str())};
+                       ToString(res).c_str()));
   }
 }
 
@@ -1052,9 +1060,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   ScopedActivateContext activated{context};
   CUresult res = cuEventQuery(event);
   if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
-    return port::Status{
+    return port::Status(
         port::error::INTERNAL,
-        port::Printf("failed to query event: %s", ToString(res).c_str())};
+        port::Printf("failed to query event: %s", ToString(res).c_str()));
   }
 
   return res;
@@ -1084,7 +1092,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 /* static */ bool CUDADriver::WaitStreamOnEvent(CudaContext* context,
                                                 CUstream stream,
                                                 CUevent event) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
@@ -1095,7 +1103,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 }
 
 /* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuCtxSynchronize();
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
@@ -1141,7 +1149,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                                                            void *host_dst,
                                                            CUdeviceptr gpu_src,
                                                            uint64 size) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(
@@ -1159,7 +1167,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                                                            CUdeviceptr gpu_dst,
                                                            const void *host_src,
                                                            uint64 size) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(port::Printf(
@@ -1176,7 +1184,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                                                            CUdeviceptr gpu_dst,
                                                            CUdeviceptr gpu_src,
                                                            uint64 size) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(port::Printf(
@@ -1194,7 +1202,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                                                     CUdeviceptr gpu_src,
                                                     uint64 size,
                                                     CUstream stream) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
@@ -1214,7 +1222,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                                                     const void *host_src,
                                                     uint64 size,
                                                     CUstream stream) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
@@ -1233,7 +1241,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                                                     CUdeviceptr gpu_src,
                                                     uint64 size,
                                                     CUstream stream) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
   if (result != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
@@ -1275,12 +1283,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   if (res == CUDA_SUCCESS) {
     return port::Status::OK();
   } else if (res == CUDA_ERROR_OUT_OF_MEMORY) {
-    return port::Status{port::error::RESOURCE_EXHAUSTED,
-                        "could not create CUDA event: out of device memory"};
+    return port::Status(port::error::RESOURCE_EXHAUSTED,
+                        "could not create CUDA event: out of device memory");
   } else {
-    return port::Status{
+    return port::Status(
         port::error::FAILED_PRECONDITION,
-        port::StrCat("could not create CUDA event: ", ToString(res))};
+        port::StrCat("could not create CUDA event: ", ToString(res)));
   }
 }
 
@@ -1308,10 +1316,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     return context;
   }
 
-  return port::Status{
+  return port::Status(
       port::error::INTERNAL,
       port::StrCat("failed to query device pointer for context: ",
-                   ToString(result))};
+                   ToString(result)));
 }
 
 /* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace(
@@ -1326,16 +1334,16 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
       case CU_MEMORYTYPE_HOST:
         return MemorySpace::kHost;
       default:
-        return port::Status{
+        return port::Status(
             port::error::INTERNAL,
-            port::StrCat("unknown memory space provided by CUDA API: ", value)};
+            port::StrCat("unknown memory space provided by CUDA API: ", value));
     }
   }
 
-  return port::Status{
+  return port::Status(
       port::error::INTERNAL,
       port::StrCat("failed to query device pointer for memory space: ",
-                   ToString(result))};
+                   ToString(result)));
 }
 
 /* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr,
@@ -1348,16 +1356,16 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     // We differentiate between "this pointer is unknown" (return here) and
     // "there was an internal error while performing this operation" (return
     // below).
-    return port::Status{
+    return port::Status(
         port::error::NOT_FOUND,
         port::Printf("not a device pointer %p; %s",
-                     reinterpret_cast<void *>(dptr), ToString(result).c_str())};
+                     reinterpret_cast<void *>(dptr), ToString(result).c_str()));
   }
 
-  return port::Status{
+  return port::Status(
       port::error::INTERNAL,
       port::Printf("failed to get pointer into for device pointer %p; %s",
-                   reinterpret_cast<void *>(dptr), ToString(result).c_str())};
+                   reinterpret_cast<void *>(dptr), ToString(result).c_str()));
 }
 
 /* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice(
@@ -1380,10 +1388,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     return port::Status::OK();
   }
 
-  return port::Status{
+  return port::Status(
       port::error::INTERNAL,
       port::Printf("failed to get compute capability for device: %s; %d",
-                   ToString(result).c_str(), device)};
+                   ToString(result).c_str(), device));
 }
 
 // Helper function that turns the integer output of cuDeviceGetAttribute to type
@@ -1394,10 +1402,10 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   int value = -1;
   CUresult result = cuDeviceGetAttribute(&value, attribute, device);
   if (result != CUDA_SUCCESS) {
-    return port::Status{
+    return port::Status(
         port::error::NOT_FOUND,
         port::StrCat("could not retrieve CUDA device attribute (", attribute,
-                     "): ", ToString(result))};
+                     "): ", ToString(result)));
   }
   T converted = value;
   return converted;
@@ -1499,10 +1507,10 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   int val;
   CUresult res = cuDeviceGetAttribute(&val, attribute, device);
   if (res != CUDA_SUCCESS) {
-    return port::Status{
+    return port::Status(
         port::error::INTERNAL,
         port::Printf("failed to get device attribute %d for device %d: %s",
-                     attribute, device, ToString(res).c_str())};
+                     attribute, device, ToString(res).c_str()));
   }
   return val;
 }
@@ -1523,7 +1531,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
 /* static */ bool CUDADriver::GetDeviceMemoryInfo(CudaContext* context,
                                                   int64 *free_out,
                                                   int64 *total_out) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
   size_t free = 0;
   size_t total = 0;
   CUresult res = cuMemGetInfo(&free, &total);
@@ -1603,10 +1611,10 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
   if (result != CUDA_SUCCESS &&
       result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
-    return port::Status{
+    return port::Status(
         port::error::INTERNAL,
         port::Printf("failed to enable peer access from %p to %p: %s", from, to,
-                     ToString(result).c_str())};
+                     ToString(result).c_str()));
   }
 
   return port::Status::OK();
@@ -1615,16 +1623,16 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
 /* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore(
     CudaContext* context, CUfunction kernel, int threads_per_block,
     size_t dynamic_shared_memory_bytes) {
-  ScopedActivateContext activation{context};
+  ScopedActivateContext activation(context);
 
   int max_blocks;
   CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
       &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
   if (result != CUDA_SUCCESS) {
-    return port::Status{
+    return port::Status(
         port::error::INTERNAL,
         port::Printf("failed to calculate occupancy of kernel %p: %s", kernel,
-                     ToString(result).c_str())};
+                     ToString(result).c_str()));
   }
 
   return max_blocks;
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index a9969e247e181599f2b3707f6c65c6527dd4683d..3713a5b7b98f8bd5173d649fa592107f06bda27d 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -106,6 +106,16 @@ class CUDADriver {
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
   static void DeviceDeallocate(CudaContext* context, void *location);
 
+  // Allocates a unified memory space of size bytes associated with the given
+  // context via cuMemAllocManaged.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
+  static void* UnifiedMemoryAllocate(CudaContext* context, uint64 bytes);
+
+  // Deallocates a unified memory space of size bytes associated with the given
+  // context via cuMemFree.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
+  static void UnifiedMemoryDeallocate(CudaContext* context, void* location);
+
   // Allocates page-locked and CUDA-registered memory on the host via
   // cuMemAllocHost.
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
@@ -147,7 +157,7 @@ class CUDADriver {
   // userspace processes is given here:
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
   static port::Status CreateContext(CUdevice device,
-                                    DeviceOptions device_options,
+                                    const DeviceOptions& device_options,
                                     CudaContext** context);
 
   // Destroys the provided context via cuCtxDestroy.
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index 5b34740f9f1f9067f949ad41bcae0b97d3d3c7f4..013ca2d7f6d7f9d58837d9aa2ebbefc5906c5623 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -138,8 +138,8 @@ port::Status CUDAFftPlan::Initialize(
                                   CUDAFftType(type), 1 /* = batch */);
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to create cuFFT 1d plan:" << ret;
-            return port::Status{port::error::INTERNAL,
-                                "Failed to create cuFFT 1d plan."};
+            return port::Status(port::error::INTERNAL,
+                                "Failed to create cuFFT 1d plan.");
           }
           return port::Status::OK();
         case 2:
@@ -148,8 +148,8 @@ port::Status CUDAFftPlan::Initialize(
                                   elem_count_[1], CUDAFftType(type));
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to create cuFFT 2d plan:" << ret;
-            return port::Status{port::error::INTERNAL,
-                                "Failed to create cuFFT 2d plan."};
+            return port::Status(port::error::INTERNAL,
+                                "Failed to create cuFFT 2d plan.");
           }
           return port::Status::OK();
         case 3:
@@ -159,29 +159,29 @@ port::Status CUDAFftPlan::Initialize(
                                 elem_count_[2], CUDAFftType(type));
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to create cuFFT 3d plan:" << ret;
-            return port::Status{port::error::INTERNAL,
-                                "Failed to create cuFFT 3d plan."};
+            return port::Status(port::error::INTERNAL,
+                                "Failed to create cuFFT 3d plan.");
           }
           return port::Status::OK();
         default:
           LOG(ERROR) << "Invalid rank value for cufftPlan. "
                         "Requested 1, 2, or 3, given: "
                      << rank;
-          return port::Status{port::error::INVALID_ARGUMENT,
-                              "cufftPlan only takes rank 1, 2, or 3."};
+          return port::Status(port::error::INVALID_ARGUMENT,
+                              "cufftPlan only takes rank 1, 2, or 3.");
       }
     } else {
       ret = wrap::cufftCreate(parent, &plan_);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to create cuFFT plan:" << ret;
-        return port::Status{port::error::INTERNAL,
-                            "Failed to create cuFFT plan."};
+        return port::Status(port::error::INTERNAL,
+                            "Failed to create cuFFT plan.");
       }
       ret = wrap::cufftSetAutoAllocation(parent, plan_, 0);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to set auto allocation for cuFFT plan:" << ret;
-        return port::Status{port::error::INTERNAL,
-                            "Failed to set auto allocation for cuFFT plan."};
+        return port::Status(port::error::INTERNAL,
+                            "Failed to set auto allocation for cuFFT plan.");
       }
       switch (rank) {
         case 1:
@@ -190,8 +190,8 @@ port::Status CUDAFftPlan::Initialize(
                                       &scratch_size_bytes_);
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to make cuFFT 1d plan:" << ret;
-            return port::Status{port::error::INTERNAL,
-                                "Failed to make cuFFT 1d plan."};
+            return port::Status(port::error::INTERNAL,
+                                "Failed to make cuFFT 1d plan.");
           }
           break;
         case 2:
@@ -200,8 +200,8 @@ port::Status CUDAFftPlan::Initialize(
                                       &scratch_size_bytes_);
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to make cuFFT 2d plan:" << ret;
-            return port::Status{port::error::INTERNAL,
-                                "Failed to make cuFFT 2d plan."};
+            return port::Status(port::error::INTERNAL,
+                                "Failed to make cuFFT 2d plan.");
           }
           break;
         case 3:
@@ -210,16 +210,16 @@ port::Status CUDAFftPlan::Initialize(
                                       CUDAFftType(type), &scratch_size_bytes_);
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to make cuFFT 3d plan:" << ret;
-            return port::Status{port::error::INTERNAL,
-                                "Failed to make cuFFT 3d plan."};
+            return port::Status(port::error::INTERNAL,
+                                "Failed to make cuFFT 3d plan.");
           }
           break;
         default:
           LOG(ERROR) << "Invalid rank value for cufftPlan. "
                         "Requested 1, 2, or 3, given: "
                      << rank;
-          return port::Status{port::error::INVALID_ARGUMENT,
-                              "cufftPlan only takes rank 1, 2, or 3."};
+          return port::Status(port::error::INVALID_ARGUMENT,
+                              "cufftPlan only takes rank 1, 2, or 3.");
       }
       return UpdateScratchAllocator(stream, scratch_allocator);
     }
@@ -233,23 +233,23 @@ port::Status CUDAFftPlan::Initialize(
           output_distance, CUDAFftType(type), batch_count);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to create cuFFT batched plan:" << ret;
-        return port::Status{port::error::INTERNAL,
-                            "Failed to create cuFFT batched plan."};
+        return port::Status(port::error::INTERNAL,
+                            "Failed to create cuFFT batched plan.");
       }
     } else {
       auto ret = wrap::cufftCreate(parent, &plan_);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to create cuFFT batched plan:" << ret;
-        return port::Status{port::error::INTERNAL,
-                            "Failed to create cuFFT batched plan."};
+        return port::Status(port::error::INTERNAL,
+                            "Failed to create cuFFT batched plan.");
       }
       ret = wrap::cufftSetAutoAllocation(parent, plan_, 0);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to set auto allocation for cuFFT batched plan:"
                    << ret;
-        return port::Status{
+        return port::Status(
             port::error::INTERNAL,
-            "Failed to set auto allocation for cuFFT batched plan."};
+            "Failed to set auto allocation for cuFFT batched plan.");
       }
       ret = wrap::cufftMakePlanMany(
           parent, plan_, rank, elem_count_,
@@ -259,8 +259,8 @@ port::Status CUDAFftPlan::Initialize(
           &scratch_size_bytes_);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to make cuFFT batched plan:" << ret;
-        return port::Status{port::error::INTERNAL,
-                            "Failed to make cuFFT batched plan."};
+        return port::Status(port::error::INTERNAL,
+                            "Failed to make cuFFT batched plan.");
       }
       return UpdateScratchAllocator(stream, scratch_allocator);
     }
@@ -293,8 +293,8 @@ port::Status CUDAFftPlan::UpdateScratchAllocator(
   cufftResult_t ret = wrap::cufftSetWorkArea(parent_, plan_, scratch_.opaque());
   if (ret != CUFFT_SUCCESS) {
     LOG(ERROR) << "failed to set work area for cuFFT plan:" << ret;
-    return port::Status{port::error::INTERNAL,
-                        "Failed to set work area for cuFFT plan."};
+    return port::Status(port::error::INTERNAL,
+                        "Failed to set work area for cuFFT plan.");
   }
   return port::Status::OK();
 }
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 7c87d33d21b58af477a541953b9030e581cfa46d..f2be68bc421c1fbc31ea5a054b91130c11949635 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -609,10 +609,10 @@ port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
                                     AsCUDAEvent(event)->cuda_event())) {
     return port::Status::OK();
   } else {
-    return port::Status{
+    return port::Status(
         port::error::INTERNAL,
         port::Printf("error recording waiting for CUDA event on stream %p",
-                     stream)};
+                     stream));
   }
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index f686685474b35acfb54c327401500c42109006d0..773cbfb8a17a416d18ae599bf4f72e1550538dee 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -74,6 +74,14 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 
   void Deallocate(DeviceMemoryBase *mem) override;
 
+  void *UnifiedMemoryAllocate(uint64 size) override {
+    return CUDADriver::UnifiedMemoryAllocate(context_, size);
+  }
+
+  void UnifiedMemoryDeallocate(void *location) override {
+    return CUDADriver::UnifiedMemoryDeallocate(context_, location);
+  }
+
   // CUDA allocation/registration functions are necessary because the driver
   // internally sets up buffers for DMA operations (and page locks them).
   // There's no external interface for us to otherwise control these DMA
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index 649224a20e959a27896ec1c1ee61333a3a3c1ec1..622a4a4edb1fe4163831e9429c1a7ab9262f2727 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -124,9 +124,9 @@ port::StatusOr<StreamExecutor*> CudaPlatform::FirstExecutorForBus(
     }
   }
 
-  return port::Status{
+  return port::Status(
       port::error::NOT_FOUND,
-      port::Printf("Executor for bus %d not found.", bus_ordinal)};
+      port::Printf("Executor for bus %d not found.", bus_ordinal));
 }
 
 Platform::Id CudaPlatform::id() const { return kCudaPlatformId; }
@@ -172,11 +172,11 @@ CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
       this, MakeUnique<CUDAExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
-    return port::Status{
+    return port::Status(
         port::error::INTERNAL,
         port::Printf(
             "failed initializing StreamExecutor for CUDA device ordinal %d: %s",
-            config.ordinal, init_status.ToString().c_str())};
+            config.ordinal, init_status.ToString().c_str()));
   }
 
   return std::move(executor);
@@ -206,7 +206,6 @@ static void InitializeCudaPlatform() {
 REGISTER_MODULE_INITIALIZER(cuda_platform,
                             stream_executor::InitializeCudaPlatform());
 
-DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 // Note that module initialization sequencing is not supported in the
 // open-source project, so this will be a no-op there.
 REGISTER_MODULE_INITIALIZER_SEQUENCE(cuda_platform, multi_platform_manager);
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index e289e7ced57b164412b0fd78b538c3b08c7db5aa..88c4f15792737aac8dfafefba4c7fce74c434320 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -114,7 +114,7 @@ CUDARng::~CUDARng() {
 }
 
 bool CUDARng::Init() {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
   CHECK(rng_ == nullptr);
 
   curandStatus_t ret =
@@ -150,7 +150,7 @@ constexpr bool ComplexIsConsecutiveFloats() {
 template <typename T>
 bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
                                             DeviceMemory<T> *v) {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
   static_assert(ComplexIsConsecutiveFloats(),
                 "std::complex values are not stored as consecutive values");
 
@@ -209,7 +209,7 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
                                              ElemT stddev,
                                              DeviceMemory<ElemT> *v,
                                              FuncT func) {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
 
   if (!SetStream(stream)) {
     return false;
@@ -241,7 +241,7 @@ bool CUDARng::DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
 }
 
 bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
   CHECK(rng_ != nullptr);
 
   if (!CheckSeed(seed, seed_bytes)) {
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 18606eb7179485623fb4f5a129857235841870c5..38abc66079264dd46634fca1b0d9297844a31aa1 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -882,8 +882,8 @@ enum class ElementwiseOperation { kAdd, kMultiply };
 
 string ElementwiseOperationString(ElementwiseOperation op);
 
-// A simple class representing the version of the backing library, to 
-// workaround the "too perfect forwarding" issue in gcc6+ compilers. 
+// A simple class representing the version of the backing library, to
+// workaround the "too perfect forwarding" issue in gcc6+ compilers.
 // See PR#16309 and issue #18402 for links discussing the issue.
 class VersionInfo {
  public:
@@ -1051,10 +1051,8 @@ class DnnSupport {
   //    convolution result.
   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
   //    space in order to speed up the convolution operation.
-  //  algorithm: specifies which algorithm should be used for the
-  //    operation. If algorithm.is_default(), the system will pick an algorithm
-  //    by default. The coding of the algorithm is be interpretted by the
-  //    underlying implementation.
+  //  algorithm_config: specifies which algorithm should be used for the
+  //    operation.
   //  output_profile_result: the output profile result for this call. The
   //    profiling is only enabled when this is not nullptr.
   //
@@ -1153,17 +1151,13 @@ class DnnSupport {
   //    convolution input.
   //  filter_descriptor: dimensions of the convolution filter.
   //  convolution_descriptor: stride of the convolution filter.
-  //  input. This can be DeviceMemory pointing to NULL only when activation_mode
-  //  is kNone.
   //  output_descriptor: dimensions of the output layer.
   //  output_data: un-owned device memory region in which to place the
   //    convolution result.
   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
   //    space in order to speed up the convolution operation.
-  //  algorithm: an integer to specify which algorithm should be used for the
-  //    operation. kDefaultAlgorithm means the system will pick an algorithm
-  //    by default. The coding of the algorithm is be interpreted by the
-  //    underlying implementation.
+  //  algorithm_config: specifies which algorithm should be used for the
+  //    operation.
   //  output_profile_result: the output profile result for this call. The
   //    profiling is only enabled when this is not nullptr.
   //
@@ -1220,6 +1214,7 @@ class DnnSupport {
       ProfileResult* output_profile_result) = 0;
 
   // Return a list of algorithms supported by the forward convolution pass.
+  // cc_major and cc_minor are the compute capabilities of the device.
   virtual bool GetConvolveAlgorithms(
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
@@ -2036,8 +2031,8 @@ class DnnSupport {
                       const dnn::AlgorithmConfig& algorithm_config,
                       float dropout, uint64 seed,
                       ScratchAllocator* state_allocator) {
-    return port::Status{port::error::UNIMPLEMENTED,
-                        "createRnnDescriptor is unimplemented"};
+    return port::Status(port::error::UNIMPLEMENTED,
+                        "createRnnDescriptor is unimplemented");
   }
 
   // Create a RNN sequence descriptor that specifies either the input or output
@@ -2051,8 +2046,8 @@ class DnnSupport {
   virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
   createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
                                     int data_size, dnn::DataType data_type) {
-    return port::Status{port::error::UNIMPLEMENTED,
-                        "createRnnSequenceTensorDescriptor is unimplemented"};
+    return port::Status(port::error::UNIMPLEMENTED,
+                        "createRnnSequenceTensorDescriptor is unimplemented");
   }
 
   // Create an RNN state descriptor that specifies the input or hidden state.
@@ -2060,8 +2055,8 @@ class DnnSupport {
   virtual port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
   createRnnStateTensorDescriptor(int num_layer, int batch_size, int data_size,
                                  dnn::DataType data_type) {
-    return port::Status{port::error::UNIMPLEMENTED,
-                        "createRnnStateTensorDescriptor is unimplemented"};
+    return port::Status(port::error::UNIMPLEMENTED,
+                        "createRnnStateTensorDescriptor is unimplemented");
   }
 
   // Enqueue a forward operation of the RNN model onto the stream.
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index 0c3991c151d5bbb84333240aa1ca4f9e1587330d..e82f57569f35eb286ecc81caec30a77f148bd675 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -106,19 +106,19 @@ class HostExecutor : public internal::StreamExecutorInterface {
   bool HostCallback(Stream *stream, std::function<void()> callback) override;
 
   port::Status AllocateEvent(Event *event) override {
-    return port::Status{port::error::UNIMPLEMENTED, ""};
+    return port::Status(port::error::UNIMPLEMENTED, "");
   }
 
   port::Status DeallocateEvent(Event *event) override {
-    return port::Status{port::error::UNIMPLEMENTED, ""};
+    return port::Status(port::error::UNIMPLEMENTED, "");
   }
 
   port::Status RecordEvent(Stream *stream, Event *event) override {
-    return port::Status{port::error::UNIMPLEMENTED, ""};
+    return port::Status(port::error::UNIMPLEMENTED, "");
   }
 
   port::Status WaitForEvent(Stream *stream, Event *event) override {
-    return port::Status{port::error::UNIMPLEMENTED, ""};
+    return port::Status(port::error::UNIMPLEMENTED, "");
   }
 
   Event::Status PollForEventStatus(Event *event) override {
@@ -167,7 +167,7 @@ class HostExecutor : public internal::StreamExecutorInterface {
         "Shared memory configuration is unsupported for host "
         "executors."};
     LOG(INFO) << error_msg;
-    return port::Status{port::error::UNIMPLEMENTED, error_msg};
+    return port::Status(port::error::UNIMPLEMENTED, error_msg);
   }
 
   bool SupportsBlas() const override;
diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc
index a652b08b4fc7e075c9f560baa5c08bcef8dd0d3d..410dc9da899cc967b36c1738a6b7c128a98cf70c 100644
--- a/tensorflow/stream_executor/host/host_platform.cc
+++ b/tensorflow/stream_executor/host/host_platform.cc
@@ -70,11 +70,11 @@ HostPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
       this, MakeUnique<HostExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
-    return port::Status{
+    return port::Status(
         port::error::INTERNAL,
         port::Printf(
             "failed initializing StreamExecutor for device ordinal %d: %s",
-            config.ordinal, init_status.ToString().c_str())};
+            config.ordinal, init_status.ToString().c_str()));
   }
 
   return std::move(executor);
@@ -100,7 +100,6 @@ static void InitializeHostPlatform() {
 REGISTER_MODULE_INITIALIZER(host_platform,
                             stream_executor::host::InitializeHostPlatform());
 
-DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 // Note that module initialization sequencing is not supported in the
 // open-source project, so this will be a no-op there.
 REGISTER_MODULE_INITIALIZER_SEQUENCE(host_platform, multi_platform_manager);
diff --git a/tensorflow/stream_executor/host_or_device_scalar.h b/tensorflow/stream_executor/host_or_device_scalar.h
index c9e3e14778384adc81c93b8156ce71bb83ec9909..1f5d4b9260ce211743150f66261fcbf2bdf3bab8 100644
--- a/tensorflow/stream_executor/host_or_device_scalar.h
+++ b/tensorflow/stream_executor/host_or_device_scalar.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_
 
-#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/platform/logging.h"
 
 namespace stream_executor {
 
diff --git a/tensorflow/stream_executor/kernel_spec.cc b/tensorflow/stream_executor/kernel_spec.cc
index f0a5785b72f53a0f516e835c88d113876e4e54b7..902892af3f011827fb7cf71a523e5a085d6a661f 100644
--- a/tensorflow/stream_executor/kernel_spec.cc
+++ b/tensorflow/stream_executor/kernel_spec.cc
@@ -93,7 +93,7 @@ const char *CudaPtxInMemory::default_text() const {
     return nullptr;
   }
 
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
 
   auto ptx = ptx_by_compute_capability_.begin()->second;
   // Check if there is an entry in decompressed ptx table.
@@ -127,7 +127,7 @@ const char *CudaPtxInMemory::text(int compute_capability_major,
     return nullptr;
   }
 
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
 
   // Check if there is an entry in decompressed ptx table.
   auto decompressed_ptx_iter = decompressed_ptx_.find(ptx_iter->second);
diff --git a/tensorflow/stream_executor/multi_platform_manager.h b/tensorflow/stream_executor/multi_platform_manager.h
index 7e316879ca0cf9c2a97ee37c556e0f0d9b83e5fa..146a128e85cfe84a844aae0fd50d5a329df2723c 100644
--- a/tensorflow/stream_executor/multi_platform_manager.h
+++ b/tensorflow/stream_executor/multi_platform_manager.h
@@ -68,6 +68,7 @@ limitations under the License.
 #include <map>
 #include <memory>
 
+#include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/platform.h"
@@ -182,4 +183,9 @@ class MultiPlatformManager {
 
 }  // namespace stream_executor
 
+// multi_platform_manager.cc will define this instance. Includers of this header
+// should use
+// REGISTER_MODULE_INITIALIZER_SEQUENCE(my_platform, multi_platform_manager);
+DECLARE_MODULE_INITIALIZER(multi_platform_manager);
+
 #endif  // TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
diff --git a/tensorflow/stream_executor/plugin_registry.cc b/tensorflow/stream_executor/plugin_registry.cc
index 7812703efd8b4f143c195225c2c12502378bf5f2..c53685c57b0103856ba8ef3efbaa7b6747405815 100644
--- a/tensorflow/stream_executor/plugin_registry.cc
+++ b/tensorflow/stream_executor/plugin_registry.cc
@@ -72,11 +72,11 @@ port::Status PluginRegistry::RegisterFactoryInternal(
   mutex_lock lock{GetPluginRegistryMutex()};
 
   if (factories->find(plugin_id) != factories->end()) {
-    return port::Status{
+    return port::Status(
         port::error::ALREADY_EXISTS,
         port::Printf("Attempting to register factory for plugin %s when "
                      "one has already been registered",
-                     plugin_name.c_str())};
+                     plugin_name.c_str()));
   }
 
   (*factories)[plugin_id] = factory;
@@ -92,9 +92,9 @@ port::StatusOr<FACTORY_TYPE> PluginRegistry::GetFactoryInternal(
   if (iter == factories.end()) {
     iter = generic_factories.find(plugin_id);
     if (iter == generic_factories.end()) {
-      return port::Status{
+      return port::Status(
           port::error::NOT_FOUND,
-          port::Printf("Plugin ID %p not registered.", plugin_id)};
+          port::Printf("Plugin ID %p not registered.", plugin_id));
     }
   }
 
@@ -212,10 +212,11 @@ bool PluginRegistry::HasFactory(Platform::Id platform_id,
       plugin_id = default_factories_[platform_id].FACTORY_VAR;                \
                                                                               \
       if (plugin_id == kNullPlugin) {                                         \
-        return port::Status{port::error::FAILED_PRECONDITION,                 \
-                            "No suitable " PLUGIN_STRING                      \
-                            " plugin registered. Have you linked in a "       \
-                            PLUGIN_STRING "-providing plugin?"};              \
+        return port::Status(                                                  \
+            port::error::FAILED_PRECONDITION,                                 \
+            "No suitable " PLUGIN_STRING                                      \
+            " plugin registered. Have you linked in a " PLUGIN_STRING         \
+            "-providing plugin?");                                            \
       } else {                                                                \
         VLOG(2) << "Selecting default " PLUGIN_STRING " plugin, "             \
                 << plugin_names_[plugin_id];                                  \
@@ -231,9 +232,9 @@ bool PluginRegistry::HasFactory(Platform::Id platform_id,
       PlatformKind platform_kind, PluginId plugin_id) {                       \
     auto iter = platform_id_by_kind_.find(platform_kind);                     \
     if (iter == platform_id_by_kind_.end()) {                                 \
-      return port::Status{port::error::FAILED_PRECONDITION,                   \
+      return port::Status(port::error::FAILED_PRECONDITION,                   \
                           port::Printf("Platform kind %d not registered.",    \
-                                       static_cast<int>(platform_kind))};     \
+                                       static_cast<int>(platform_kind)));     \
     }                                                                         \
     return GetFactory<PluginRegistry::FACTORY_TYPE>(iter->second, plugin_id); \
   }
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 093f0c9306590a1e8faf0ab97621941b6fc2d759..4a98cfe16460ff860b6b73fedc21e98b5a3ed9fd 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -276,7 +276,7 @@ Stream::~Stream() {
 Stream &Stream::Init() {
   VLOG_CALL();
 
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
   CHECK_EQ(false, allocated_)
       << "stream appears to already have been initialized";
   CHECK(!ok_) << "stream should be in !ok() state pre-initialization";
@@ -1899,7 +1899,7 @@ Stream &Stream::ThenCopyDevice2HostBuffer(
 }
 
 Stream *Stream::GetOrCreateSubStream() {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
   for (auto &stream : sub_streams_) {
     if (stream.second) {
       stream.second = false;
@@ -1916,7 +1916,7 @@ Stream *Stream::GetOrCreateSubStream() {
 }
 
 void Stream::ReturnSubStream(Stream *sub_stream) {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
   for (auto &stream : sub_streams_) {
     if (stream.first.get() == sub_stream) {
       stream.second = true;
@@ -4480,6 +4480,40 @@ Stream &Stream::ThenBlasTrsm(blas::Side side, blas::UpperLower uplo,
               n, alpha, a, lda, b, ldb);
 }
 
+Stream &Stream::ThenBlasGemmBatched(
+    blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+    uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb, float beta,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &c, int ldc,
+    int batch_count) {
+  return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda,
+                                        b, ldb, beta, c, ldc, batch_count,
+                                        /*scratch_allocator=*/nullptr);
+}
+
+Stream &Stream::ThenBlasGemmBatchedWithScratch(
+    blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+    uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb, float beta,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &c, int ldc,
+    int batch_count, ScratchAllocator *scratch_allocator) {
+  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
+            PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
+            PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count));
+
+  ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, float,
+               const port::ArraySlice<DeviceMemory<Eigen::half> *> &, int,
+               const port::ArraySlice<DeviceMemory<Eigen::half> *> &, int,
+               float, const port::ArraySlice<DeviceMemory<Eigen::half> *> &,
+               int, int, ScratchAllocator *>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n,
+              k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count,
+              scratch_allocator);
+}
+
 Stream &Stream::ThenBlasGemmBatched(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
     uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a,
@@ -5196,7 +5230,7 @@ port::Status Stream::BlockHostUntilDone() {
   port::Status first_error;
   {
     // Wait until all active sub-streams have done their tasks.
-    mutex_lock lock{mu_};
+    mutex_lock lock(mu_);
     for (auto &stream : sub_streams_) {
       if (!stream.second) {
         first_error.Update(stream.first->BlockHostUntilDone());
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 3d1b011c570a62243d45d38d3d24b371e4382c33..3da1b856d6a41fa0c8d5a77feac33932da392422 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -66,9 +66,6 @@ namespace dnn {
 class BatchDescriptor;
 class FilterDescriptor;
 class ConvolutionDescriptor;
-class BatchDescriptor;
-class FilterDescriptor;
-class ConvolutionDescriptor;
 class ProfileResult;
 class AlgorithmDesc;
 }  // namespace dnn
@@ -1474,6 +1471,13 @@ class Stream {
       blas::ProfileResult *output_profile_result);
 
   // See BlasSupport::DoBlasGemmBatched.
+  Stream &ThenBlasGemmBatched(
+      blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+      uint64 k, float alpha,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb,
+      float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c,
+      int ldc, int batch_count);
   Stream &ThenBlasGemmBatched(blas::Transpose transa, blas::Transpose transb,
                               uint64 m, uint64 n, uint64 k, float alpha,
                               const port::ArraySlice<DeviceMemory<float> *> &a,
@@ -1506,6 +1510,13 @@ class Stream {
       std::complex<double> beta,
       const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
       int batch_count);
+  Stream &ThenBlasGemmBatchedWithScratch(
+      blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+      uint64 k, float alpha,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+      const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb,
+      float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c,
+      int ldc, int batch_count, ScratchAllocator *scratch_allocator);
   Stream &ThenBlasGemmBatchedWithScratch(
       blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
       uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a,
@@ -2005,7 +2016,7 @@ class Stream {
   friend class ocl::CLBlas;    // for parent_.
 
   bool InErrorState() const LOCKS_EXCLUDED(mu_) {
-    tf_shared_lock lock{mu_};
+    tf_shared_lock lock(mu_);
     return !ok_;
   }
 
@@ -2015,7 +2026,7 @@ class Stream {
     if (operation_retcode) {
       return;
     }
-    mutex_lock lock{mu_};
+    mutex_lock lock(mu_);
     ok_ = false;
   }
 
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 2584c92f0c5a1129e2f10aa7148161a8d2d40c50..9c989b971dcee6dd99aa155cd2230ba849d204fe 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -174,6 +174,15 @@ class StreamExecutorInterface {
   virtual void *AllocateSubBuffer(DeviceMemoryBase *parent, uint64 offset,
                                   uint64 size) = 0;
   virtual void Deallocate(DeviceMemoryBase *mem) = 0;
+  // Allocates unified memory space of the given size, if supported.
+  // See
+  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd
+  // for more details on unified memory.
+  virtual void *UnifiedMemoryAllocate(uint64 size) { return nullptr; }
+
+  // Deallocates unified memory space previously allocated with
+  // UnifiedMemoryAllocate.
+  virtual void UnifiedMemoryDeallocate(void *mem) {}
   virtual void *HostMemoryAllocate(uint64 size) = 0;
   virtual void HostMemoryDeallocate(void *mem) = 0;
   virtual bool HostMemoryRegister(void *mem, uint64 size) = 0;
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 20579790ef4832cedec5bf1e80a2245ec2646be6..b222a4d82a3e87a52c44427627e7aaacd0ed5c0d 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -232,7 +232,7 @@ void StreamExecutor::Deallocate(DeviceMemoryBase *mem) {
 }
 
 void StreamExecutor::GetMemAllocs(std::map<void *, AllocRecord> *records_out) {
-  tf_shared_lock lock{mu_};
+  tf_shared_lock lock(mu_);
   *records_out = mem_allocs_;
 }
 
@@ -256,13 +256,13 @@ port::Status StreamExecutor::SetDeviceSharedMemoryConfig(
     string error_msg = port::Printf(
         "Invalid shared memory config specified: %d", static_cast<int>(config));
     LOG(ERROR) << error_msg;
-    return port::Status{port::error::INVALID_ARGUMENT, error_msg};
+    return port::Status(port::error::INVALID_ARGUMENT, error_msg);
   }
   return implementation_->SetDeviceSharedMemoryConfig(config);
 }
 
 const DeviceDescription &StreamExecutor::GetDeviceDescription() const {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
   if (device_description_ != nullptr) {
     return *device_description_;
   }
@@ -393,7 +393,7 @@ StreamExecutor::createRnnStateTensorDescriptor(int num_layer, int batch_size,
 }
 
 dnn::DnnSupport *StreamExecutor::AsDnn() {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
   if (dnn_ != nullptr) {
     return dnn_.get();
   }
@@ -403,7 +403,7 @@ dnn::DnnSupport *StreamExecutor::AsDnn() {
 }
 
 blas::BlasSupport *StreamExecutor::AsBlas() {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
   if (blas_ != nullptr) {
     return blas_.get();
   }
@@ -413,7 +413,7 @@ blas::BlasSupport *StreamExecutor::AsBlas() {
 }
 
 fft::FftSupport *StreamExecutor::AsFft() {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
   if (fft_ != nullptr) {
     return fft_.get();
   }
@@ -423,7 +423,7 @@ fft::FftSupport *StreamExecutor::AsFft() {
 }
 
 rng::RngSupport *StreamExecutor::AsRng() {
-  mutex_lock lock{mu_};
+  mutex_lock lock(mu_);
   if (rng_ != nullptr) {
     return rng_.get();
   }
@@ -464,6 +464,20 @@ bool StreamExecutor::GetSymbol(const string &symbol_name, void **mem,
   return implementation_->GetSymbol(symbol_name, mem, bytes);
 }
 
+void *StreamExecutor::UnifiedMemoryAllocate(uint64 bytes) {
+  void *buffer = implementation_->UnifiedMemoryAllocate(bytes);
+  VLOG(1) << "Called StreamExecutor::UnifiedMemoryAllocate(size=" << bytes
+          << ") returns " << buffer << StackTraceIfVLOG10();
+  return buffer;
+}
+
+void StreamExecutor::UnifiedMemoryDeallocate(void *location) {
+  VLOG(1) << "Called StreamExecutor::UnifiedMemoryDeallocate(location="
+          << location << ")" << StackTraceIfVLOG10();
+
+  return implementation_->UnifiedMemoryDeallocate(location);
+}
+
 void *StreamExecutor::HostMemoryAllocate(uint64 size) {
   void *buffer = implementation_->HostMemoryAllocate(size);
   VLOG(1) << "Called StreamExecutor::HostMemoryAllocate(size=" << size
@@ -582,12 +596,12 @@ port::Status StreamExecutor::SynchronousMemcpyD2H(
 
   result = implementation_->SynchronousMemcpy(host_dst, device_src, size);
   if (!result.ok()) {
-    result = port::Status{port::error::INTERNAL,
+    result = port::Status(port::error::INTERNAL,
                           port::Printf("failed to synchronously memcpy "
                                        "device-to-host: device %p to host %p "
                                        "size %lld: %s",
                                        device_src.opaque(), host_dst, size,
-                                       result.ToString().c_str())};
+                                       result.ToString().c_str()));
   }
 
   return result;
@@ -605,12 +619,12 @@ port::Status StreamExecutor::SynchronousMemcpyH2D(
 
   result = implementation_->SynchronousMemcpy(device_dst, host_src, size);
   if (!result.ok()) {
-    result = port::Status{
+    result = port::Status(
         port::error::INTERNAL,
         port::Printf("failed to synchronously memcpy host-to-device: host "
                      "%p to device %p size %lld: %s",
                      host_src, device_dst->opaque(), size,
-                     result.ToString().c_str())};
+                     result.ToString().c_str()));
   }
 
   return result;
@@ -723,7 +737,7 @@ void StreamExecutor::EnqueueOnBackgroundThread(std::function<void()> task) {
 
 void StreamExecutor::CreateAllocRecord(void *opaque, uint64 bytes) {
   if (FLAGS_check_device_leaks && opaque != nullptr && bytes != 0) {
-    mutex_lock lock{mu_};
+    mutex_lock lock(mu_);
     mem_allocs_[opaque] = AllocRecord{
         bytes, ""};
   }
@@ -731,7 +745,7 @@ void StreamExecutor::CreateAllocRecord(void *opaque, uint64 bytes) {
 
 void StreamExecutor::EraseAllocRecord(void *opaque) {
   if (FLAGS_check_device_leaks && opaque != nullptr) {
-    mutex_lock lock{mu_};
+    mutex_lock lock(mu_);
     if (mem_allocs_.find(opaque) == mem_allocs_.end()) {
       LOG(ERROR) << "Deallocating unknown pointer: "
                  << port::Printf("0x%p", opaque);
@@ -745,7 +759,7 @@ void StreamExecutor::EnableTracing(bool enabled) { tracing_enabled_ = enabled; }
 
 void StreamExecutor::RegisterTraceListener(TraceListener *listener) {
   {
-    mutex_lock lock{mu_};
+    mutex_lock lock(mu_);
     if (listeners_.find(listener) != listeners_.end()) {
       LOG(INFO) << "Attempt to register already-registered listener, "
                 << listener;
@@ -759,7 +773,7 @@ void StreamExecutor::RegisterTraceListener(TraceListener *listener) {
 
 bool StreamExecutor::UnregisterTraceListener(TraceListener *listener) {
   {
-    mutex_lock lock{mu_};
+    mutex_lock lock(mu_);
     if (listeners_.find(listener) == listeners_.end()) {
       LOG(INFO) << "Attempt to unregister unknown listener, " << listener;
       return false;
@@ -776,7 +790,7 @@ void StreamExecutor::SubmitTrace(TraceCallT trace_call, ArgsT &&... args) {
   if (tracing_enabled_) {
     {
       // instance tracers held in a block to limit the lock lifetime.
-      tf_shared_lock lock{mu_};
+      tf_shared_lock lock(mu_);
       for (TraceListener *listener : listeners_) {
         (listener->*trace_call)(std::forward<ArgsT>(args)...);
       }
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index ab6b00f6601b5f0fae02255facf5fb182c4c6d64..ad80a1ba259ce0c6e2785373cc986b8bf34f6460 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -177,6 +177,9 @@ class StreamExecutor {
   //
   // Resets the internal contents of mem to be null-representative, but this
   // null-out effect should not be relied upon in client code.
+  //
+  // TODO(jlebar): Change this to accept a DeviceMemoryBase by value, see
+  // discussion in cl/195744342.
   void Deallocate(DeviceMemoryBase *mem);
 
   // Retrieves a mapping of active opaque device memory pointer to a string
@@ -187,6 +190,16 @@ class StreamExecutor {
   // activated.
   void GetMemAllocs(std::map<void *, AllocRecord> *records_out);
 
+  // Allocates unified memory space of the given size, if supported.
+  // See
+  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd
+  // for more details on unified memory.
+  void *UnifiedMemoryAllocate(uint64 bytes);
+
+  // Deallocates unified memory space previously allocated with
+  // UnifiedMemoryAllocate.
+  void UnifiedMemoryDeallocate(void *location);
+
   // Allocates a region of host memory and registers it with the platform API.
   // Memory allocated in this manner (or allocated and registered with
   // HostMemoryRegister() is required for use in asynchronous memcpy operations,
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 14944ec25b4ee2fe40ad8dfc142bd44fbba7dd56..d71fd71bbd83add63d11bcd62ae7ecaa2a8be8d1 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -959,15 +959,6 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
   if not cuda_deps:
     cuda_deps = []
 
-  if 'linkstatic' not in kwargs or kwargs['linkstatic'] != 1:
-    enable_text_relocation_linkopt = select({
-          clean_dep("//tensorflow:darwin"): [],
-          clean_dep("//tensorflow:windows"): [],
-          "//conditions:default": ['-Wl,-z,notext'],})
-    if 'linkopts' in kwargs:
-      kwargs['linkopts'] += enable_text_relocation_linkopt
-    else:
-      kwargs['linkopts'] = enable_text_relocation_linkopt
   native.cc_library(
       deps=deps + if_cuda(cuda_deps + [
           clean_dep("//tensorflow/core:cuda"),
@@ -1309,7 +1300,7 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
     native.cc_library(
         name=basename + "_gpu",
         srcs=gpu_srcs,
-        copts=_cuda_copts(),
+        copts=_cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
         deps=deps + if_cuda(cuda_deps))
     cuda_deps.extend([":" + basename + "_gpu"])
 
@@ -1445,13 +1436,13 @@ def tf_py_wrap_cc(name,
   extra_linkopts = select({
       "@local_config_cuda//cuda:darwin": [
           "-Wl,-exported_symbols_list",
-          "%s.lds"%vscriptname,
+          "$(location %s.lds)"%vscriptname,
       ],
       clean_dep("//tensorflow:windows"): [],
       clean_dep("//tensorflow:windows_msvc"): [],
       "//conditions:default": [
           "-Wl,--version-script",
-          "%s.lds"%vscriptname,
+          "$(location %s.lds)"%vscriptname,
       ]
   })
   extra_deps += select({
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index a1c569951e99162c8048b7b760c25df7b2f29420..f46bb4b5fcc5d6eface8617ba5261abf29e34b02 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -101,6 +101,7 @@ genrule(
         "api/profiler/__init__.py",
         "api/python_io/__init__.py",
         "api/resource_loader/__init__.py",
+        "api/strings/__init__.py",
         "api/saved_model/__init__.py",
         "api/saved_model/builder/__init__.py",
         "api/saved_model/constants/__init__.py",
@@ -111,6 +112,7 @@ genrule(
         "api/saved_model/tag_constants/__init__.py",
         "api/saved_model/utils/__init__.py",
         "api/sets/__init__.py",
+        "api/sparse/__init__.py",
         "api/spectral/__init__.py",
         "api/summary/__init__.py",
         "api/sysconfig/__init__.py",
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 788f6d3573addada0aae49543600eff2e2d7b904..18182090dabab1f0552001e1388e4f74e3514f1a 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -20,23 +20,28 @@ from __future__ import print_function
 
 import argparse
 import collections
+import importlib
 import os
 import sys
 
-from tensorflow import python  # pylint: disable=unused-import
 from tensorflow.python.util import tf_decorator
 
 
 _API_CONSTANTS_ATTR = '_tf_api_constants'
 _API_NAMES_ATTR = '_tf_api_names'
 _API_DIR = '/api/'
+_DEFAULT_PACKAGE = 'tensorflow.python'
 _OUTPUT_MODULE = 'tensorflow.tools.api.generator.api'
 _GENERATED_FILE_HEADER = """\"\"\"Imports for Python API.
 
 This file is MACHINE GENERATED! Do not edit.
 Generated by: tensorflow/tools/api/generator/create_python_api.py script.
 \"\"\"
+
+from __future__ import print_function
+
 """
+_GENERATED_FILE_FOOTER = "\n\ndel print_function\n"
 
 
 class SymbolExposedTwiceError(Exception):
@@ -137,18 +142,26 @@ class _ModuleInitCodeBuilder(object):
     # since we import from it using * import.
     underscore_names_str = ', '.join(
         '\'%s\'' % name for name in self._underscore_names_in_root)
-    module_text_map[''] += '''
+    # We will always generate a root __init__.py file to let us handle *
+    # imports consistently. Be sure to have a root __init__.py file listed in
+    # the script outputs.
+    module_text_map[''] = module_text_map.get('', '') + '''
 _names_with_underscore = [%s]
 __all__ = [s for s in dir() if not s.startswith('_')]
 __all__.extend([s for s in _names_with_underscore])
+__all__.remove('print_function')
 ''' % underscore_names_str
 
     return module_text_map
 
 
-def get_api_init_text():
+def get_api_init_text(package):
   """Get a map from destination module to __init__.py code for that module.
 
+  Args:
+    package: Base python package containing python with target tf_export
+      decorators.
+
   Returns:
     A dictionary where
       key: (string) destination module (for e.g. tf or tf.consts).
@@ -162,7 +175,7 @@ def get_api_init_text():
   for module in list(sys.modules.values()):
     # Only look at tensorflow modules.
     if (not module or not hasattr(module, '__name__') or
-        'tensorflow.' not in module.__name__):
+        package not in module.__name__):
       continue
     # Do not generate __init__.py files for contrib modules for now.
     if '.contrib.' in module.__name__ or module.__name__.endswith('.contrib'):
@@ -215,12 +228,14 @@ def get_api_init_text():
   return module_code_builder.build()
 
 
-def create_api_files(output_files):
+def create_api_files(output_files, package):
   """Creates __init__.py files for the Python API.
 
   Args:
     output_files: List of __init__.py file paths to create.
       Each file must be under api/ directory.
+    package: Base python package containing python with target tf_export
+      decorators.
 
   Raises:
     ValueError: if an output file is not under api/ directory,
@@ -248,7 +263,7 @@ def create_api_files(output_files):
       os.makedirs(os.path.dirname(file_path))
     open(file_path, 'a').close()
 
-  module_text_map = get_api_init_text()
+  module_text_map = get_api_init_text(package)
 
   # Add imports to output files.
   missing_output_files = []
@@ -260,7 +275,7 @@ def create_api_files(output_files):
       missing_output_files.append(module_file_path)
       continue
     with open(module_name_to_file_path[module], 'w') as fp:
-      fp.write(_GENERATED_FILE_HEADER + text)
+      fp.write(_GENERATED_FILE_HEADER + text + _GENERATED_FILE_FOOTER)
 
   if missing_output_files:
     raise ValueError(
@@ -270,10 +285,7 @@ def create_api_files(output_files):
         ',\n'.join(sorted(missing_output_files)))
 
 
-def main(output_files):
-  create_api_files(output_files)
-
-if __name__ == '__main__':
+def main():
   parser = argparse.ArgumentParser()
   parser.add_argument(
       'outputs', metavar='O', type=str, nargs='+',
@@ -281,7 +293,12 @@ if __name__ == '__main__':
       'semicolon-separated list of Python files that we expect this script to '
       'output. If multiple files are passed in, then we assume output files '
       'are listed directly as arguments.')
+  parser.add_argument(
+      '--package', default=_DEFAULT_PACKAGE, type=str,
+      help='Base package that imports modules containing the target tf_export '
+           'decorators.')
   args = parser.parse_args()
+
   if len(args.outputs) == 1:
     # If we only get a single argument, then it must be a file containing
     # list of outputs.
@@ -289,4 +306,11 @@ if __name__ == '__main__':
       outputs = [line.strip() for line in output_list_file.read().split(';')]
   else:
     outputs = args.outputs
-  main(outputs)
+
+  # Populate `sys.modules` with modules containing tf_export().
+  importlib.import_module(args.package)
+  create_api_files(outputs, args.package)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/tensorflow/tools/api/generator/create_python_api_test.py b/tensorflow/tools/api/generator/create_python_api_test.py
index 218c8120453c8dca6e81146eb06e8243a3cd424d..986340cf6d4a1bb18841d781dcd11c0208279ec8 100644
--- a/tensorflow/tools/api/generator/create_python_api_test.py
+++ b/tensorflow/tools/api/generator/create_python_api_test.py
@@ -37,7 +37,7 @@ class TestClass(object):
 
 
 _TEST_CONSTANT = 5
-_MODULE_NAME = 'test.tensorflow.test_module'
+_MODULE_NAME = 'tensorflow.python.test_module'
 
 
 class CreatePythonApiTest(test.TestCase):
@@ -56,28 +56,35 @@ class CreatePythonApiTest(test.TestCase):
     del sys.modules[_MODULE_NAME]
 
   def testFunctionImportIsAdded(self):
-    imports = create_python_api.get_api_init_text()
+    imports = create_python_api.get_api_init_text(
+        package=create_python_api._DEFAULT_PACKAGE)
     expected_import = (
-        'from test.tensorflow.test_module import test_op as test_op1')
+        'from tensorflow.python.test_module '
+        'import test_op as test_op1')
     self.assertTrue(
         expected_import in str(imports),
         msg='%s not in %s' % (expected_import, str(imports)))
 
-    expected_import = 'from test.tensorflow.test_module import test_op'
+    expected_import = ('from tensorflow.python.test_module '
+                       'import test_op')
     self.assertTrue(
         expected_import in str(imports),
         msg='%s not in %s' % (expected_import, str(imports)))
 
   def testClassImportIsAdded(self):
-    imports = create_python_api.get_api_init_text()
-    expected_import = 'from test.tensorflow.test_module import TestClass'
+    imports = create_python_api.get_api_init_text(
+        package=create_python_api._DEFAULT_PACKAGE)
+    expected_import = ('from tensorflow.python.test_module '
+                       'import TestClass')
     self.assertTrue(
         'TestClass' in str(imports),
         msg='%s not in %s' % (expected_import, str(imports)))
 
   def testConstantIsAdded(self):
-    imports = create_python_api.get_api_init_text()
-    expected = 'from test.tensorflow.test_module import _TEST_CONSTANT'
+    imports = create_python_api.get_api_init_text(
+        package=create_python_api._DEFAULT_PACKAGE)
+    expected = ('from tensorflow.python.test_module '
+                'import _TEST_CONSTANT')
     self.assertTrue(expected in str(imports),
                     msg='%s not in %s' % (expected, str(imports)))
 
diff --git a/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt b/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt
index 7405202b892bba67a36d86cd43fb7a67ab3be947..cbf655498c02a6521ef45f722f30acd7c13de9cc 100644
--- a/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt
@@ -10,6 +10,14 @@ tf_class {
     name: "gradient"
     argspec: "args=[\'self\', \'target\', \'sources\', \'output_gradients\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "stop_recording"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "watch"
     argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt
index 5a02bb2175e2d6ad71722799143090f2735c1a37..8c8912dfabb9b5ee7ce15725064f1bdf2fd35bfd 100644
--- a/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.Variable"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "SaveSliceInfo"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
index cbbd077c97ba7f304e13f7883254f5a12668255e..8e7e945ed1bc26669d7c7f0ed3c2002df9f1883b 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
@@ -44,7 +44,7 @@ tf_class {
   }
   member_method {
     name: "from_generator"
-    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "from_sparse_tensor_slices"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
index 9a56ae8675c4fdae7424859fb5105574d83da494..5cfb2fd2f0c6a7b733e70445aa130e96c512205e 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -45,7 +45,7 @@ tf_class {
   }
   member_method {
     name: "from_generator"
-    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "from_sparse_tensor_slices"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
index e5ec824bb8959f77f99ce3527f168bacf8de200a..3327e5b274b43c0b424933cb086c894d47ad25cb 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -45,7 +45,7 @@ tf_class {
   }
   member_method {
     name: "from_generator"
-    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "from_sparse_tensor_slices"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
index 008239789c788692a36445dc2c51e26113b6a0c5..9d59375282b39564456b4c8aa49435c3836c58ea 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
@@ -45,7 +45,7 @@ tf_class {
   }
   member_method {
     name: "from_generator"
-    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "from_sparse_tensor_slices"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
index be9ba4ce85bd5b9905a39e3f45873c534594e15f..cf22e39d4c8ab915ea9507960bf28ebc09e4e5aa 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
   }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
index 91fca67b6b5b1187b61f398a152793362c0c6e30..a363bceae3b57d879b4b8e5a8205a21c92e8835a 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
   }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-best-exporter.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-best-exporter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9694268199a29c51f37bc73a2f92715c78854a2f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-best-exporter.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.estimator.BestExporter"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.exporter.BestExporter\'>"
+  is_instance: "<class \'tensorflow.python.estimator.exporter.Exporter\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'serving_input_receiver_fn\', \'event_file_pattern\', \'compare_fn\', \'assets_extra\', \'as_text\', \'exports_to_keep\'], varargs=None, keywords=None, defaults=[\'best_exporter\', \'None\', \'eval/*.tfevents.*\', \'<function _loss_smaller instance>\', \'None\', \'False\', \'5\'], "
+  }
+  member_method {
+    name: "export"
+    argspec: "args=[\'self\', \'estimator\', \'export_path\', \'checkpoint_path\', \'eval_result\', \'is_the_final_export\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index 53a903c239b5fccb5e70d0a93d5af08245f4dc46..099838fa65f6a532a594c08e8a44ead8ce008185 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
   }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index ba17c90de28899683ee5bd992cb5516b01856280..87bd19a23a3db727b5c1f13de04e3c11fd91de9b 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
   }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
index cd4f72fcf839fa89f25c7ed115ee6c61294283c3..111914f643a3b192d496c5b0857b4429da12b1d6 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
index 303fd74a64d0c7f5a0292a4eaabec63455c29381..67e4ee02d0581207e7dd316196aeb782930e7602 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
index c97ea7969eff3e6952a604e72ce140b49d304461..e1289b975e721e94f4a63889f3e0b76b0db23d81 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
index 4b5b5bf0e3599a81e2e853ae8ba34ef12cc63097..d030b2f51f019ecc179a09b76c4484e60ada9dd0 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
index 42a0d595216ad28363727b9d7c066fc37fddd02c..d72b5769778d2ee8e5da34c531878a6d53ef44f5 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
@@ -22,6 +22,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'model_fn\', \'model_dir\', \'config\', \'params\', \'warm_start_from\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
index 2de52d6c57cc70b562c3c10b7f23cd15b63e25f8..cb578759eee2ed43465195a8c4e8760443a60b71 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
index e552f33720bb939b8a98d34ef3de78bda7db976c..fcd01bb663c7af22791c3855e6da22d93c667f84 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
index 4946f2c51a62af85d61b8e38e982c59dd0d61e36..f1d204a3ef96f35e31f642bcb0a61351b263d273 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "BaselineRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BestExporter"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "BoostedTreesClassifier"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index 3fc64dae888012169af3ea7695154b73f24d90c8..acc3fc4c5bbb767997b9844c7268b717184e4ea8 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -110,7 +110,7 @@ tf_module {
   }
   member_method {
     name: "non_max_suppression"
-    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'None\'], "
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'0.0\', \'None\'], "
   }
   member_method {
     name: "pad_to_bounding_box"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index cee76bdc1db69d2f61566ad3cc4d52a226539e04..11cdd6f0b5e48f5835385fdd4e3e5144fb7d5166 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.Model"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -155,7 +155,7 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index 02718cb5f9e3ca0422647f20fbde355e21620c09..4afad3e4df308d412a1c18dea3b4e99aa1d2c84f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.Sequential"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.sequential.Sequential\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.sequential.Sequential\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -160,7 +160,7 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
index ba2d083a755384d4ec2076ac0dea580a1a878f1d..c6149e8aa7e3650e628e37b0e00a54348012475b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
@@ -450,7 +450,7 @@ tf_module {
   }
   member_method {
     name: "softmax"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "softplus"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-base-logger.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-base-logger.pbtxt
index 454823fd23e72c6aa6bf6aa608707fa3b893b986..9eee9b378964a9947b067b7ec495ef6556ab6d0c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-base-logger.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-base-logger.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.callbacks.BaseLogger"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.BaseLogger\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.BaseLogger\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
index 86b264c79f63ff78133f0989b5086984a3b16dbd..5bb949c5bb650acee91b14a4d6bf95b36029edf7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.callbacks.CSVLogger"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.CSVLogger\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.CSVLogger\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-callback.pbtxt
index 1474b392ff38c0c224725867006721096b951567..a5340d52c1af6d69da30fd710bcee9d832917574 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-callback.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-callback.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.callbacks.Callback"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-early-stopping.pbtxt
index 27d4a208a4108b107ed6a0ffbab733cb1e3d8f46..7b0ad85eaac5b83835a9e1c4b152e38e7051a2f6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-early-stopping.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-early-stopping.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.callbacks.EarlyStopping"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.EarlyStopping\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.EarlyStopping\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-history.pbtxt
index a7b2deea8286df935db3a85e9569c3097b0b39ce..ee400b31c43829efba156298d5ee807cdafc8a98 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-history.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-history.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.callbacks.History"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.History\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.History\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-lambda-callback.pbtxt
index 5ee22948ad52ed082a8790a2127bbe4afc182049..df8d7b0ef7afca17338a26388c38827b5b306f95 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-lambda-callback.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-lambda-callback.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.callbacks.LambdaCallback"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.LambdaCallback\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.LambdaCallback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index d4c85a4519eb922629f107ef7b61c3f11cb27163..ce1a9b694d8708720e0eb677afd25607c6262e9c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.callbacks.LearningRateScheduler"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.LearningRateScheduler\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.LearningRateScheduler\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
index 79f9c88bbcaba136c544be1cb4b620b4ae55e17a..48bb24a05274addca03f11acef99607f78b92e51 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.callbacks.ModelCheckpoint"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.ModelCheckpoint\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.ModelCheckpoint\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-progbar-logger.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-progbar-logger.pbtxt
index 543de0ad48b86502fc83374e5e6d82822485f331..d8bb8b2a7d0f491c7ec2b30096a1acaf04681a56 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-progbar-logger.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-progbar-logger.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.callbacks.ProgbarLogger"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.ProgbarLogger\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.ProgbarLogger\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
index 5838d583125e462e1961beabfd80130a794f468d..dc27af9552a88650261b4f0694ea0265e6bda05c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.callbacks.ReduceLROnPlateau"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.ReduceLROnPlateau\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.ReduceLROnPlateau\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'monitor\', \'factor\', \'patience\', \'verbose\', \'mode\', \'epsilon\', \'cooldown\', \'min_lr\'], varargs=None, keywords=None, defaults=[\'val_loss\', \'0.1\', \'10\', \'0\', \'auto\', \'0.0001\', \'0\', \'0\'], "
+    argspec: "args=[\'self\', \'monitor\', \'factor\', \'patience\', \'verbose\', \'mode\', \'min_delta\', \'cooldown\', \'min_lr\'], varargs=None, keywords=kwargs, defaults=[\'val_loss\', \'0.1\', \'10\', \'0\', \'auto\', \'0.0001\', \'0\', \'0\'], "
   }
   member_method {
     name: "in_cooldown"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-remote-monitor.pbtxt
index 3d0acfed1d8f5a2c811b442784992400ed958537..5a3b791c0adc0d61129d38b2995ee9077cf0988b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-remote-monitor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-remote-monitor.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.callbacks.RemoteMonitor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.RemoteMonitor\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.RemoteMonitor\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'root\', \'path\', \'field\', \'headers\'], varargs=None, keywords=None, defaults=[\'http://localhost:9000\', \'/publish/epoch/end/\', \'data\', \'None\'], "
+    argspec: "args=[\'self\', \'root\', \'path\', \'field\', \'headers\', \'send_as_json\'], varargs=None, keywords=None, defaults=[\'http://localhost:9000\', \'/publish/epoch/end/\', \'data\', \'None\', \'False\'], "
   }
   member_method {
     name: "on_batch_begin"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt
index 7de4008c4541b9054543927cad167293c5a4cf5c..2f52464315d8c1b526792c92f5cf8e83ce3ce087 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.callbacks.TensorBoard"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.TensorBoard\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.TensorBoard\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
index bf17e8736c50031c484f5c08bac65ee3566f7da3..5c2d336353aee7fc98b45620adac4f4bcda05ea0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.callbacks.TerminateOnNaN"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.TerminateOnNaN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.callbacks.Callback\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.TerminateOnNaN\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-constraint.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.constraints.-constraint.pbtxt
index 14977c696fbe70a9d19f37581c926b6c0fdb3d11..8e07b7d98e1d832628f65bed19eddca76bfbd51a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-constraint.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.constraints.-constraint.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.constraints.Constraint"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.Constraint\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.Constraint\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-max-norm.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.constraints.-max-norm.pbtxt
index a2269f8a18f5b55ffa88031e8ef3d1c39e0bd423..2b81174b6cd4d57d8d6e20da7f6961442045d908 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-max-norm.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.constraints.-max-norm.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.constraints.MaxNorm"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.MaxNorm\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.Constraint\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.MaxNorm\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.Constraint\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-min-max-norm.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.constraints.-min-max-norm.pbtxt
index afe0d6478dde929aa98556d52ceece03c28c8e5f..a41eda86ac2583b1adfe745f713ac8f8647f7a31 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-min-max-norm.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.constraints.-min-max-norm.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.constraints.MinMaxNorm"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.MinMaxNorm\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.Constraint\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.MinMaxNorm\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.Constraint\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-non-neg.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.constraints.-non-neg.pbtxt
index e8c4bb90881ae389cd5215c21e44380b62cb7c9c..572e3eea4d985999f513a066b348d088ab01fe54 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-non-neg.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.constraints.-non-neg.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.constraints.NonNeg"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.NonNeg\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.Constraint\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.NonNeg\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.Constraint\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-unit-norm.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.constraints.-unit-norm.pbtxt
index d457cb6419ef86e83b5440554b2e97706440a734..fe16c38cc83fb9979ecf0d08ab2cba7a2c38f9b6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-unit-norm.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.constraints.-unit-norm.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.constraints.UnitNorm"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.UnitNorm\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.Constraint\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.UnitNorm\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.Constraint\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.max_norm.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.constraints.max_norm.pbtxt
index 48128096d4638388c99cc62ecc23322a8d368124..6650bae07a0d32448e748598af3426f85ca8e199 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.constraints.max_norm.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.constraints.max_norm.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.constraints.max_norm"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.MaxNorm\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.Constraint\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.MaxNorm\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.Constraint\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.min_max_norm.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.constraints.min_max_norm.pbtxt
index 02eb3fb00c0ae516bac336066fc8ae5818e455d8..9dd3bc92fc4fadee863f30b300ddb60fe0b3d340 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.constraints.min_max_norm.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.constraints.min_max_norm.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.constraints.min_max_norm"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.MinMaxNorm\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.Constraint\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.MinMaxNorm\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.Constraint\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.non_neg.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.constraints.non_neg.pbtxt
index cc1101097ce9c4888e4b239f8ae16a58cabf31db..a565840939f99080b784e4e95302071600a1fa7c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.constraints.non_neg.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.constraints.non_neg.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.constraints.non_neg"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.NonNeg\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.Constraint\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.NonNeg\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.Constraint\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.unit_norm.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.constraints.unit_norm.pbtxt
index 086f9f2d43c3d340850f02df3e5bcb0cc5a5b8e5..5cbe0da4c1d1ff97fe836f76402cfca92e1cc511 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.constraints.unit_norm.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.constraints.unit_norm.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.constraints.unit_norm"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.UnitNorm\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.constraints.Constraint\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.UnitNorm\'>"
+  is_instance: "<class \'tensorflow.python.keras.constraints.Constraint\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
index 96272d1b7d61430188bbbf2680bd2beb9f1e9675..2bf973debb175d27bb80e627d7ccbb41b567020d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Activation"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Activation\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Activation\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
index 8fd55c8686de77ec764e9d564c78d0df4f545915..03f20e72c2a325cec000cf4a5cfc0f1bbf255c8f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.ActivityRegularization\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.ActivityRegularization\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
index 47d1532c3c8cf248f6e9a9a35e10b9559286263f..4b46b8d15afb0a2f636962b762e1808312c2f7c3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Add"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Add\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge.Add\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 797d422a90a5ad21d0014a0003b11d281c25e579..d8a1c76fd07634ef413152020a397897f2d5b97c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.AlphaDropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.noise.AlphaDropout\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 269be1455b6bf3bbe325f3928584960578e3793d..622926bc4b8b2430ee1ab936665acb5744155e0d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.AveragePooling1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 344813621534dbb5de3719088c06313e55519dd7..82100d8e09c8e95730993527293d2b72ce69f1d4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.AveragePooling2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 979008d0edb7d0f9d9c1246e1fcc7d2e2871d28c..408061077cdeab2f8fd08c7e972744e5ee383f52 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.AveragePooling3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
index 0ffdffd4cdee14fbfeb68f2575300632ab21d7a6..a3c80311043eeb95b06855f662a5e3d344803ba3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Average"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Average\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge.Average\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 6b00f110eea2aaa551bffe8ec225042c5469210e..e2dfaca29f86bd9d91d524ec337afad81e7f2da3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.AvgPool1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index caff5a2f1db61c6958980446d3bd54009776e1a4..4f068d2066a450bab77becc85a33662b78ad03e2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.AvgPool2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 4a7239492177aae2ee098fc3033904d3d1a31ba4..b8c261a74364e9bb6bf8f6c7463993fbff5e9552 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.AvgPool3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
index 9804394fa53d6b3c1ff136a73212863143bbdb39..4ccd6cace650e2efd1583c75f6639c8598bb8f20 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
index 5e5b04c7c695c6d88e7f42b77290a582be087763..2790e5fd850c24bd3e94cd15a6e079e1c9f79868 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Bidirectional"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Bidirectional\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Bidirectional\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -119,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\', \'initial_state\', \'constants\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
index b8eb4079b9eea3b054f9c2ad4298f6a1669de79e..b1326bd0e6054b2a3fd36e7ad42cd3d4a0cad8dc 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Concatenate"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Concatenate\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge.Concatenate\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 3fdb101425d0f010f77cd70bc3721af269a36b0a..e3ac3dbf28da731e14640d5f464547d62391a28f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.ConvLSTM2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvLSTM2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional_recurrent.ConvLSTM2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
index 0be42471e35eeba224376b24aa846db69e011274..1117a695a395f495d988464bbf59d4b8e01877e6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Conv1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 39ba31a70942811cbd36caf33c9bda90a5449703..b9de1421428dcf61b988df343a22996cfb8fecef 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.Conv2DTranspose"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
index 26d9d8c476f4e429ffe112cd490ad277b478c65b..deb535e06e06008a17b80c8e13d8f01ad1535059 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Conv2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 43611017fa37077c7ff05e690dfd50a0e6e5ae1e..9a9a223fbad11cafd8620110d80b27d5382dd29c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.Conv3DTranspose"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
index fa4925ab99d719104c5ca1a0003c25d85f78f3df..1c59b0bdf624b09a7454f2d51698951a790f393a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Conv3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
index c5c5d5e7c083dca63972cb7033e675aa483d8039..30cf5489f4fcd4af3d0bd957fc9c576c57ee2bbd 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Convolution1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 36dc2d2e9a70fe7a1d32352a14dabbade2a2efc2..0ec69508d5a1992b46d1a7c65255cfb5408ab439 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.Convolution2DTranspose"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
index 23ec74370bab010d2b5ba257502a32c8ca7e4a57..4cd8928403c98abad85bc1349a29148c73003c9d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Convolution2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 0e4089c5785ccdcafbb8b3dc1ca75ffbe49d9434..4b4912496deac2a79a5b0ea3d1ca0f8fa625301a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.Convolution3DTranspose"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
index 23ddbe1a925e33432727d13dc875972136083056..d0ad9cf56702e585e31a79de0f93d9efd48ed484 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Convolution3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
index e04ab6bea85baa1252c9c43f891f9ed5a9dedb87..98cff95a7fe9d4e58cf883502df08c58c651cd76 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
index 655314afffd1e4fb0987c913eb69edc4254c77ef..2357498b46376ef13de102944b69931a9e7d3584 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
index d5215f1330a2fd0db4696a7b743d931e164a5a9f..3324cbff304c5106360f3f3d3d608a528fa5fc31 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index 8ce4db85f8e42f7c85116c8b798a9cfc17e47242..6c81823654b78a936cded4a1d5a6f54e02dc7fc9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.CuDNNGRU"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.cudnn_recurrent.CuDNNGRU\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent.CuDNNGRU\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 98221c11650eaf0527b4d32a104e5b29fd34e820..487e04fd0790cb39ef6aee8d0498b3aae6726084 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.CuDNNLSTM"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.cudnn_recurrent.CuDNNLSTM\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent.CuDNNLSTM\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
index 310a3c3b918684803c86d8e2ea1731604a041cdb..137e7cced4e8113dd6a54a837e08cfd5af35c94d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Dense"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dense\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 2d67b5f720209e3648e69659d44c3d8c4e639231..7161665d2550c1cc3aff1c28f9d7676276b62303 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.DepthwiseConv2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.DepthwiseConv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.DepthwiseConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
index 0e493a7f2bffc772e9cc9cd5afdc4c092fb92118..24affa248121bcb1e1a947417a95ad4f5ba55ab2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Dot"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Dot\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge.Dot\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
index 14726b4b6cecc39da98af4220211a1c0351b2ff1..7ba19a42695da37b4ad43cdde2c0d4978fd0a1eb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Dropout"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
index 32a50455ed8a2a146a81fde0d884ebc867b8d0ed..503aa9162c3a78e9bb42ce16af98451441adbbb7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ELU"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.ELU\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ELU\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
index 2f615d81124a7ef5e1bd7181a10abfb1b7a8df24..1737e590a29c5777b5eca2b4cb23081aa8ece738 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Embedding"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.embeddings.Embedding\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.embeddings.Embedding\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
index 82dc878a8c7f7f011df4dd3fa0445217fa250a98..021d024dc2150a75532ea7597d85f36efd2a3cf2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Flatten"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Flatten\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -82,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index d79d02b95433a7399f27ff1354cba315f8a2c3ae..65387008bf3f78e404d8d8bbd7bb8cd3789bf256 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.GRUCell"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRUCell\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRUCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index 1d38ae64bb86d11ecd352371608a11f6736bb0ac..4f791acf0585c95d6c0f1d5ea48e607f9a05188d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GRU"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRU\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRU\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 135de9cd95141a93d1a45b80ada66e339c484c89..abc30e54e0630a2d7b4de6074445e155e0ac2782 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.GaussianDropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianDropout\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 5db6e433ee02b8050822f76bb762329055c11aa4..20791bb448d17788ea4aebe4900169a70a9703d6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.GaussianNoise\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianNoise\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index bf0dba0a925b4fb3a88173f4a6dfcc565c6edd91..449a91d8735c59f563360307cdb35c5a30344d82 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 6da98036094228603f12f65f66d958e2e4b9daeb..bb361e129728ddd42c21144937efbc617d98ba30 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 345593dec812a240251c0c07da759e131fefaad9..e564bf3216104a902fb6cfbe65b1e2b6dafc2524 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 5d3be9085e51520c48a8d33b1dde6480a0039bad..4cb9cc3ec84d679b78465e43caa5a257466d5676 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAvgPool1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 0b79a87e0507ecc795f14a63684dd5b5d7dafc1e..5ed52b88ae3e2dd25b560206db404952034a04cd 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAvgPool2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 68cdbac652f74b72d1cb769fbefbee750025767d..f4559d29d75ef7cd8fcbdeac0a1a2c9e633246bc 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalAvgPool3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index d5872b444fa9ae617e6ad55bc39f43ec4be7d92f..64e2d061e26997365c461113d3ea15140fef64dd 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPool1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 4b0cf9a5d38f868d9fdd16a042daea6d0f56fdf8..3372ad645388beb54f7ed9e3715449facba07f87 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPool2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 4c1adb2131f204121cf74c9a77d346902632fef2..08a6860bcd7d9a260e44af87c51796a9cc2af379 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPool3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 815f1cf580562e62db99862e51ba02e2b2051b57..22c9eab64fde41e1199ecbb1b8b03939653ecd00 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index e027dd6cc282b75a6a6c2a78878fa57d6706b547..74c405ba9b1b465f89c4fef43020181a1a7f3d31 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index c647b24a23258b96a3c1780fd2f3e499658cfe7d..39f6f981931296eb6d31eb6580f93b479ff64ce6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
index 75d70734b4144627f02a8f619991356e38889389..7b25e80b6b7653c5e76bf176b54110b1aabaf5ea 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.InputLayer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.input_layer.InputLayer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.input_layer.InputLayer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
index 29edabe0483a21d7db35eec04d6ae7a855a82da1..5fd0a47a68c0d4ad218c4c64cc6be8f603d9673a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 0ed383a3554f81c3db490cf5d242546a14b64d15..3619b8bfc44373ba6b8e306b020ac63d4b498573 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTMCell\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 6d14c9c8f69286552271ca6dab5271a5af48593f..8ef3d71dd82efc79e333770d4a7a7c8aee1a4202 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.LSTM"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTM\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTM\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
index ddf96aba34bf574f4b9046ed932d8f136864f157..ecbaa9ce2c76bf3d2964a6c79c96c4d67cc3b80e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Lambda"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Lambda\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Lambda\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
index aca282d62427f5c8186fa3ac86daebd6fba09ce5..9b90db1e5e56d1e5749669bba8dba1cdbd45bb55 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -105,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index b9c53b43c87bd8c1047663001159d7286360a008..3c60eaab7f1df15331004685676d74943d5d538f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.LeakyReLU\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.LeakyReLU\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 2ee566d03b4631ebcff6bcb0b93ec274849c5b67..3dac1ff342ac1b7f984e9af5a6028ef71da701df 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.local.LocallyConnected1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index db0d0e816a6d863f13c7eb085edf269d71e2f252..7f1b5db4d34f706f2107ef43ab9c5acf67dac9f6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.local.LocallyConnected2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
index 82008b89d038e26155b8ae952c2943557ed8c35e..b3e31000f3bca0821377d70b1d88a20aa8f8e4ef 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Masking"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Masking\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Masking\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 31a34a17d04129c8dfe8ec6f98b9fdbc110e13f4..bbd9d1b0dc075bb9241f240b423933db20b38b75 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.MaxPool1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 70d24ac75c4f850374fa8cffb881652f97e97d22..fe72beea802d12b996948b00436b274ee7e83177 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.MaxPool2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 55b16564b30919fb48d97862e2e8cfc0fdda8de5..e9bf57b2b0e60376a28c0abfc16fba393df3e73c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.MaxPool3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index a230b74c383eff93443d07e48f6818352302d9e7..0eecc58a2b6a2846a2c92502cc23bd328f8b5193 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.MaxPooling1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index d98f7c39f546048b1483617b24f004d5c9759d14..96785a7d8559611a19b7f36216dbf0f8a3e39e61 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.MaxPooling2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index b2e96a4203758270afb8c225a05a481dbc329a84..42c46cccb37b1ab7ece7760e6858b2180ea833b9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.MaxPooling3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
index 0c45bbdf171ad9831e51d2b1ba952fab9eb3d0e5..ac816f68d492cbfc5503c057a869e3e981de9190 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Maximum"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Maximum\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge.Maximum\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
index 6423d83418aa40b57afb3d5ff22f4ec605183587..9ae99563e9a1b3b0700116ed88c13f94fafe1658 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.Multiply"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Multiply\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge.Multiply\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 6e17081375b7fbdcb000dfcbc0cd48ab072fe6e5..815f3bc2d142069adb4e418a4dc6ef82d683373f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.PReLU"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.PReLU\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.PReLU\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
index d01d371da596256873c3799b99c45db01c674ff9..e704992b4a18f6bdbd9474af2ee59ea81534d80a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Permute"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Permute\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Permute\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
index d3f5508640e268df484af5adaf66621fa3d92d5b..b3a58fa11eda61baa5c932bcc04fdca7459a215f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.RNN"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
index 44e1007f5420bbb8feda891901c138cb776c071e..78f464583b4e8083f4cdd1a8c6b9f377645cd562 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.RepeatVector\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.RepeatVector\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
index 8fc3ec33310f531a6ca6948dc20db67543cde69a..222344fd0497afe9a32d1d05ec37aa160479d88a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Reshape"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Reshape\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Reshape\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 457d27749504167f2865bc272251bd89b5d3297e..55fddf576cac6afabe984cd51e2ddbf112a55d25 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.SeparableConv1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index 54eda8ee2121d4fadd73a33010202850d743cc65..96314ce49849a50ccc6b968b50c98ddae74c6c70 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.SeparableConv2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -84,7 +84,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 711196554698b79c64e1c67bc875ce33c70563c6..88bdf9956603c590940e3ef857765586df7e91d7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.SeparableConvolution1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 815e34a48de542feb078d0002dbebbbf4d199e63..6eeea7a8d1312ada423206378b4c6ee079ffdd73 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.layers.SeparableConvolution2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -84,7 +84,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 6614760e5e72556ac61c3788065b3faf2d286800..3050d46249003716eb0778104b729ee9cb52b34f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index bfcfd71ecddfb618ceb53969e8782f535786009b..dda4c9358ba5faa084ad2e6cf75ff83b6a7b2b20 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.SimpleRNN"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
index 9c4618c4e91ae288284b35c4a0c6bbbfe604d91d..cc6275158b67e94c3c39802cc7c0f9e169c8b144 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Softmax"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.Softmax\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.Softmax\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 9a0a19d2d52f34e61c273bd4d4a27c46940dd5bf..5eb7e750477b17571ef861305806894dd2b9ac38 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.SpatialDropout1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 446f7122a6a2dc1d2f7377cef00d7b5b9a89cd3a..500cb8c14ead3eeff28d11b72e2300cc471756d2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.SpatialDropout2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 52a0485b5cea2be6f2f4d9b0ee31cb2388adcef7..1113a7634fa98b499175d90ae7da2d3fb9fb1a13 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.SpatialDropout3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index c82e7a192dfca8ca38832a53f9135125b4c34286..c4b9f93561de6a5d8ecc19bbae17831466b51fe6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.StackedRNNCells\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.StackedRNNCells\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index 9ccf251a18034371607cfdc6091f2282136feec9..282c98d79a6e1da46e4d7ea2e5c7228754792f09 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.ThresholdedReLU\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ThresholdedReLU\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
index e080a07799fe1b7ec5f73b4e7bf78053a2c9dd3c..acab93706b29fedc1bf7b48da2f5b6636dea48e5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.TimeDistributed"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.TimeDistributed\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.wrappers.TimeDistributed\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 5fadca0b8386951976e8a6330bacf0dcf169e2d1..a5ec228a074721775d4ec0369345b5439d84e186 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index 2d395bf7e87b8835bcf63d792d68dc3ac4083051..d8d8e0bfe95a6cf2ef61cdb344b963df3f21aabb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 18d58ec3b23f7d4fd13ff45f4c1d4d95e7722ed5..97d6dc06fb2e883b20540e4496efa5b39a538263 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
index 6223cb2f3c1230a60f3cf3dd57a0e803cf4f15d9..ea9bb41b9979de9049397892372f37aafc719a68 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.Wrapper"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index e71bba6a7f1df713cd13b4a0249d001d56bb31b0..e6d1d2e089b01c4eb212d01c456f6fa6b850f7de 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index aba6d8cb1f43bb070f9b17b5290afc5ce30246b5..f62017305f26519181b1ef86bdd0946d44d16b88 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index ce545ecc954e84e702fbd24047370a3417dc0fb1..07a1fde5bdc35535ca5d8443a97cb85adc54b14a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index dd78384005fce159c5337c06760d1ba37ecbf390..62aa929d32b57518abbe924c036062eb7ccd3acf 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.models.Model"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -155,7 +155,7 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index 9fcb03f47e7701106b275a9e7ee2adf1243bbdb3..93ecbbce9b17b9ca6157e65bbabd6c36008c3992 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.keras.models.Sequential"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.sequential.Sequential\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.sequential.Sequential\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -160,7 +160,7 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
index 32667cf31e4aaacf3374ca4a434f32eec5b3e07e..b9ce154bddef609e0aaf6627d6f59de551e51e3b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.Adadelta"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Adadelta\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Adadelta\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
index efca59e8e427d28de36446a49ea4e1ca0bb385eb..d0dc9e37a386a26143365eb443d5ba5fce8a87d9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.Adagrad"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Adagrad\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Adagrad\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
index 5546e2067ab65abce928d609b41b65bbc40246f6..06815fa99a4a474ec131c29d0cbc78bb2b9cb72d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.Adam"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Adam\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Adam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
index aaa54a106066266d0a7c19f4609e4cc7ed766d95..47b55fdb44e79e976b6de13d760a7cf175323c6c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.Adamax"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Adamax\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Adamax\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
index 1fada7fd9c6eefbb16f1b5a042e6fea607a461a9..8c63a7dda98568b24ea1b3cda15d4c840fbfd804 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.Nadam"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Nadam\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Nadam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-optimizer.pbtxt
index ca47e952282e0c1a9ee85d8912e479a0ed5b4e86..53d64dae932e250b9d81b2767a833de3bac8c403 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-optimizer.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.optimizers.Optimizer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index fd3f97f35dcb18c82188c51345c2e3276a88f23f..a1e9b8cceb95e8f25ac5f414fadacf237be33cd9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.RMSprop"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.RMSprop\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.RMSprop\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-s-g-d.pbtxt
index 25adfd3f0bc89d9dbd3b2b8068e7b4ff99170909..a67fefb1bafebd62db9f6108f0fe1847b5d2e0cb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.SGD"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.SGD\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.SGD\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
index ec0f3d892d9d03a738d34a40afe701e788908a8e..dddace87dca85cae378618fcf4d4e6d005ca9d4a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.preprocessing.image.DirectoryIterator"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.image.DirectoryIterator\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.image.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.Sequence\'>"
+  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.DirectoryIterator\'>"
+  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.Iterator\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
index f5bc04e44c198e5bc60f8361dd32e4ae00250468..c1e2e94f0bea933a630655eda205b6b6daf2eb93 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.preprocessing.image.ImageDataGenerator"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.image.ImageDataGenerator\'>"
+  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.ImageDataGenerator\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-iterator.pbtxt
index 69488d63bf118272d9b3f62027f10ff1c2dd0eff..825d9f1d1d6a828296458b831c65eecae391e0f6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-iterator.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.preprocessing.image.Iterator"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.image.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.Sequence\'>"
+  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.Iterator\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
index 42196ddeee7aab144537eef250c07060923fa6a9..75924a254a6a59232b1e9c9bd01ddb7445cda5d2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.keras.preprocessing.image.NumpyArrayIterator"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.image.NumpyArrayIterator\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.image.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.Sequence\'>"
+  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.NumpyArrayIterator\'>"
+  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.Iterator\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt
index d9c3215b555c19bc5cf4b32b0d227a9e1b63ce1e..326b1fa4fda1c0554efd8e6ba8dc93fdef0ede0f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.preprocessing.sequence.TimeseriesGenerator"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.sequence.TimeseriesGenerator\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.Sequence\'>"
+  is_instance: "<class \'tensorflow.python.keras.preprocessing.sequence.TimeseriesGenerator\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
index ce91caa1afe081ccf05ecdd4884a3e29ea93d496..b42b12b6c060f59c30590f7cc4892a09881d08d7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.preprocessing.text.Tokenizer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.text.Tokenizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.preprocessing.text.Tokenizer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-l1-l2.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-l1-l2.pbtxt
index 04dcda38609c7114bdf6e2784938905fc3ef8af3..a45fb7b55e58a5679427752af22dce49203dc1cc 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-l1-l2.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-l1-l2.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.regularizers.L1L2"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.regularizers.L1L2\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.regularizers.Regularizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.regularizers.L1L2\'>"
+  is_instance: "<class \'tensorflow.python.keras.regularizers.Regularizer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-regularizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-regularizer.pbtxt
index b0a125f238e58fb8b1213f52fc1fb85781ca5807..641001a646564d0a466739ee6d2bdd31a27beab7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-regularizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-regularizer.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.regularizers.Regularizer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.regularizers.Regularizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.regularizers.Regularizer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-custom-object-scope.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.utils.-custom-object-scope.pbtxt
index dda39ed221a06827601a9432f887ddc5f5ee9b01..109682046b990107915d65be3cad86ead3e22688 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.utils.-custom-object-scope.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.utils.-custom-object-scope.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.utils.CustomObjectScope"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.generic_utils.CustomObjectScope\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.generic_utils.CustomObjectScope\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt
index 1c5868e711beeeb072e41630f06ba7d9841defbb..939fd547d06bbd03b7e1a1db1404263ff01fd07c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.utils.GeneratorEnqueuer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.GeneratorEnqueuer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.SequenceEnqueuer\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.GeneratorEnqueuer\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.SequenceEnqueuer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
index ce62c8bafcaec1bf2e6ab3989da68588f7c848e9..6b832051a975b61ba05874c3dda558c63aeaa055 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.utils.HDF5Matrix"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.io_utils.HDF5Matrix\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.io_utils.HDF5Matrix\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dtype"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt
index 16e1cbe650e1662f8694fd7137ad20a48a90675b..be4496e753f8bdcd76a4761f9bd1804a77380359 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.utils.Progbar"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.generic_utils.Progbar\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.generic_utils.Progbar\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence-enqueuer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
index 5cf2a07b0b265ba88d7942698640520d53a2f407..a9e499d1009b5a7458080db6c10a948af21c7b6c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.utils.SequenceEnqueuer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.SequenceEnqueuer\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.SequenceEnqueuer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence.pbtxt
index 5b272253e3767941b10d42ef5fef9c09433e9f59..e2dc932dc86dbba49d186e1dbc4bc026a52f6ef5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.utils.Sequence"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.Sequence\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt
index 5a446c09d0130e173394b02a30f56a5c7ec9c34c..4d7a1519ce59b6f0a7f0bbfb3292842a6f21dffd 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt
@@ -46,7 +46,7 @@ tf_module {
   }
   member_method {
     name: "multi_gpu_model"
-    argspec: "args=[\'model\', \'gpus\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'model\', \'gpus\', \'cpu_merge\', \'cpu_relocation\'], varargs=None, keywords=None, defaults=[\'True\', \'False\'], "
   }
   member_method {
     name: "normalize"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt
index 8d200f99fd14d6a7735e1a74299159d6b198cd68..67cca3af41dbf68b963fb2315b65f9f843c9a42d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.wrappers.scikit_learn.KerasClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.wrappers.scikit_learn.KerasClassifier\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.wrappers.scikit_learn.BaseWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier\'>"
+  is_instance: "<class \'tensorflow.python.keras.wrappers.scikit_learn.BaseWrapper\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt
index 7a971346d86f4930c7bba872031e049a93445d1d..f4b9b7e277ecdb155327d83c57ec2a997c043555 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.wrappers.scikit_learn.KerasRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.wrappers.scikit_learn.KerasRegressor\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.wrappers.scikit_learn.BaseWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor\'>"
+  is_instance: "<class \'tensorflow.python.keras.wrappers.scikit_learn.BaseWrapper\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
index 38fd78a5a828c7d0da98c97fdc01f504397c6fe6..11067058d5852669e1672bf3eb8b7c680d0e5dc9 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.layers.AveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
index 86a524cc91e10616cd049cec93843e419ec670c7..3259e706d7f7ea4d0348c1ee586c50f5a2c82b39 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.layers.AveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
index 8a811fe4561ac3790e43a5553ca04fca002e420d..e561f2f415018840420232a97f0ece3f3c60d0d7 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.layers.AveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
index 3923e706be7a352d770bf309aecd1fadb2a05e81..3124a35c7852a97e79a3cfe575017484f2f5731f 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalization\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
index 7a0a8a2a51295d9009f44e2ea126e8a4694147af..b5ec61255ace78c1fa13370727eb5f5084522f4a 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.layers.Conv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
index 7ed3a652519a6429ff429925c29f3c296a6d2958..b2c89ae66f53299289508eef174b5c44a6be2606 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -1,12 +1,12 @@
 path: "tensorflow.layers.Conv2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
index 23831aa74f1c3dea99e6da407e5a63693f94e37a..9e4f4969dc6e1b6a39cf1d25c5e5e6175fa87c7c 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.layers.Conv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
index 9d41a6b09900d984706accd70a353cc26585d9b5..9850e6d7659d311c93dabad73d35f2fcd028dd52 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -1,12 +1,12 @@
 path: "tensorflow.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
index 865fe08e63c81222395d125938c8830c02030733..be113826cc2b9589e1f8bbde896fbcbe183d4d1b 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.layers.Conv3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
index ee164aae204d3f6c09af79f7fbac825ce470098d..0d951bf6336ac7b65be57535c1065e5f87a77a0b 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Dense\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dense\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
index 8167dc79cdf9f83b9b97557638bf0702a1be86da..f1beeed9ef0cb54318249e42b1279680ea117ba8 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
index efa4419692993aa9975c1af2e647288ae9f38eba..b75a012811ff10f055382ea1315eaba506c24ed8 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
@@ -1,10 +1,10 @@
 path: "tensorflow.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Flatten\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Flatten\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -92,7 +92,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
index 2ff89f0a6faef905bcafdcb36121f506e9a9977a..fd02c919aeb5a536bd052324618983af699e7c47 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
index b3a6dfdffa28d3628e09e6aab823534ba84edf16..80e0fb228b034727854ab1a4df97e25c6bc2cd97 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
index cef396489dde24698cd9a63b6247292958cfec4e..50ff484d733633e20e9923dbbf1344af7b51ba9a 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.layers.MaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
index 565f0c7a79661f77d5987d671266ff69268b03b2..cea809744cd07cc6ed0d1655f217cb5821e503e4 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.layers.MaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
index 595ce2eeadfc95fe44895ffd976024aee80ee948..ab9e89554c81decf5ee7e42dc963da9ab35e65c7 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.layers.MaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
index ccca96f72248e390ca65db061d836ee58c8e3205..4362568445e892d6127759c925d47426d49d9927 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -1,12 +1,12 @@
 path: "tensorflow.layers.SeparableConv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
index 1c99c9618254cadb1b4e95c7223ca9361e4fa861..3cad824cd3b197b91a749347c860ff926610c081 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -1,12 +1,12 @@
 path: "tensorflow.layers.SeparableConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index e1abd43ab54f9d0c131fd94cfeefcdd7ccdb00f1..a8d9e120cb4aa965c1d85df59de1fbabc196bf54 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -4,8 +4,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 93e7e40199884fa77ae7da6d15113b8b4ca125ab..c039890e1f4c1d57e7b795f1f09cff71921f6554 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -4,8 +4,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 3c3e38229738fec3b25f437a73f3a9d216d970af..62c393de34475a8806015bed187572f79cf2a196 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -3,8 +3,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DeviceWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index db16660f1145b55c824c698653094977dd6c718b..f121ba7939acb14681aa6b04b333668dded37aad 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -3,8 +3,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index 465fc1cd9c886acaeb8ac49ff54417b9fd101b04..4583dc32b2e98d4a9912378fe0e3d841882772fd 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -4,8 +4,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 38a387d55a4d34524cdc8af1f7ce1617eb120373..5016b6ac3010e2e184674db4837173c57c44b97e 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -4,8 +4,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index b9e3d934759accd885036fa4c5a7013ef64736f3..59623fc983a63c2966882aa5113423c0a9e23b72 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -3,8 +3,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.MultiRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index 75b5898c591cbe2b761c0f709159c5489cb8f76a..e2ab5aaee9456ffbe42894f2384d7bc9c7ad6a6f 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -2,8 +2,8 @@ path: "tensorflow.nn.rnn_cell.RNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index fee0dc63b997f328a4e3d44040c4056de4128eb7..bd2a6d61f8578a2a3c8d94d3a8d5eb49679df2f7 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -3,8 +3,8 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 0b12bc060efc2ecfa4819e77cbb0d26c5a6dc419..3051c4437e9a14bf0ef86adfa8c596b736a6172d 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -488,6 +488,10 @@ tf_module {
     name: "sets"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "sparse"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "spectral"
     mtype: "<type \'module\'>"
@@ -496,6 +500,10 @@ tf_module {
     name: "string"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "strings"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "summary"
     mtype: "<type \'module\'>"
@@ -1524,6 +1532,10 @@ tf_module {
     name: "pow"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "print"
+    argspec: "args=[\'input_\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "py_func"
     argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
@@ -1686,7 +1698,7 @@ tf_module {
   }
   member_method {
     name: "scan"
-    argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'infer_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'True\', \'None\'], "
+    argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'infer_shape\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'True\', \'False\', \'None\'], "
   }
   member_method {
     name: "scatter_add"
diff --git a/tensorflow/tools/api/golden/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/tensorflow.sparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bbfe395031aece42363ca7d6577fee856df6bde8
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.sparse.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.sparse"
+tf_module {
+  member_method {
+    name: "cross"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "cross_hashed"
+    argspec: "args=[\'inputs\', \'num_buckets\', \'hash_key\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3fbe95bbad4b8c1d803e1002b2cf9ef2812fed0
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.strings"
+tf_module {
+  member_method {
+    name: "regex_full_match"
+    argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
index 16bfbf20d5227d6308248bebcb62f32a2df8ef41..1f1d8b6f9e2cde4800cdef9c417191b1a0ce07b5 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdadeltaOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adadelta.AdadeltaOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
index 61cde9181c2367153b7b289b41bd932482bb92fd..a7c05d484905a0af26c80a52d92623ef4a3eb6c4 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdagradDAOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad_da.AdagradDAOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
index 0a998c1afe4fff6e215360bc1cf8fc135754223c..bc8b92389c6ed7dcb0fa23ff3abd86bb0d1c488a 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad.AdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
index cc5954152577796ee7a5a6e1cedc873647d64f7c..5d17be9378fd130b89e199544f85e03a23a71d3c 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdamOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adam.AdamOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
index 17f393d27c4b917a78902a4d6f13630311cb44ef..ddc553d7c984b24fe33c03bb90e00e7e81f55d26 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.train.Checkpoint"
 tf_class {
-  is_instance: "<class \'tensorflow.python.training.checkpointable_utils.Checkpoint\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.util.Checkpoint\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "save_counter"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
index 1add3a902122341a706c38b19ea6ff5882c26445..d265fdeb01c38d8a1347e630d7f7bff111999634 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.FtrlOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.ftrl.FtrlOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
index ef5bbd6ace29abb5c73516176fcc7594a58d493a..c673e29cd4dd6cd3c01582abfbc306c092818892 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.GradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.gradient_descent.GradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
index 3d6e87f5eb44de9d6ce1bdd25a54b8df9020cc03..8199f63b9b8c64c73a3d62294277838cdc240280 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.MomentumOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.momentum.MomentumOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
index e73861ff7cb2d90d8efac72cdd7de3b27395f29e..876bb35e391885e751066a415967af848280c714 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.train.Optimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
index 301b35b199c87890a0aef4139eb06253592ce0c4..14349a74efb61124fc7b5568d5ec023f08b1b62f 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.ProximalAdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_adagrad.ProximalAdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
index 8815befa936a85522011111a4a6270d22cbc25ae..7d982dc51f6edce1cf691671e31ddd07664f0dc1 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.ProximalGradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_gradient_descent.ProximalGradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
index e9819683ba5ec1bcacb3cdbcb2d787e866a77b6f..906384a2875bf7b05ac26fc43207f4ef9b5a7472 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.RMSPropOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.rmsprop.RMSPropOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
index 3db96aff876b88b80b647570cf68b1ebc0b2da3b..2c0fda3c72b7e1f02265827b9dc1929500935cd1 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.SyncReplicasOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.sync_replicas_optimizer.SyncReplicasOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index 15523028c726fefa13641a1369cf4274bcfb9973..eeb1fab40c4c7a2fff417b18111ecfe8ceabc71a 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -261,6 +261,10 @@ Status InitializeSession(int num_threads, const string& graph,
   graph_def->reset(new GraphDef());
   tensorflow::GraphDef tensorflow_graph;
   Status s = ReadBinaryProto(Env::Default(), graph, graph_def->get());
+  if (!s.ok()) {
+    s = ReadTextProto(Env::Default(), graph, graph_def->get());
+  }
+
   if (!s.ok()) {
     LOG(ERROR) << "Could not create TensorFlow Graph: " << s;
     return s;
diff --git a/tensorflow/tools/benchmark/benchmark_model_test.cc b/tensorflow/tools/benchmark/benchmark_model_test.cc
index 16ab2ff66e763a0ca5130f075f988bade9c8abd1..6813045d6320055b5d75fdf549b8c3c2e63e3022 100644
--- a/tensorflow/tools/benchmark/benchmark_model_test.cc
+++ b/tensorflow/tools/benchmark/benchmark_model_test.cc
@@ -26,30 +26,36 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-TEST(BenchmarkModelTest, InitializeAndRun) {
-  const string dir = testing::TmpDir();
-  const string filename_pb = io::JoinPath(dir, "graphdef.pb");
-
+void CreateTestGraph(const ::tensorflow::Scope& root,
+                     benchmark_model::InputLayerInfo* input,
+                     string* output_name, GraphDef* graph_def) {
   // Create a simple graph and write it to filename_pb.
   const int input_width = 400;
   const int input_height = 10;
-  benchmark_model::InputLayerInfo input;
-  input.shape = TensorShape({input_width, input_height});
-  input.data_type = DT_FLOAT;
+  input->shape = TensorShape({input_width, input_height});
+  input->data_type = DT_FLOAT;
   const TensorShape constant_shape({input_height, input_width});
 
   Tensor constant_tensor(DT_FLOAT, constant_shape);
   test::FillFn<float>(&constant_tensor, [](int) -> float { return 3.0; });
 
-  auto root = Scope::NewRootScope().ExitOnError();
   auto placeholder =
-      ops::Placeholder(root, DT_FLOAT, ops::Placeholder::Shape(input.shape));
-  input.name = placeholder.node()->name();
+      ops::Placeholder(root, DT_FLOAT, ops::Placeholder::Shape(input->shape));
+  input->name = placeholder.node()->name();
   auto m = ops::MatMul(root, placeholder, constant_tensor);
-  const string output_name = m.node()->name();
+  *output_name = m.node()->name();
+  TF_ASSERT_OK(root.ToGraphDef(graph_def));
+}
+
+TEST(BenchmarkModelTest, InitializeAndRun) {
+  const string dir = testing::TmpDir();
+  const string filename_pb = io::JoinPath(dir, "graphdef.pb");
+  auto root = Scope::NewRootScope().ExitOnError();
 
+  benchmark_model::InputLayerInfo input;
+  string output_name;
   GraphDef graph_def;
-  TF_ASSERT_OK(root.ToGraphDef(&graph_def));
+  CreateTestGraph(root, &input, &output_name, &graph_def);
   string graph_def_serialized;
   graph_def.SerializeToString(&graph_def_serialized);
   TF_ASSERT_OK(
@@ -69,5 +75,30 @@ TEST(BenchmarkModelTest, InitializeAndRun) {
   ASSERT_EQ(num_runs, 10);
 }
 
+TEST(BenchmarkModeTest, TextProto) {
+  const string dir = testing::TmpDir();
+  const string filename_txt = io::JoinPath(dir, "graphdef.pb.txt");
+  auto root = Scope::NewRootScope().ExitOnError();
+
+  benchmark_model::InputLayerInfo input;
+  string output_name;
+  GraphDef graph_def;
+  CreateTestGraph(root, &input, &output_name, &graph_def);
+  TF_ASSERT_OK(WriteTextProto(Env::Default(), filename_txt, graph_def));
+
+  std::unique_ptr<Session> session;
+  std::unique_ptr<GraphDef> loaded_graph_def;
+  TF_ASSERT_OK(benchmark_model::InitializeSession(1, filename_txt, &session,
+                                                  &loaded_graph_def));
+  std::unique_ptr<StatSummarizer> stats;
+  stats.reset(new tensorflow::StatSummarizer(*(loaded_graph_def.get())));
+  int64 time;
+  int64 num_runs = 0;
+  TF_ASSERT_OK(benchmark_model::TimeMultipleRuns(
+      0.0, 10, 0.0, {input}, {output_name}, {}, session.get(), stats.get(),
+      &time, &num_runs));
+  ASSERT_EQ(num_runs, 10);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 797e0a6db52aa6216486bdc9c6a88ff353c57e15..e621f85652588f7b5cba6dc5128f857f9eb0fe09 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -168,7 +168,6 @@ else
   BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:embedding_lookup_test"
   BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:embedding_lookup_sparse_test"
   BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:fully_connected_test"
-  # BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/testing:generated_examples_zip_test"
   BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:hashtable_lookup_test"
   BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:local_response_norm_test"
   BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:lsh_projection_test"
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 8e8b2191e5c8f3fb6ada929cbc6b327fa0a67584..05676f9551d4a1e0cb55d0693f99e458381887df 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -100,9 +100,9 @@ do_pylint() {
 "^tensorflow/contrib/eager/python/evaluator\.py.*\[E0202.*method-hidden "\
 "^tensorflow/contrib/eager/python/metrics_impl\.py.*\[E0202.*method-hidden "\
 "^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator "\
-"^tensorflow/python/keras/_impl/keras/callbacks\.py.*\[E1133.*not-an-iterable "\
-"^tensorflow/python/keras/_impl/keras/engine/base_layer.py.*\[E0203.*access-member-before-definition "\
-"^tensorflow/python/keras/_impl/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition "\
+"^tensorflow/python/keras/callbacks\.py.*\[E1133.*not-an-iterable "\
+"^tensorflow/python/keras/engine/base_layer.py.*\[E0203.*access-member-before-definition "\
+"^tensorflow/python/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition "\
 "^tensorflow/python/kernel_tests/constant_op_eager_test.py.*\[E0303.*invalid-length-returned"
 
   echo "ERROR_WHITELIST=\"${ERROR_WHITELIST}\""
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh b/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh
index 0beabcf5ef83000bb0b0c5d3dc077a3d2bbe118a..721590f4d6081de254b4dbc35849d8b988c4ddb0 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh
@@ -20,8 +20,8 @@ if [ ! -f /usr/bin/x86_64-linux-gnu-gcc ]; then
   ln -s /usr/local/bin/clang /usr/bin/x86_64-linux-gnu-gcc
 fi
 
-pip2 install --upgrade setuptools
-pip3 install --upgrade setuptools
+easy_install -U pip==9.0.3
+easy_install3 -U pip==9.0.3
 
 # The rest of the pip packages will be installed in
 # `install_pip_packages.sh`
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
index 6d017c8a1f0232deab82278b26797a73b3a8ea9c..c7980812507d05a100cf54158fb0581bd4a71296 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
@@ -52,7 +52,7 @@ bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test -k \
     //tensorflow/contrib/lite/kernels:embedding_lookup_test \
     //tensorflow/contrib/lite/kernels:embedding_lookup_sparse_test \
     //tensorflow/contrib/lite/kernels:fully_connected_test \
-    //tensorflow/contrib/lite/testing:generated_examples_zip_test \
+    //tensorflow/contrib/lite/testing:generated_zip_tests \
     //tensorflow/contrib/lite/kernels:hashtable_lookup_test \
     //tensorflow/contrib/lite/kernels:local_response_norm_test \
     //tensorflow/contrib/lite/kernels:lsh_projection_test \
diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
index bf992cf63d27f0f169185a38fa33a01cd5375051..f958b3c9b75c302c9e0a0fb84dae9561e939ba73 100755
--- a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
+++ b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
@@ -21,7 +21,12 @@
 # See libtensorflow_cpu.sh and libtensorflow_gpu.sh
 
 set -ex
+
+# Current script directory
+
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+source "${SCRIPT_DIR}/../builds/builds_common.sh"
 DOCKER_CONTEXT_PATH="$(realpath ${SCRIPT_DIR}/..)"
 ROOT_DIR="$(realpath ${SCRIPT_DIR}/../../../../)"
 
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 1bd1852ffc570166ecc6efca1420bc54d702ed89..30ea8539aa33fb5b033b5ba3c2e53ae66fd17203 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -100,6 +100,7 @@ bazel build -c opt ${PI_COPTS} \
   --copt=-fomit-frame-pointer --cpu=armeabi \
   --crosstool_top=@local_config_arm_compiler//:toolchain \
   --verbose_failures \
+  --distinct_host_configuration=true \
   //tensorflow/tools/benchmark:benchmark_model \
   //tensorflow/tools/pip_package:build_pip_package
 
diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py
index 9ddb2190487c2667aed1f9a1a647ae54ebc972d0..00bfcfd49bd1d90dccf094de21173ca9e4307319 100755
--- a/tensorflow/tools/ci_build/update_version.py
+++ b/tensorflow/tools/ci_build/update_version.py
@@ -250,7 +250,7 @@ def update_md_files(old_version, new_version):
 
   # Update any links to colab notebooks.
   def colab_url(version):
-    version_string = "%d.%d.%d" % (version.major, version.minor, version.patch)
+    version_string = "%s.%s.%s" % (version.major, version.minor, version.patch)
     prefix = "https://colab.research.google.com/github/tensorflow/models/blob/r"
     return prefix + version_string + "/"
 
diff --git a/tensorflow/tools/common/BUILD b/tensorflow/tools/common/BUILD
index b9032c046e93527fd0f41f183e49e4933029ec62..8c01d15a8060040825fff381367208ff1e322b20 100644
--- a/tensorflow/tools/common/BUILD
+++ b/tensorflow/tools/common/BUILD
@@ -40,7 +40,24 @@ py_test(
     srcs = ["traverse_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":test_module1",
+        ":test_module2",
         ":traverse",
         "//tensorflow/python:platform_test",
     ],
 )
+
+py_library(
+    name = "test_module1",
+    srcs = ["test_module1.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_module2",
+    ],
+)
+
+py_library(
+    name = "test_module2",
+    srcs = ["test_module2.py"],
+    srcs_version = "PY2AND3",
+)
diff --git a/tensorflow/python/keras/datasets/cifar10/__init__.py b/tensorflow/tools/common/test_module1.py
similarity index 70%
rename from tensorflow/python/keras/datasets/cifar10/__init__.py
rename to tensorflow/tools/common/test_module1.py
index 68d3eb789ea2c410095c0c75e0b79a9b07d209a3..cc185cf36e2616e3d1ba46d356a57bc184a35ae4 100644
--- a/tensorflow/python/keras/datasets/cifar10/__init__.py
+++ b/tensorflow/tools/common/test_module1.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,14 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""CIFAR10 small image classification dataset."""
+"""A module target for TraverseTest.test_module."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.datasets.cifar10 import load_data
+from tensorflow.tools.common import test_module2
+
+
+class ModuleClass1(object):
+
+  def __init__(self):
+    self._m2 = test_module2.ModuleClass2()
+
+  def __model_class1_method__(self):
+    pass
 
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/keras/datasets/fashion_mnist/__init__.py b/tensorflow/tools/common/test_module2.py
similarity index 83%
rename from tensorflow/python/keras/datasets/fashion_mnist/__init__.py
rename to tensorflow/tools/common/test_module2.py
index 7f5ddecc4707334d52ebf4966f2ec6141cce0d46..d9da99d9c0f8141ad35b9d0d2e6c830b4c3828a8 100644
--- a/tensorflow/python/keras/datasets/fashion_mnist/__init__.py
+++ b/tensorflow/tools/common/test_module2.py
@@ -12,14 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Fashion-MNIST dataset."""
+"""A module target for TraverseTest.test_module."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.datasets.fashion_mnist import load_data
 
-del absolute_import
-del division
-del print_function
+class ModuleClass2(object):
+
+  def __init__(self):
+    pass
+
+  def __model_class1_method__(self):
+    pass
+
diff --git a/tensorflow/tools/common/traverse_test.py b/tensorflow/tools/common/traverse_test.py
index eb195ec18efbc77e7b6edceff6970aa98f683948..ed410694ce1c86af8f9483575fbc12c8ddde312d 100644
--- a/tensorflow/tools/common/traverse_test.py
+++ b/tensorflow/tools/common/traverse_test.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys
-
 from tensorflow.python.platform import googletest
+from tensorflow.tools.common import test_module1
+from tensorflow.tools.common import test_module2
 from tensorflow.tools.common import traverse
 
 
@@ -30,10 +30,6 @@ class TestVisitor(object):
     self.call_log = []
 
   def __call__(self, path, parent, children):
-    # Do not traverse googletest, it's very deep.
-    for item in list(children):
-      if item[1] is googletest:
-        children.remove(item)
     self.call_log += [(path, parent, children)]
 
 
@@ -51,13 +47,12 @@ class TraverseTest(googletest.TestCase):
 
   def test_module(self):
     visitor = TestVisitor()
-    traverse.traverse(sys.modules[__name__], visitor)
+    traverse.traverse(test_module1, visitor)
 
     called = [parent for _, parent, _ in visitor.call_log]
 
-    self.assertIn(TestVisitor, called)
-    self.assertIn(TraverseTest, called)
-    self.assertIn(traverse, called)
+    self.assertIn(test_module1.ModuleClass1, called)
+    self.assertIn(test_module2.ModuleClass2, called)
 
   def test_class(self):
     visitor = TestVisitor()
diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
index 47539b2423e602bb9771541ae5b01ba76c79f56f..f8f63e276cab61900cba9de599a11efc7718d078 100644
--- a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
+++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
@@ -31,7 +31,11 @@ def _def_file_filter_configure_impl(repository_ctx):
   vc_path = find_vc_path(repository_ctx)
   if vc_path == "visual-studio-not-found":
     auto_configure_fail("Visual C++ build tools not found on your machine")
-  undname_bin_path = find_msvc_tool(repository_ctx, vc_path, "undname.exe").replace("\\", "\\\\")
+
+  undname = find_msvc_tool(repository_ctx, vc_path, "undname.exe")
+  if undname == None:
+    auto_configure_fail("Couldn't find undname.exe under %s, please check your VC installation and set BAZEL_VC environment variable correctly." % vc_path)
+  undname_bin_path = undname.replace("\\", "\\\\")
 
   repository_ctx.template(
     "def_file_filter.py",
diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh
index caae7fd5305af9846628eaf00348dd08df4e827f..99e09502be3439888d478b256b47a65c54a07a3e 100755
--- a/tensorflow/tools/dist_test/local_test.sh
+++ b/tensorflow/tools/dist_test/local_test.sh
@@ -64,9 +64,6 @@ die() {
 # Configurations
 DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster"
 
-# Use TensorFlow v1.5.0 for Python 2.7 and CPU only as we set num_gpus to 0 in the below
-DEFAULT_WHL_FILE_LOCATION="https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl"
-
 # Parse input arguments
 LEAVE_CONTAINER_RUNNING=0
 MODEL_NAME=""
@@ -77,8 +74,7 @@ SYNC_REPLICAS_FLAG=""
 
 WHL_FILE_LOCATION=${1}
 if [[ -z "${WHL_FILE_LOCATION}" ]]; then
-  WHL_FILE_LOCATION=${DEFAULT_WHL_FILE_LOCATION}
-  echo "use default whl file location"
+  echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel."
 fi
 
 while true; do
@@ -131,7 +127,11 @@ echo "Building in temporary directory: ${BUILD_DIR}"
 cp -r ${DIR}/* "${BUILD_DIR}"/ || \
   die "Failed to copy files to ${BUILD_DIR}"
 
-if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then
+# Download whl file into the build context directory.
+if [[ -z "${WHL_FILE_LOCATION}" ]]; then
+  pip2 download --no-deps tf-nightly
+  cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl
+elif [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then
     # Download whl file into the build context directory.
     wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \
         die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}"
diff --git a/tensorflow/tools/dist_test/remote_test.sh b/tensorflow/tools/dist_test/remote_test.sh
index 935535312d326a70aaf949332435569567cb1ca7..e188c88c8fa725daa619e244072fdb58765ea0a0 100755
--- a/tensorflow/tools/dist_test/remote_test.sh
+++ b/tensorflow/tools/dist_test/remote_test.sh
@@ -108,7 +108,7 @@ fi
 # Parse command-line arguments.
 WHL_URL=${1}
 if [[ -z "${WHL_URL}" ]]; then
-  die "whl URL is not specified"
+  echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel."
 fi
 
 # Create docker build context directory.
@@ -121,8 +121,13 @@ cp -r ${DIR}/* ${BUILD_DIR}/ || \
   die "Failed to copy files to ${BUILD_DIR}"
 
 # Download whl file into the build context directory.
-wget -P "${BUILD_DIR}" ${WHL_URL} || \
-  die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
+if [[ -z "${WHL_URL}" ]]; then
+  pip2 download --no-deps tf-nightly
+  cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl
+else
+  wget -P "${BUILD_DIR}" ${WHL_URL} || \
+    die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
+fi
 
 # Build docker image for test.
 docker build ${NO_CACHE_FLAG} \
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index b9996395d02bfb58c4bcee1dfc0eb3707c28b209..406d134699ff182dde219c137f79a27094b09169 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -85,7 +85,7 @@ RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.g
 ENV CI_BUILD_PYTHON python
 
 RUN tensorflow/tools/ci_build/builds/configured CPU \
-    bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+    bazel build -c opt --copt=-mavx --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
         # For optimized builds appropriate for the hardware platform of your choosing, uncomment below...
         # For ivy-bridge or sandy-bridge
         # --copt=-march="ivybridge" \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 7e5e6ef2d5b0245607bacb9a99f1352c052d0ea4..2fe47f3356ce26da4174b95d59dce1889d3ec90c 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -98,7 +98,7 @@ ENV TF_CUDNN_VERSION=7
 RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
     LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
     tensorflow/tools/ci_build/builds/configured GPU \
-    bazel build -c opt --config=cuda \
+    bazel build -c opt --copt=-mavx --config=cuda \
 	--cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
         tensorflow/tools/pip_package:build_pip_package && \
     rm /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
diff --git a/tensorflow/tools/graph_transforms/transform_graph.cc b/tensorflow/tools/graph_transforms/transform_graph.cc
index 3b9dd3dd2d4e24366b0eb61ef42f01e349d55a33..5cae8f8d8f32ef1b2a26a5062c6d59b82c8ada52 100644
--- a/tensorflow/tools/graph_transforms/transform_graph.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph.cc
@@ -141,7 +141,7 @@ std::string ExpandPath(const std::string& path_string) {
     return path_string;
   }
 
-  const char* home = NULL;
+  const char* home = nullptr;
   std::string::size_type prefix = path_string.find_first_of('/');
   if (path_string.length() == 1 || prefix == 1) {
     // The value of $HOME, e.g., ~/foo
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 569b6678cabddd17eaf21921d668221f1a865625..77f83b77a0214110e520c85d15ffa38bce65955f 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -124,6 +124,9 @@ genrule(
         "@fft2d//:fft/readme.txt",
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
+        "@grpc//:LICENSE",
+        "@grpc//third_party/address_sorting:LICENSE",
+        "@grpc//third_party/nanopb:LICENSE.txt",
         "@highwayhash//:LICENSE",
         "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 3af79ee170c20b70687c3181f00066f5fe4763aa..1a83c6e7578fed88f0bd7db5a5b620a5281fd95a 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -53,6 +53,7 @@ function main() {
   PKG_NAME_FLAG=""
   GPU_BUILD=0
   NIGHTLY_BUILD=0
+  PROJECT_NAME=""
   while true; do
     if [[ "$1" == "--nightly_flag" ]]; then
       NIGHTLY_BUILD=1
@@ -60,6 +61,12 @@ function main() {
       GPU_BUILD=1
     elif [[ "$1" == "--gpudirect" ]]; then
       PKG_NAME_FLAG="--project_name tensorflow_gpudirect"
+    elif [[ "$1" == "--project_name" ]]; then
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      PROJECT_NAME="$1"
     fi
     shift
 
@@ -68,7 +75,9 @@ function main() {
     fi
   done
 
-  if [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then
+  if [[ -n ${PROJECT_NAME} ]]; then
+    PKG_NAME_FLAG="--project_name ${PROJECT_NAME}"
+  elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then
     PKG_NAME_FLAG="--project_name tf_nightly_gpu"
   elif [[ ${NIGHTLY_BUILD} == "1" ]]; then
     PKG_NAME_FLAG="--project_name tf_nightly"
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 0d10162c303992a93726c90424173eaa396561df..319878e1b5ae9ff9d72132e2421062f6ca26197a 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -33,6 +33,21 @@ from setuptools.dist import Distribution
 # result for pip.
 _VERSION = '1.8.0'
 
+_SHORT_DESCRIPTION = ('TensorFlow is an open source machine learning framework '
+                      'for everyone.')
+
+_LONG_DESCRIPTION = ('TensorFlow is an open source software library for high '
+                     'performance numerical computation. Its flexible '
+                     'architecture allows easy deployment of computation across'
+                     ' a variety of platforms (CPUs, GPUs, TPUs), and from '
+                     'desktops to clusters of servers to mobile and edge '
+                     'devices. Originally developed by researchers and '
+                     'engineers from the Google Brain team within Google\'s AI '
+                     'organization, it comes with strong support for machine '
+                     'learning and deep learning and the flexible numerical '
+                     'computation core is used across many other scientific '
+                     'domains.')
+
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
@@ -214,8 +229,8 @@ headers = (list(find_files('*.h', 'tensorflow/core')) +
 setup(
     name=project_name,
     version=_VERSION.replace('-', ''),
-    description='TensorFlow helps the tensors flow',
-    long_description='',
+    description=_SHORT_DESCRIPTION,
+    long_description=_LONG_DESCRIPTION,
     url='https://www.tensorflow.org/',
     author='Google Inc.',
     author_email='opensource@google.com',
@@ -261,4 +276,5 @@ setup(
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
     license='Apache 2.0',
-    keywords='tensorflow tensor machine learning',)
+    keywords='tensorflow tensor machine learning',
+)
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
index 62e29b5128f3a2c9a22eadb02df6162fce352c60..aa56cc676d0dfdf88f449f07de4189eaabfa3112 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
@@ -803,6 +803,9 @@ void Generator::Generate(const FileDescriptor& fd) {
   // Add header to cc file.
   SetOutput(&cc_);
   Print("// GENERATED FILE - DO NOT MODIFY");
+  Print();
+  Print("#include <algorithm>");  // for `std::stable_sort()`
+  Print();
   headers = {GetProtoTextHeaderName(fd, true /* impl */)};
   AddHeadersToCurrentSection(headers);
   Print();
diff --git a/tensorflow/tools/test/upload_test_benchmarks.py b/tensorflow/tools/test/upload_test_benchmarks.py
index 9c45359ee1b037ffb01820f874b88b6cabc6d14b..c0305751092c3ed1916f671bd515cb4253f5ada2 100644
--- a/tensorflow/tools/test/upload_test_benchmarks.py
+++ b/tensorflow/tools/test/upload_test_benchmarks.py
@@ -89,7 +89,6 @@ import shutil
 
 from six import text_type
 from google.cloud import datastore
-from six import text_type
 
 
 def is_real_file(dirpath, fname):
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 8b6ad0a138928c0277c239e3c61967b43178b98c..07c8f5e41622a034682724cbc6fc04130ffc5ad2 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -50,31 +50,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl_linux",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz"
       ],
-      sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146",
-      strip_prefix = "mklml_lnx_2018.0.2.20180127",
+      sha256 = "d2305244fdc9b87db7426ed4496e87a4b3977ad3374d73b8000e8b7a5b7aa725",
+      strip_prefix = "mklml_lnx_2018.0.3.20180406",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
   mkl_repository(
       name = "mkl_windows",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip"
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_win_2018.0.3.20180406.zip"
       ],
-      sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded",
-      strip_prefix = "mklml_win_2018.0.2.20180127",
+      sha256 = "a584a5bf1c8d2ad70b90d12b52652030e9a338217719064fdb84b7ad0d693694",
+      strip_prefix = "mklml_win_2018.0.3.20180406",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
   mkl_repository(
       name = "mkl_darwin",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz"
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_mac_2018.0.3.20180406.tgz"
       ],
-      sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943",
-      strip_prefix = "mklml_mac_2018.0.2.20180127",
+      sha256 = "094e3dfd61c816136dc8d12a45cc611ce26c5f4828176a3644cd0b0efa15a25b",
+      strip_prefix = "mklml_mac_2018.0.3.20180406",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
 
@@ -85,11 +85,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "mkl_dnn",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz",
-          "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.14.tar.gz",
+          "https://github.com/intel/mkl-dnn/archive/v0.14.tar.gz",
       ],
-      sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f",
-      strip_prefix = "mkl-dnn-0.13",
+      sha256 = "efebc53882856afec86457a2da644693f5d59c68772d41d640d6b60a8efc4eb0",
+      strip_prefix = "mkl-dnn-0.14",
       build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
   )
 
@@ -167,8 +167,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "gemmlowp",
       urls = [
-          # TODO (yongtang): uncomment once mirror.bazel.build is propagated.
-          # "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
+          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
           "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
       ],
       sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658",
@@ -189,11 +188,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "highwayhash",
       urls = [
-          "https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
-          "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
+          "http://mirror.bazel.build/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz",
+          "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz",
       ],
-      sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9",
-      strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b",
+      sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37",
+      strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968",
       build_file = clean_dep("//third_party:highwayhash.BUILD"),
   )
 
@@ -228,6 +227,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       sha256 = "e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef",
       strip_prefix = "libpng-1.6.34",
       build_file = clean_dep("//third_party:png.BUILD"),
+      patch_file = clean_dep("//third_party:png_fix_rpi.patch"),
   )
 
   tf_http_archive(
@@ -316,7 +316,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "backports.weakref-1.0rc1/src",
       build_file = clean_dep("//third_party:backports_weakref.BUILD"),
   )
-  
+
   filegroup_external(
       name = "org_python_license",
       licenses = ["notice"],  # Python 2.0
@@ -452,11 +452,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/b3f6a6a61625296bb532a65c0bf51b91b05b3361.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/b3f6a6a61625296bb532a65c0bf51b91b05b3361.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/638915a37f90f26599941977846408864f70ab35.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/638915a37f90f26599941977846408864f70ab35.tar.gz",
       ],
-      sha256 = "93895b289a78a47a1e75652e12a1b9a6c119f086a509b00e0084cf2bb944b709",
-      strip_prefix = "llvm-b3f6a6a61625296bb532a65c0bf51b91b05b3361",
+      sha256 = "aae3cacefa318cef030b4ca1e81ee9906752bbd89013cf9d47e156b5ad04b3a5",
+      strip_prefix = "llvm-638915a37f90f26599941977846408864f70ab35",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
@@ -744,6 +744,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = clean_dep("//third_party:tflite_smartreply.BUILD"),
   )
 
+  tf_http_archive(
+      name = "tflite_ovic_testdata",
+      sha256 = "a9a705d8d519220178e2e65d383fdb21da37fdb31d1e909b0a1acdac46479e9c",
+      urls = [
+          "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/data/ovic.zip",
+          "https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip",
+      ],
+      build_file = clean_dep("//third_party:tflite_ovic_testdata.BUILD"),
+      strip_prefix = "ovic",
+  )
+
   ##############################################################################
   # BIND DEFINITIONS
   #
diff --git a/third_party/png_fix_rpi.patch b/third_party/png_fix_rpi.patch
new file mode 100644
index 0000000000000000000000000000000000000000..80da7b3c06444ff22a869de48bca273f99963728
--- /dev/null
+++ b/third_party/png_fix_rpi.patch
@@ -0,0 +1,16 @@
+diff -r -u /tmp/libpng-1.6.34/scripts/pnglibconf.h.prebuilt ./scripts/pnglibconf.h.prebuilt
+--- /tmp/libpng-1.6.34/scripts/pnglibconf.h.prebuilt	2017-09-29 01:42:33.000000000 -0700
++++ ./scripts/pnglibconf.h.prebuilt	2018-05-01 09:51:24.719318242 -0700
+@@ -20,6 +20,12 @@
+ #define PNG_ALIGNED_MEMORY_SUPPORTED
+ /*#undef PNG_ARM_NEON_API_SUPPORTED*/
+ /*#undef PNG_ARM_NEON_CHECK_SUPPORTED*/
++
++/* Workaround not having a great build file by forcing
++ * png filter optimization to be disabled on arm */
++#define PNG_ARM_NEON_OPT 0
++
++
+ /*#undef PNG_POWERPC_VSX_API_SUPPORTED*/
+ /*#undef PNG_POWERPC_VSX_CHECK_SUPPORTED*/
+ #define PNG_BENIGN_ERRORS_SUPPORTED
diff --git a/third_party/tflite_ovic_testdata.BUILD b/third_party/tflite_ovic_testdata.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..de47ed61f9db9ad980468aa325e3c770e0aae4f1
--- /dev/null
+++ b/third_party/tflite_ovic_testdata.BUILD
@@ -0,0 +1,12 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 1c1e6afb65ab8da5b689d58ecaec6ac6c8a69bb8..03aa52da1f6e9c113d6db6cb9c1d38b5be21927d 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -1,8 +1,14 @@
+# By default, we don't distinct target and host platfroms.
+# When doing cross compilation, use --config=cross_compile to distinct them.
+build --distinct_host_configuration=false
+build:cross_compile --distinct_host_configuration=true
+
 # Android configs. Bazel needs to have --cpu and --fat_apk_cpu both set to the
 # target CPU to build transient dependencies correctly. See
 # https://docs.bazel.build/versions/master/user-manual.html#flag--fat_apk_cpu
 build:android --crosstool_top=//external:android/crosstool
 build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+build:android --config=cross_compile
 build:android_arm --config=android
 build:android_arm --cpu=armeabi-v7a
 build:android_arm --fat_apk_cpu=armeabi-v7a